# Dat550 Project

In [1]:
import json
import string
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat import textstat
from tqdm import tqdm

num_threads = 8  # Adjust this based on the number of cores available
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
tf.config.threading.set_inter_op_parallelism_threads(num_threads)

tqdm.pandas()

2025-04-08 12:58:02.200759: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-08 12:58:02.202326: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-08 12:58:02.209401: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-08 12:58:02.222394: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744109882.245989   94583 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744109882.25

In [2]:
filepath = "preprocessing/data/articles-training-bypublisher.jsonl"

def load_json(filepath):

    articles = []
    table = str.maketrans("","",string.punctuation+"“”‘’")

    with open(filepath, "r") as f:
        for line in f:
            data = json.loads(line)
            title = data["title"].lower().translate(table)
            content = data["content"].lower().translate(table)
            articles.append({
                "id": int(data["id"]),
                "content": f"{title} {content}"
            })
    

    return pd.DataFrame(articles)



def load_ground_truth(filepath):
    return pd.read_json(filepath, orient="records", lines=True)

def merge_with_ground_truth(articles_df, ground_truth_df):
    return articles_df.merge(ground_truth_df[['id', 'hyperpartisan']], on='id', how='left')
        

In [3]:
def prepare_data(article_path, truth_path):
    print("Loading and merging data...")
    articles_df = load_json(article_path)
    ground_truth_df = load_ground_truth(truth_path)
    df = merge_with_ground_truth(articles_df, ground_truth_df)

    # Filter out samples with missing labels
    df = df.dropna(subset=['hyperpartisan'])
    df['label'] = df['hyperpartisan'].astype(int)
    return df

In [4]:
def extract_features(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)

    features = {
        'sent_neg': sentiment['neg'],
        'sent_pos': sentiment['pos'],
        'sent_compound': sentiment['compound'],
        'flesch': textstat.flesch_reading_ease(text),
        'smog': textstat.smog_index(text),
        'exclam': text.count('!'),
        'questions': text.count('?'),
        'quotes': text.count('"'),
        'length': len(text.split())
    }

    partisan_terms = {
        'far_left': ['socialist', 'progressive', 'woke'],
        'far_right': ['maga', 'conservative', 'patriot']
    }

    for group, terms in partisan_terms.items():
        features[f'count_{group}'] = sum(text.count(term) for term in terms)

    return features

In [5]:
def extract_stylometric_features(df):
    print("Extracting stylometric and sentiment features...")
    style_features = df['content'].progress_apply(extract_features)
    return pd.DataFrame(style_features.tolist())


In [6]:
def create_text_vectorizer(texts, max_tokens=1000):
    """
    Creates a text vectorizer using TensorFlow's TextVectorization layer
    for a given set of texts. This will adapt the vectorizer based on the 
    text data to prepare for tokenization.
    
    Arguments:
    - texts: List or Pandas Series of texts (articles) to process.
    - max_tokens: Maximum number of tokens for the vectorizer (vocabulary size).
    
    Returns:
    - vectorizer: The adapted TensorFlow TextVectorization layer.
    """
    print("Creating TextVectorization layer...")
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode="tf_idf",
        ngrams=2
    )
    
    # Batch processing to prevent memory overload
    batch_size = 1000  # Adjust this based on your system's memory
    text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    
    # Use tqdm to add a progress bar to the batch processing loop
    for batch in tqdm(text_batches, desc="Adapting Vectorizer", unit="batch", total=len(text_batches)):
        vectorizer.adapt(batch)
    
    return vectorizer


def vectorize_text_in_batches(texts, vectorizer, batch_size=10000):
    """
    Vectorizes the text in batches to manage memory usage efficiently.
    
    Arguments:
    - texts: List or Pandas Series of texts to vectorize.
    - vectorizer: The TensorFlow TextVectorization layer to use.
    - batch_size: The size of each batch to process.
    
    Returns:
    - Vectorized texts as a sparse tensor.
    """
    text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    all_vectors = []
    
    for batch in text_batches:
        # Vectorize the current batch of texts
        batch_vectorized = vectorizer(batch)
        all_vectors.append(batch_vectorized)
    
    # Concatenate all the batches together into one large sparse tensor
    all_vectors = tf.concat(all_vectors, axis=0)
    
    return all_vectors



In [7]:
def vectorize_and_combine(df, vectorizer):
    X_text = vectorizer(df["content"])
    X_style = extract_stylometric_features(df)
    X_style = X_style.astype(np.float32)
    X_all = tf.concat([X_text, tf.convert_to_tensor(X_style.values)], axis=1)
    return X_all.numpy(), X_style.columns.tolist()


In [8]:
def train_sklearn_model(model, X_train, X_test, y_train, y_test):
    print(f"\nTraining {model.__class__.__name__}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\nClassification Report ({model.__class__.__name__}):")
    print(classification_report(y_test, preds))
    return model



In [9]:
def show_top_features(model, tfidf, X_style):
    print("\nTop Predictive Features:")

    # Get the vocabulary from the TextVectorization layer
    feature_names = tfidf.get_vocabulary() + X_style
    
    # Choose the correct attribute based on model type
    if hasattr(model, "coef_"):  # Linear models like Logistic Regression
        importances = model.coef_[0]
    elif hasattr(model, "feature_importances_"):  # Tree-based models like DT or RF
        importances = model.feature_importances_
    else:
        print("This model does not support feature importance inspection.")
        return

    # Create and display feature importance dataframe
    coef_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', key=abs, ascending=False)

    display(coef_df.head(20))


In [10]:
def print_missclassified(model, X_test, y_test):
    # Get misclassified samples
    misclassified = X_test[y_test != model.predict(X_test)]
    print("Number of misclassified samples:", misclassified.shape[0])

In [11]:
# Load and process articles
df = prepare_data(
    "preprocessing/data/articles-training-bypublisher.jsonl",
    "preprocessing/data/ground-truth-training-bypublisher.jsonl"
)



Loading and merging data...


In [12]:
vecotorizer_train = create_text_vectorizer(df["content"])
# X_train, X_style_cols_train = vectorize_and_combine(df,vecotorizer_train)




Creating TextVectorization layer...


2025-04-08 13:25:29.280855: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
Adapting Vectorizer:   2%|▏         | 10/600 [00:41<41:12,  4.19s/batch]


KeyboardInterrupt: 

In [120]:
test_articles_path = "preprocessing/data/articles-test-byarticle.jsonl"
test_ground_truth_path = "preprocessing/data/ground-truth-test-byarticle.jsonl"

# Load the test data and ground truth as DataFrames
test_articles = pd.read_json(test_articles_path, orient="records", lines=True)
test_ground_truth = pd.read_json(test_ground_truth_path, orient="records", lines=True)

test_articles = test_articles.drop(columns=['hyperpartisan'])
# Merge the two DataFrames
test_data_df = pd.merge(test_articles, test_ground_truth[['id', 'hyperpartisan']], on='id')


vecotorizer_test = create_text_vectorizer(test_data_df["content"])
X_test, X_style_cols_test = vectorize_and_combine(test_data_df,vecotorizer_train)



Creating TextVectorization layer...
Extracting stylometric and sentiment features...


100%|██████████| 628/628 [00:09<00:00, 64.02it/s]


In [121]:
# Load pre-split data
# Or manually split just once and save using joblib.dump()
y_train = df["label"]
y_test = test_data_df["hyperpartisan"].astype(int)




## Logistic Regression

In [125]:
# Train & evaluate
lr_model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        solver='liblinear'
    )

trained_lr = train_sklearn_model(lr_model,X_train,X_test,y_train,y_test)
show_top_features(trained_lr, vecotorizer_train, X_style_cols_train)
print_missclassified(trained_lr,X_test,y_test)






Training LogisticRegression...

Classification Report (LogisticRegression):
              precision    recall  f1-score   support

           0       0.52      0.94      0.67       314
           1       0.67      0.12      0.20       314

    accuracy                           0.53       628
   macro avg       0.59      0.53      0.43       628
weighted avg       0.59      0.53      0.43       628


Top Predictive Features:


Unnamed: 0,feature,importance
69,—,0.530097
958,sponsored,0.523367
296,president trump,0.471588
835,12,0.409123
407,facebook,-0.383135
37,but,0.375739
969,bill clinton,0.368534
382,far,0.367001
176,may,0.365587
162,world,-0.365413


Number of misclassified samples: 295


In [123]:
# Decision Tree
dt_model = DecisionTreeClassifier(class_weight='balanced', max_depth=12, random_state=42)
trained_dt = train_sklearn_model(dt_model,X_train,X_test,y_train,y_test)
show_top_features(trained_dt, vecotorizer_train, X_style_cols_train)
print_missclassified(trained_dt,X_test,y_test)


Training DecisionTreeClassifier...

Classification Report (DecisionTreeClassifier):
              precision    recall  f1-score   support

           0       0.61      0.74      0.67       314
           1       0.67      0.53      0.59       314

    accuracy                           0.63       628
   macro avg       0.64      0.63      0.63       628
weighted avg       0.64      0.63      0.63       628


Top Predictive Features:


Unnamed: 0,feature,importance
3,of,0.186869
37,but,0.05473
1000,sent_neg,0.047019
1008,length,0.03738
958,sponsored,0.035398
233,again,0.030778
492,cnn,0.024374
107,is a,0.023971
319,took,0.021836
117,before,0.020752


Number of misclassified samples: 230


In [124]:
# Random Forest
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100, max_depth=12, random_state=42)
trained_rf = train_sklearn_model(rf_model,X_train,X_test,y_train,y_test)

show_top_features(trained_rf, vecotorizer_train, X_style_cols_train)
print_missclassified(trained_rf,X_test,y_test)


Training RandomForestClassifier...

Classification Report (RandomForestClassifier):
              precision    recall  f1-score   support

           0       0.67      0.85      0.75       314
           1       0.80      0.57      0.67       314

    accuracy                           0.71       628
   macro avg       0.73      0.71      0.71       628
weighted avg       0.73      0.71      0.71       628


Top Predictive Features:


Unnamed: 0,feature,importance
1008,length,0.023579
3,of,0.019926
0,[UNK],0.01843
8,is,0.017981
1003,flesch,0.017802
13,it,0.012389
4,and,0.011299
1001,sent_pos,0.011055
1,the,0.010717
5,a,0.010133


Number of misclassified samples: 180
