# Dat550 Project

In [104]:
import json
import string
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
import textstat  # pip install textstat
from tqdm import tqdm
tqdm.pandas()

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/magnus/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [105]:
# df = pd.read_xml("Webis-data/extracted/ground-truth-training-byarticle-20181122.xml")
# df.to_json("preprocessing/data/ground-truth-training-byarticle.jsonl",orient="records",lines=True)

# df = pd.read_xml("Webis-data/extracted/ground-truth-training-bypublisher-20181122.xml")
# df.to_json("preprocessing/data/ground-truth-training-bypublisher.jsonl",orient="records",lines=True)

# df = pd.read_xml("Webis-data/extracted/ground-truth-test-bypublisher-20181212.xml")
# df.to_json("preprocessing/data/ground-truth-test-bypublisher.jsonl",orient="records",lines=True)

# df = pd.read_xml("Webis-data/extracted/ground-truth-test-byarticle-20181207.xml")
# df.to_json("preprocessing/data/ground-truth-test-byarticle.jsonl",orient="records",lines=True)


In [106]:
filepath = "preprocessing/data/articles-training-bypublisher.jsonl"

def load_json(filepath):

    articles = []
    table = str.maketrans("","",string.punctuation+"“”‘’")

    with open(filepath, "r") as f:
        for line in f:
            data = json.loads(line)
            title = data["title"].lower().translate(table)
            content = data["content"].lower().translate(table)
            articles.append({
                "id": int(data["id"]),
                "content": f"{title} {content}"
            })
    

    return pd.DataFrame(articles)



def load_ground_truth(filepath):
    return pd.read_json(filepath, orient="records", lines=True)

def merge_with_ground_truth(articles_df, ground_truth_df):
    return articles_df.merge(ground_truth_df[['id', 'hyperpartisan']], on='id', how='left')
        

In [107]:
def extract_features(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)

    features = {
        'sent_neg': sentiment['neg'],
        'sent_pos': sentiment['pos'],
        'sent_compound': sentiment['compound'],
        'flesch': textstat.flesch_reading_ease(text),
        'smog': textstat.smog_index(text),
        'exclam': text.count('!'),
        'questions': text.count('?'),
        'quotes': text.count('"'),
        'length': len(text.split())
    }

    partisan_terms = {
        'far_left': ['socialist', 'progressive', 'woke'],
        'far_right': ['maga', 'conservative', 'patriot']
    }

    for group, terms in partisan_terms.items():
        features[f'count_{group}'] = sum(text.count(term) for term in terms)

    return features


In [108]:
def prepare_data(article_path, truth_path):
    print("Loading and merging data...")
    articles_df = load_json(article_path)
    ground_truth_df = load_ground_truth(truth_path)
    df = merge_with_ground_truth(articles_df, ground_truth_df)

    # Filter out samples with missing labels
    df = df.dropna(subset=['hyperpartisan'])
    df['label'] = df['hyperpartisan'].astype(int)
    return df


In [109]:
def vectorize_text(df):
    print("Vectorizing text with TF-IDF...")
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        max_features=1000,
    )
    X_text = tfidf.fit_transform(df['content'])
    return X_text, tfidf


In [110]:
def extract_stylometric_features(df):
    print("Extracting stylometric and sentiment features...")
    style_features = df['content'].progress_apply(extract_features)
    return pd.DataFrame(style_features.tolist())


In [111]:
def combine_features(X_text, X_style):
    print("Combining TF-IDF and stylometric features...")
    
    # Convert column names to string types
    X_text = pd.DataFrame(X_text.toarray()).reset_index(drop=True)
    X_style = X_style.reset_index(drop=True)
    
    # Ensure that all column names are strings
    X_text.columns = X_text.columns.astype(str)
    X_style.columns = X_style.columns.astype(str)
    
    # Concatenate the dataframes (TF-IDF + stylometric features)
    X_all = pd.concat([X_text, X_style], axis=1)
    
    return X_all


In [112]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    print("Training Logistic Regression model...")
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        solver='liblinear'
    )
    model.fit(X_train, y_train)

    print("\nModel Evaluation:")
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))

    return model


In [113]:
def show_top_features(model, tfidf, X_style):
    print("\nTop Predictive Features:")
    feature_names = tfidf.get_feature_names_out().tolist() + X_style.columns.tolist()
    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': model.coef_[0]
    }).sort_values('coefficient', key=abs, ascending=False)
    display(coef_df.head(20))


In [114]:
# Load and process articles
df = prepare_data(
    "preprocessing/data/articles-training-byarticle.jsonl",
    "preprocessing/data/ground-truth-training-byarticle.jsonl"
)



Loading and merging data...


In [115]:
X_text, tfidf = vectorize_text(df)
X_style = extract_stylometric_features(df)
X_train = combine_features(X_text, X_style)




Vectorizing text with TF-IDF...
Extracting stylometric and sentiment features...


100%|██████████| 645/645 [00:09<00:00, 65.03it/s]

Combining TF-IDF and stylometric features...





In [116]:
test_articles_path = "preprocessing/data/articles-test-byarticle.jsonl"
test_ground_truth_path = "preprocessing/data/ground-truth-test-byarticle.jsonl"

# Load the test data and ground truth as DataFrames
test_articles = pd.read_json(test_articles_path, orient="records", lines=True)
test_ground_truth = pd.read_json(test_ground_truth_path, orient="records", lines=True)

test_articles = test_articles.drop(columns=['hyperpartisan'])
# Merge the two DataFrames
test_data_df = pd.merge(test_articles, test_ground_truth[['id', 'hyperpartisan']], on='id')


X_text_test, tfidf = vectorize_text(test_data_df)
X_style_test = extract_stylometric_features(test_data_df)
X_test = combine_features(X_text_test, X_style_test)


Vectorizing text with TF-IDF...
Extracting stylometric and sentiment features...


100%|██████████| 628/628 [00:10<00:00, 60.43it/s]

Combining TF-IDF and stylometric features...





In [117]:
# Load pre-split data
# Or manually split just once and save using joblib.dump()
y_train = df["hyperpartisan"].astype(int)
y_test = test_data_df["hyperpartisan"].astype(int)

X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)


In [118]:
# Train & evaluate
model = train_and_evaluate(X_train, X_test, y_train, y_test)
show_top_features(model, tfidf, X_style)


Training Logistic Regression model...

Model Evaluation:
              precision    recall  f1-score   support

           0       0.51      0.98      0.67       314
           1       0.74      0.04      0.08       314

    accuracy                           0.51       628
   macro avg       0.62      0.51      0.38       628
weighted avg       0.62      0.51      0.38       628


Top Predictive Features:


Unnamed: 0,feature,coefficient
827,staff,1.570803
904,trump said,1.508388
391,high,1.473198
762,russia,-1.411174
658,playing,-1.285365
678,president,1.220695
977,women,1.166032
55,american people,1.126042
54,american,1.075763
662,policy,1.044966


In [119]:

misclassified = X_test[y_test != model.predict(X_test)]
print("Number of misclassified samples:", misclassified.shape[0])

Number of misclassified samples: 305


In [120]:
def predict_single_article(article_text, tfidf, model, X_style_columns):
    # Vectorize the text
    X_text = tfidf.transform([article_text])

    # Extract stylometric features
    style_features = extract_features(article_text)
    X_style = pd.DataFrame([style_features])

    # Ensure correct column order (in case some features were missing)
    X_style = X_style.reindex(columns=X_style_columns, fill_value=0)

    # Combine features
    X_all = combine_features(X_text, X_style)

    # Predict
    prediction = model.predict(X_all)[0]
    confidence = model.predict_proba(X_all)[0].max()
    return prediction, confidence


# Save stylometric feature columns from training to keep consistent
X_style_columns = X_style.columns

# Predict the third article (index 2) content from test_articles
article_text = test_articles["content"].iloc[2]

# Predict
pred, conf = predict_single_article(article_text, tfidf, model, X_style_columns)

print(f"Prediction: {'Hyperpartisan' if pred == 1 else 'Not Hyperpartisan'} (Confidence: {conf:.2f})")


Combining TF-IDF and stylometric features...
Prediction: Not Hyperpartisan (Confidence: 0.84)
