# Dat550 Project

In [1]:
import json
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
import textstat  # pip install textstat
from tqdm.notebook import tqdm  # Better for notebooks
tqdm.pandas()


In [2]:
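# One-off conversion of the Webis XML ground-truth files to JSON Lines.
# Left commented out; uncomment to regenerate the files under preprocessing/data/.
# df = pd.read_xml("Webis-data/extracted/ground-truth-training-byarticle-20181122.xml")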
# df.to_json("preprocessing/data/ground-truth-training-byarticle.jsonl",orient="records",lines=True)

# df = pd.read_xml("Webis-data/extracted/ground-truth-training-bypublisher-20181122.xml")
# df.to_json("preprocessing/data/ground-truth-training-bypublisher.jsonl",orient="records",lines=True)

# df = pd.read_xml("Webis-data/extracted/ground-truth-test-bypublisher-20181212.xml")
# df.to_json("preprocessing/data/ground-truth-test-bypublisher.jsonl",orient="records",lines=True)

# df = pd.read_xml("Webis-data/extracted/ground-truth-test-byarticle-20181207.xml")
# df.to_json("preprocessing/data/ground-truth-test-byarticle.jsonl",orient="records",lines=True)


In [None]:
# Path to the by-publisher training articles (JSON Lines).
# NOTE(review): no visible cell reads this global -- load_json() takes its own
# `filepath` argument and the driver cell passes by-article paths explicitly.
filepath = "preprocessing/data/articles-training-bypublisher.jsonl"

def load_json(filepath):
    """Load articles from a JSON Lines file into a normalised DataFrame.

    Each non-blank line must be a JSON object with ``id``, ``title`` and
    ``content`` keys. Title and content are lower-cased, stripped of ASCII
    punctuation and curly quotes, and joined with a single space.

    Args:
        filepath: Path to the ``.jsonl`` file (read as UTF-8).

    Returns:
        pandas.DataFrame with columns ``id`` (int) and ``content`` (str),
        one row per record in file order. An empty file yields an empty
        frame that still has both columns.

    Raises:
        KeyError: if a record lacks ``id``, ``title`` or ``content``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Deletion table: ASCII punctuation plus typographic quotes.
    table = str.maketrans("", "", string.punctuation + "“”‘’")

    def normalise(value):
        # A JSON null decodes to None; treat it as empty text instead of
        # failing on None.lower().
        return (value or "").lower().translate(table)

    articles = []
    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to be UTF-8, and the curly quotes removed above would
    # otherwise be mis-decoded and survive the translation.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            title = normalise(data["title"])
            content = normalise(data["content"])
            articles.append({
                "id": int(data["id"]),
                "content": f"{title} {content}"
            })

    # Fix the schema so downstream merges on 'id' work even with no records.
    return pd.DataFrame(articles, columns=["id", "content"])



def load_ground_truth(filepath):
    """Read a JSON Lines ground-truth file into a DataFrame.

    Args:
        filepath: Path to a file holding one JSON record per line.

    Returns:
        pandas.DataFrame with one row per line of the file.
    """
    ground_truth = pd.read_json(filepath, lines=True, orient="records")
    return ground_truth

def merge_with_ground_truth(articles_df, ground_truth_df):
    """Left-join the ``hyperpartisan`` label onto the articles by ``id``.

    Every article row is kept; articles whose ``id`` has no match in
    ``ground_truth_df`` get a missing ``hyperpartisan`` value.

    Args:
        articles_df: DataFrame with at least an ``id`` column.
        ground_truth_df: DataFrame with ``id`` and ``hyperpartisan`` columns.

    Returns:
        pandas.DataFrame: ``articles_df`` plus a ``hyperpartisan`` column.
    """
    # Only the join key and the label are taken from the ground truth.
    labels = ground_truth_df[['id', 'hyperpartisan']]
    merged = pd.merge(articles_df, labels, how='left', on='id')
    return merged
        

In [4]:
# Shared VADER analyzer, created lazily by _get_sentiment_analyzer().
_SENTIMENT_ANALYZER = None

# Lower-case marker terms per group; matched as substrings of the text.
_PARTISAN_TERMS = {
    'far_left': ['socialist', 'progressive', 'woke'],
    'far_right': ['maga', 'conservative', 'patriot']
}


def _get_sentiment_analyzer():
    """Return one shared SentimentIntensityAnalyzer, building it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def extract_features(text):
    """Compute sentiment, readability and surface-style features for one text.

    Args:
        text: The article text as a single string.

    Returns:
        dict with keys ``sent_neg``, ``sent_pos``, ``sent_compound`` (VADER
        polarity scores), ``flesch``, ``smog`` (textstat readability scores),
        ``exclam``, ``questions``, ``quotes`` (character counts of ``!``,
        ``?`` and ``"``), ``length`` (whitespace-separated token count) and
        one ``count_<group>`` entry per partisan term group.

    Notes:
        The punctuation counts are taken from ``text`` as given, so they are
        zero for text whose punctuation has already been stripped (as
        ``load_json`` does).
    """
    # Reuse a single analyzer: constructing one per article reloads the
    # sentiment lexicon each time, which dominates runtime on large corpora.
    sentiment = _get_sentiment_analyzer().polarity_scores(text)

    features = {
        'sent_neg': sentiment['neg'],
        'sent_pos': sentiment['pos'],
        'sent_compound': sentiment['compound'],
        'flesch': textstat.flesch_reading_ease(text),
        'smog': textstat.smog_index(text),
        'exclam': text.count('!'),
        'questions': text.count('?'),
        'quotes': text.count('"'),
        'length': len(text.split())
    }

    # The term lists are lower-case, so match against a lower-cased copy;
    # otherwise "MAGA" or "Patriot" in un-normalised text would be missed.
    lowered = text.lower()
    for group, terms in _PARTISAN_TERMS.items():
        features[f'count_{group}'] = sum(lowered.count(term) for term in terms)

    return features


In [5]:
def prepare_data(article_path, truth_path):
    """Load articles, attach ground-truth labels and drop unlabelled rows.

    Args:
        article_path: Path to the articles JSON Lines file (see ``load_json``).
        truth_path: Path to the ground-truth JSON Lines file.

    Returns:
        pandas.DataFrame containing the merged article columns plus an
        integer ``label`` column derived from ``hyperpartisan``. Rows with
        no ``hyperpartisan`` value are removed; the surviving rows keep
        their original index labels.
    """
    print("Loading and merging data...")
    articles_df = load_json(article_path)
    ground_truth_df = load_ground_truth(truth_path)
    merged = merge_with_ground_truth(articles_df, ground_truth_df)

    # Filter out samples with missing labels
    labelled = merged.dropna(subset=['hyperpartisan'])

    # Use assign() to build a new frame rather than writing a column into the
    # filtered result: assigning into the output of dropna() is the
    # chained-assignment pattern that can raise SettingWithCopyWarning.
    return labelled.assign(label=labelled['hyperpartisan'].astype(int))


In [14]:
def vectorize_text(df):
    """Fit a TF-IDF vectorizer on ``df['content']`` and transform it.

    The vectorizer uses unigrams and bigrams, the built-in English stop-word
    list, and is capped at 1000 features.

    Args:
        df: DataFrame with a ``content`` column of text.

    Returns:
        tuple: ``(X_text, tfidf)`` -- the result of ``fit_transform`` and the
        fitted ``TfidfVectorizer``.
    """
    print("Vectorizing text with TF-IDF...")
    vectorizer_options = dict(
        ngram_range=(1, 2),
        stop_words='english',
        max_features=1000,
    )
    tfidf = TfidfVectorizer(**vectorizer_options)
    documents = df['content']
    return tfidf.fit_transform(documents), tfidf


In [7]:
def extract_stylometric_features(df):
    """Run ``extract_features`` over every article and tabulate the results.

    Args:
        df: DataFrame with a ``content`` column of text.

    Returns:
        pandas.DataFrame with one row per article (fresh 0..n-1 index) and
        one column per key of the dicts returned by ``extract_features``.
    """
    print("Extracting stylometric and sentiment features...")
    # progress_apply is the tqdm-instrumented variant of Series.apply.
    per_article = df['content'].progress_apply(extract_features)
    records = per_article.tolist()
    return pd.DataFrame(records)


In [8]:
def combine_features(X_text, X_style, text_feature_names=None):
    """Concatenate TF-IDF features and stylometric features column-wise.

    Args:
        X_text: Text feature matrix; either an object exposing ``toarray()``
            (e.g. a sparse matrix) or a dense 2-D array-like.
        X_style: DataFrame of stylometric features, one row per document in
            the same order as ``X_text``.
        text_feature_names: Optional column names for the text features.
            Defaults to ``tfidf_0 .. tfidf_{n-1}``.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index, the text columns first
        and the style columns after them. All column labels are strings.

    Raises:
        ValueError: if the two inputs have different numbers of rows.
    """
    print("Combining TF-IDF and stylometric features...")
    # NOTE: this densifies the text matrix; memory grows as rows x features.
    dense_text = X_text.toarray() if hasattr(X_text, "toarray") else np.asarray(X_text)

    if text_feature_names is None:
        text_feature_names = [f"tfidf_{i}" for i in range(dense_text.shape[1])]
    text_df = pd.DataFrame(dense_text, columns=list(text_feature_names))
    style_df = X_style.reset_index(drop=True)

    # Without this check concat would align on the index and silently pad the
    # shorter input with NaN rows.
    if len(text_df) != len(style_df):
        raise ValueError(
            f"Row count mismatch: {len(text_df)} text rows vs {len(style_df)} style rows"
        )

    X_all = pd.concat([text_df, style_df], axis=1)
    # Unnamed TF-IDF columns used to get integer labels next to the string
    # style labels; recent scikit-learn raises TypeError at fit time on such
    # mixed label types, so make every label a string.
    X_all.columns = X_all.columns.astype(str)
    return X_all



In [9]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a class-balanced logistic regression and print its test report.

    Args:
        X_train: Training feature matrix.
        X_test: Held-out feature matrix.
        y_train: Training labels.
        y_test: Held-out labels.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    print("Training Logistic Regression model...")
    classifier_options = {
        'class_weight': 'balanced',
        'max_iter': 1000,
        'solver': 'liblinear',
    }
    model = LogisticRegression(**classifier_options)
    model.fit(X_train, y_train)

    print("\nModel Evaluation:")
    report = classification_report(y_test, model.predict(X_test))
    print(report)

    return model


In [24]:
def show_top_features(model, tfidf, X_style=None):
    """Display the 20 features with the largest absolute coefficients.

    Args:
        model: Fitted linear model exposing ``coef_``; row 0 is used.
        tfidf: Fitted vectorizer exposing ``get_feature_names_out()``.
        X_style: Optional DataFrame whose columns were appended after the
            TF-IDF features when the model was trained. Omit it for a model
            trained on the text features alone.

    Raises:
        ValueError: if the number of feature names does not match the number
            of coefficients.
    """
    print("\nTop Predictive Features:")
    feature_names = tfidf.get_feature_names_out().tolist()
    if X_style is not None:
        # Style columns follow the text columns, as in combine_features().
        feature_names = feature_names + X_style.columns.tolist()

    coefficients = model.coef_[0]
    # Fail with an actionable message instead of pandas' generic
    # "arrays must be of the same length" error.
    if len(feature_names) != len(coefficients):
        raise ValueError(
            f"Got {len(feature_names)} feature names for {len(coefficients)} "
            "coefficients; pass the same feature sets the model was trained on"
        )

    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients
    }).sort_values('coefficient', key=abs, ascending=False)
    display(coef_df.head(20))


In [None]:
# Build the labelled by-article training frame (articles joined to their
# ground-truth hyperpartisan labels).
df = prepare_data(
    article_path="preprocessing/data/articles-training-byarticle.jsonl",
    truth_path="preprocessing/data/ground-truth-training-byarticle.jsonl",
)



Loading and merging data...


In [None]:
# Build the full feature matrix: TF-IDF text features plus the stylometric and
# sentiment features. X_style and X_all must be defined here because the
# split cell and the feature-inspection cell below read them; with these
# lines commented out a fresh Restart & Run All stops with NameError.
# NOTE(review): the TF-IDF vocabulary is fitted on all rows before the
# train/test split, so held-out documents influence the IDF weights.
X_text, tfidf = vectorize_text(df)
X_style = extract_stylometric_features(df)
X_all = combine_features(X_text, X_style)
# scikit-learn rejects frames whose column labels mix ints and strings.
X_all.columns = X_all.columns.astype(str)




Vectorizing text with TF-IDF...


In [None]:
# Hold out 30% of the rows for evaluation, stratified on the label; the fixed
# random_state makes the split repeatable.
# To reuse one split across sessions, split once and persist it with joblib.dump().
labels = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)



In [None]:
# Fit the classifier, print its held-out report, then list the strongest features.
model = train_and_evaluate(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
show_top_features(model=model, tfidf=tfidf, X_style=X_style)