In [3]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize

# Natural Language Processing (NLP) libraries
import nltk
from nltk.corpus import stopwords

# Scikit-learn modeling libraries
from sklearn.dummy import DummyClassifier # For baseline model
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text to numbers
from sklearn.linear_model import LogisticRegression # The classifier model
from sklearn.metrics import accuracy_score, classification_report # For evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score # For splitting and validating
from sklearn.pipeline import Pipeline # To chain processing steps

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# 1. Data Loading

In [4]:
def load_jsonl_flat(path):
    """Read a JSON Lines file into a flat DataFrame.

    Every non-blank line of the UTF-8 file at ``path`` is parsed as one JSON
    document. ``json_normalize`` then expands nested objects into
    dot-separated columns (e.g. ``extended_tweet.full_text``).
    """
    parsed_rows = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Whitespace-only lines carry no record and are skipped.
            if not raw_line.strip():
                continue
            parsed_rows.append(json.loads(raw_line))
    return json_normalize(parsed_rows)

# --- Read the raw training file into a flattened DataFrame ---
train_data = load_jsonl_flat('../data/raw/train.jsonl')

# --- Read the raw Kaggle test file the same way ---
kaggle_data = load_jsonl_flat('../data/raw/kaggle_test.jsonl')

# --- Training set: pull out the target, keep every other column as a feature ---
y_train = train_data['label']
X_train = train_data.drop('label', axis=1)

# --- Kaggle set: used as-is, no 'label' column is dropped here ---
# NOTE: X_kaggle is the same object as kaggle_data, not a copy.
X_kaggle = kaggle_data

# 2. Transforming into DataFrame

In [None]:
# Return the complete text of one tweet record.
# Truncated tweets keep their full version in 'extended_tweet.full_text'.
def extract_full_text(tweet):
    """Return the untruncated text of one tweet.

    Parameters
    ----------
    tweet : mapping-like (a pandas Series row or a dict)
        Must provide a 'text' entry; may provide 'extended_tweet.full_text'.

    Returns
    -------
    The value of 'extended_tweet.full_text' when that entry exists and is not
    missing (None/NaN), otherwise the value of 'text'.

    Raises
    ------
    KeyError
        If the 'text' entry itself is absent.
    """
    # .get() instead of [] so that data containing no extended tweet at all
    # (the flattened frame then has no such column) falls back to 'text'
    # rather than raising KeyError.
    extended = tweet.get('extended_tweet.full_text')
    if extended is None or pd.isna(extended):
        return tweet['text']
    return extended

# Build the 'full_text' column row by row (axis=1), first for the training
# features and then for the Kaggle features, with the same extraction rule.
for tweets_frame in (X_train, X_kaggle):
    tweets_frame['full_text'] = tweets_frame.apply(extract_full_text, axis=1)

In [6]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words() can read it.
# The call runs on every execution of this cell; when the corpus is already
# installed NLTK only reports that the package is up to date.
nltk.download('stopwords')

# Load a list of common French stop words (e.g., 'le', 'la', 'de').
# This list is later handed to the TF-IDF vectorizer as its stop_words.
french_stop_words = stopwords.words('french')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/milapopovic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 3. Improved Logistic Regression (LR) with metadata features

In [None]:
# --------------------------
# Extract metadata features
# --------------------------
def extract_features(df):
    """Add simple text-metadata columns derived from ``df['full_text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'full_text' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with five extra integer
        columns: 'tweet_length' (characters), 'word_count'
        (whitespace-separated tokens), 'hashtags_count' ('#' occurrences),
        'mentions_count' ('@' occurrences) and 'urls_count' ('http'
        occurrences, so 'https' links are counted as well).

    Notes
    -----
    Missing text (None/NaN) is treated as an empty string, so all five values
    are 0 for such rows. Stringifying the missing value directly would
    instead measure the literal 'nan' / 'None'.
    """
    df = df.copy()

    # Normalise once: missing -> '', anything else -> str. This helper series
    # is used for counting only; the 'full_text' column itself is untouched.
    text = df['full_text'].map(lambda value: '' if pd.isna(value) else str(value))

    df['tweet_length'] = text.str.len()
    df['word_count'] = text.str.split().str.len()
    # Series.str.count takes a regex; these patterns contain no regex
    # metacharacters, so the result equals plain substring counting.
    df['hashtags_count'] = text.str.count('#')
    df['mentions_count'] = text.str.count('@')
    df['urls_count'] = text.str.count('http')
    return df

# Metadata-augmented copies of the training and Kaggle feature frames.
X_train_features = extract_features(X_train)
X_kaggle_features = extract_features(X_kaggle)

# --------------------------
# Preprocessing: TF-IDF on the text column, standard scaling on the
# numeric metadata columns
# --------------------------
text_column = 'full_text'
numeric_columns = [
    'tweet_length',
    'word_count',
    'hashtags_count',
    'mentions_count',
    'urls_count',
]

# Text branch: 1- to 3-grams with the French stop-word list; document
# frequency bounds min_df=2 / max_df=0.85 and a 5000-term vocabulary cap.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=5000,
    min_df=2,
    max_df=0.85,
    stop_words=french_stop_words,
)

# text_column is given as a plain string (not a list) for the text branch,
# while the numeric branch receives a list of column names.
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf_vectorizer, text_column),
        ('num', StandardScaler(), numeric_columns),
    ]
)

# --------------------------
# End-to-end model: preprocessing followed by the classifier
# --------------------------
# Logistic regression using the liblinear solver, balanced class weights and
# a fixed random_state.
classifier = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    solver='liblinear',
)

model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('clf', classifier),
    ]
)

# --------------------------
# 5-fold stratified cross-validation, scored by accuracy
# --------------------------
# Folds are shuffled with a fixed random_state.
kfold = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)
scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train_features,
    y=y_train,
    scoring='accuracy',
    cv=kfold,
)

# Per-fold accuracies, then their mean and standard deviation in percent.
print(f"K-Fold Accuracy Scores: {scores}")
print(f"Mean Accuracy: {np.mean(scores)*100:.2f}%")
print(f"Std Dev: {np.std(scores)*100:.2f}%")

# --------------------------
# Train final model on all training data
# --------------------------
# The cross-validation above only scores clones of the pipeline, so the
# pipeline itself still has to be fitted before it can predict.
model_pipeline.fit(X_train_features, y_train)
print("Final model trained!")

# --------------------------
# Predict on Kaggle test set
# --------------------------
y_pred_test = model_pipeline.predict(X_kaggle_features)

# Prepare submission.
# IDs and predictions are paired by position: predict() returns one value per
# input row, in row order. Joining them with pd.concat(axis=1) would align on
# index labels instead, which silently mis-pairs or NaN-pads rows whenever the
# feature frame's index is not exactly 0..n-1 (e.g. after filtering rows).
output = pd.DataFrame({
    'ID': X_kaggle_features['challenge_id'].to_numpy(),
    'Prediction': y_pred_test,
})
output.to_csv('logistic_regression_metadata.csv', index=False)
print("Submission saved as logistic_regression_metadata.csv")


K-Fold Accuracy Scores: [0.64645128 0.64180357 0.63783365 0.64364329 0.64495514]
Mean Accuracy: 64.29%
Std Dev: 0.30%
Final model trained!
Submission saved as logistic_regression_metadata.csv
