In [1]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize

# Natural Language Processing (NLP) libraries
import nltk
from nltk.corpus import stopwords

# Scikit-learn modeling libraries
from sklearn.dummy import DummyClassifier # For baseline model
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text to numbers
from sklearn.linear_model import LogisticRegression # The classifier model
from sklearn.metrics import accuracy_score, classification_report # For evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score # For splitting and validating
from sklearn.pipeline import Pipeline # To chain processing steps

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Make sure the NLTK stop-word corpus is available locally (no-op when it is
# already downloaded).
nltk.download("stopwords")

# French stop words (e.g. 'le', 'la', 'de'); handed to the TF-IDF vectorizer later.
french_stop_words = stopwords.words(fileids="french")

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2025/iuliia.korotkova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def load_jsonl_flat(path):
    """Read a JSON Lines file into a flat DataFrame.

    Each non-blank line of the UTF-8 file at ``path`` is parsed as one JSON
    document; the parsed records are then passed to ``json_normalize`` so that
    nested objects become dotted column names (e.g. ``user.description``).
    """
    parsed_rows = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Skip blank / whitespace-only lines.
            if not raw_line.strip():
                continue
            parsed_rows.append(json.loads(raw_line))
    return json_normalize(parsed_rows)

# Read both raw JSON Lines files into flattened DataFrames.
train_data = load_jsonl_flat('../data/raw/train.jsonl')
kaggle_data = load_jsonl_flat('../data/raw/kaggle_test.jsonl')

# Training target and features: 'label' is the target, every other column is a
# candidate feature.
y_train = train_data['label']
X_train = train_data.drop('label', axis=1)

# The Kaggle frame is used as-is (no 'label' column is removed from it).
# Note that X_kaggle is the same object as kaggle_data, not a copy.
X_kaggle = kaggle_data

In [4]:
# Tweets can be truncated; when they are, the complete version is stored in
# 'extended_tweet.full_text' instead of 'text'.
def extract_full_text(tweet):
    """Return the untruncated text of one flattened tweet record.

    Parameters
    ----------
    tweet : pandas.Series or dict
        A flattened tweet. It must have a 'text' entry and may have an
        'extended_tweet.full_text' entry.

    Returns
    -------
    The value of 'extended_tweet.full_text' when that entry exists and is not
    missing (NaN/None); otherwise the value of 'text'.

    Raises
    ------
    KeyError
        If the record has no 'text' entry.
    """
    # Use .get() rather than [] so a dataset in which no tweet was extended
    # (and flattening therefore never created the column) does not raise
    # KeyError; a missing key is treated like a missing value.
    extended_text = tweet.get('extended_tweet.full_text')
    if pd.isna(extended_text):
        return tweet['text']
    return extended_text

# Build the 'full_text' column row by row (axis=1) for the training frame ...
X_train['full_text'] = X_train.apply(extract_full_text, axis=1)
# ... and in exactly the same way for the Kaggle test frame.
X_kaggle['full_text'] = X_kaggle.apply(extract_full_text, axis=1)

In [5]:
# Drop columns that are entirely missing in the training data, then drop every
# training column that does not also exist in the Kaggle frame.
X_train = X_train.dropna(how='all', axis="columns")
X_train = X_train.drop(X_train.columns.difference(X_kaggle.columns).to_list(), axis=1)

text_column = 'full_text'
categorical_column = 'source'

# Group the remaining columns by the kind of value they hold.
num_columns = X_train.select_dtypes(include=[np.number]).columns.tolist()
# Use the 'bool' dtype string instead of `np.bool`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError), so it is not portable.
bool_columns = X_train.select_dtypes(include=['bool']).columns.tolist()
# Python lists can only be stored in object-dtype columns, so the (slow)
# element-wise isinstance scan is skipped for every other dtype.
list_columns = [
    col
    for col in X_train.columns
    if X_train[col].dtype == object
    and X_train[col].apply(lambda x: isinstance(x, list)).any()
]

# Columns that are removed from the feature frames and never used as features.
unuseful_columns = ["lang", "text", "extended_tweet.full_text", "user.description",
    'retweet_count',
    'favorite_count',
    'quote_count',
    'reply_count',
    'retweeted',
    'favorited',
    'user.default_profile_image',
    'user.protected',
    'user.contributors_enabled'
    ]

# NOTE(review): every numeric column is kept here. If 'challenge_id' (used
# later as the submission ID) is numeric and present in the training data it
# is fed to the model as a feature -- confirm that this is intended.
num_columns = [col for col in num_columns if col not in unuseful_columns]
bool_columns = [col for col in bool_columns if col not in unuseful_columns]
list_columns = [col for col in list_columns if col not in unuseful_columns]

In [6]:
def extract_features(df, num_columns, bool_columns, list_columns, unuseful_columns):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    - ``num_columns``: missing values and +/-infinity are replaced by 0.
    - ``bool_columns``: True/False are mapped to 1/0.
    - ``list_columns``: each value is replaced by its length when it is a
      list, and by 0 otherwise.
    - ``unuseful_columns``: dropped from the result.
    """
    features = df.copy()

    # Numeric columns: neutralise NaN first, then infinities.
    numeric_block = features[num_columns].fillna(0)
    features[num_columns] = numeric_block.replace([np.inf, -np.inf], 0)

    # Boolean columns: encode as integers.
    bool_to_int = {True: 1, False: 0}
    for name in bool_columns:
        features[name] = features[name].map(bool_to_int)

    # List columns: keep only the number of elements.
    def _list_length(value):
        return len(value) if isinstance(value, list) else 0

    for name in list_columns:
        features[name] = features[name].apply(_list_length)

    # Finally remove the columns that must not reach the model.
    return features.drop(columns=unuseful_columns)

In [7]:
# Both frames go through the same cleaning, with column groups derived from
# the training data.
feature_column_groups = (num_columns, bool_columns, list_columns, unuseful_columns)
X_train_features = extract_features(X_train, *feature_column_groups)
X_kaggle_features = extract_features(X_kaggle, *feature_column_groups)

In [8]:
# (rows, columns) of the training and Kaggle feature frames, shown as cell output.
(X_train_features.shape, X_kaggle_features.shape)

((154914, 155), (103380, 175))

In [9]:
# Text branch: TF-IDF over uni- to tri-grams of the tweet text, ignoring French
# stop words, terms in more than 85% of documents, and terms in fewer than 2
# documents; vocabulary capped at 5000 entries.
text_vectorizer = TfidfVectorizer(
    stop_words=french_stop_words,
    ngram_range=(1,3),
    min_df=2,
    max_df=0.85,
    max_features=5000
)

# Categorical branch: one-hot encoding as a dense array; categories not seen
# during fit are encoded as all zeros.
source_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# NOTE(review): bool_columns and list_columns are not routed to any transformer,
# so with remainder='drop' those columns never reach the classifier -- confirm
# that this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', text_vectorizer, text_column),
        ('num', StandardScaler(), num_columns),
        ('cat', source_encoder, [categorical_column]),
    ],
    remainder='drop'
)

In [10]:
# Class-weighted logistic regression using the liblinear solver; max_iter is
# raised to 1000 and the seed is fixed for reproducibility.
classifier = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42
)

# Preprocessing and classifier chained so that both are fit together.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', classifier),
])

In [11]:
# Shuffled, stratified 5-fold split with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of the full pipeline on each fold of the extracted-feature frame.
scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train_features,
    y=y_train,
    scoring='accuracy',
    cv=kfold
)

mean_accuracy = np.mean(scores)
accuracy_std = np.std(scores)

print(f"K-Fold Accuracy Scores (Text + Full Metadata): {scores}")
print(f"Mean Accuracy: {mean_accuracy*100:.2f}%")
print(f"Std Dev: {accuracy_std*100:.2f}%")

K-Fold Accuracy Scores (Text + Full Metadata): [0.79685634 0.79566214 0.79556531 0.7981151  0.79517139]
Mean Accuracy: 79.63%
Std Dev: 0.11%


In [None]:
# Fit the pipeline on the complete training set.
model_pipeline.fit(X_train_features, y=y_train)
print("Final model trained!")

In [None]:
# Predict labels for the Kaggle feature frame with the fitted pipeline.
y_pred_test = model_pipeline.predict(X=X_kaggle_features)

In [None]:
# Single source of truth for the output file, so the confirmation message can
# never disagree with the file that is actually written.
submission_path = 'logistic_regression_metadata.csv'

# Pair IDs and predictions positionally. pd.concat(axis=1) aligns on index
# labels, which would silently misalign rows (and introduce NaNs) if the
# feature frame ever had a non-default index.
output = pd.DataFrame({
    'ID': X_kaggle_features['challenge_id'].to_numpy(),
    'Prediction': y_pred_test,
})
output['ID'] = output['ID'].astype(int)
output.to_csv(submission_path, index=False)
print(f"Submission saved as {submission_path}")