In [1]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize

# Natural Language Processing (NLP) libraries
import nltk
from nltk.corpus import stopwords

# Scikit-learn modeling libraries
from sklearn.dummy import DummyClassifier # For baseline model
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text to numbers
from sklearn.linear_model import LogisticRegression # The classifier model
from sklearn.metrics import accuracy_score, classification_report # For evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score # For splitting and validating
from sklearn.pipeline import Pipeline # To chain processing steps

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np



# 1. Data Loading

In [2]:
def load_jsonl_flat(path):
    """Read a JSON Lines file into a flat DataFrame.

    Each non-blank line of the file is parsed as one JSON document. The
    parsed records are then passed to ``json_normalize``, which expands
    nested objects into dotted column names (e.g. ``user.name``).

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of the file.
    """
    parsed_records = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Whitespace-only lines carry no record; skip them.
            if not raw_line.strip():
                continue
            parsed_records.append(json.loads(raw_line))
    return json_normalize(parsed_records)

# --- Load and flatten training data ---
train_data = load_jsonl_flat('../data/raw/train.jsonl')

# --- Load and flatten Kaggle test data ---
kaggle_data = load_jsonl_flat('../data/raw/kaggle_test.jsonl')

# --- Separate features and target for training ---
# `drop` returns a new frame, so `train_data` itself keeps its 'label' column.
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']

# --- Features for Kaggle test set ---
# Copy instead of aliasing: a later cell adds a 'full_text' column to X_kaggle,
# and without the copy that write would also modify the raw `kaggle_data` frame.
X_kaggle = kaggle_data.copy()  # Kaggle test set usually has no label

# 2. Text Preparation

In [3]:
# Define a function to get the full text from a tweet object.
# Tweets can be truncated, storing the full version in 'extended_tweet.full_text'.
def extract_full_text(tweet):
    """Return the complete text of one flattened tweet record.

    The value of 'extended_tweet.full_text' is preferred when it is present
    and not missing; otherwise the standard 'text' field is returned.

    Parameters
    ----------
    tweet : pandas.Series or dict
        A single flattened tweet (e.g. one row of the DataFrame). Must
        contain 'text'; 'extended_tweet.full_text' is optional.

    Returns
    -------
    The extended full text if available, else the value of 'text'.

    Raises
    ------
    KeyError
        If the record has no 'text' field and no usable extended text.
    """
    # `.get` returns None when the column is absent altogether (this happens
    # when no record in the loaded file carried an extended tweet), instead of
    # raising KeyError as plain indexing would.
    extended_text = tweet.get('extended_tweet.full_text')

    # pd.isna treats both None and NaN as missing.
    if pd.isna(extended_text):
        return tweet['text']
    return extended_text

# Build the 'full_text' column row by row (axis=1) for the training data
X_train['full_text'] = X_train.apply(extract_full_text, axis=1)
# Same treatment for the Kaggle test data
X_kaggle['full_text'] = X_kaggle.apply(extract_full_text, axis=1)

In [4]:
# Fetch the NLTK stop-word corpus. The recorded output below shows NLTK
# reporting the package as already up-to-date when it is cached locally.
nltk.download('stopwords')

# Load a list of common French stop words (e.g., 'le', 'la', 'de')
# NOTE(review): `french_stop_words` is not referenced anywhere else in the
# visible part of this notebook — confirm it is still needed.
french_stop_words = stopwords.words('french')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/milapopovic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 3. CamemBERT

In [None]:
# Pick the compute backend in order of preference: MPS, then CUDA, then CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print("Using device:", device)


MODEL_NAME = "camembert-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the pretrained encoder, move it to the selected device and switch it to
# evaluation mode: it is only used for inference (embedding extraction) here.
bert_model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()


# ----------------------------------------------------
# CAMEMBERT EMBEDDINGS
# ----------------------------------------------------
def embed_text_camembert(texts, batch_size=16, max_length=128):
    """Return a numpy matrix of CamemBERT embeddings (CLS token).

    Each text is represented by the last-layer hidden state at the first
    token position. Relies on the module-level ``tokenizer``, ``bert_model``
    and ``device`` objects.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed. Any iterable is accepted (list, pandas Series, ...);
        it is materialised into a list once.
    batch_size : int, default 16
        Number of texts sent through the model per forward pass.
    max_length : int, default 128
        Maximum number of tokens per text; longer texts are truncated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), hidden_size)``. For empty input an
        empty ``(0, hidden_size)`` array is returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Slicing below needs a real list; this also lets callers pass a Series.
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises ValueError, so build the empty result explicitly.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        tokens = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = bert_model(**tokens)
            # Keep only the hidden state of the first token of every sequence.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Move to CPU before converting so the result is a plain numpy array.
        all_embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(all_embeddings)


# ----------------------------------------------------
# METADATA EXTRACTION
# ----------------------------------------------------
def extract_features(df):
    """Add hand-crafted metadata columns to a copy of ``df``.

    Reads the columns 'full_text', 'user.statuses_count' and 'user.location'
    and appends:

    - tweet_length: number of characters of the stringified text
    - word_count: number of whitespace-separated tokens
    - hashtags_count / mentions_count / urls_count: occurrences of
      '#', '@' and 'http' in the text
    - total_tweets: copy of 'user.statuses_count'
    - is_location_available: 1 when 'user.location' is neither missing nor
      the empty string, else 0

    The input frame is left untouched; the enriched copy is returned.
    """
    enriched = df.copy()

    # Stringify once so every text feature sees the same str(...) value.
    text = enriched['full_text'].apply(str)

    def count_of(marker):
        """Count occurrences of ``marker`` in each text."""
        return text.apply(lambda s: s.count(marker))

    def has_location(value):
        """Return 1 for a non-missing, non-empty location, else 0."""
        if pd.isna(value):
            return 0
        return 0 if value == '' else 1

    enriched['tweet_length'] = text.apply(len)
    enriched['word_count'] = text.apply(lambda s: len(s.split()))
    enriched['hashtags_count'] = count_of('#')
    enriched['mentions_count'] = count_of('@')
    enriched['urls_count'] = count_of('http')

    # Flagged in the original notebook as the most predictive non-text feature
    enriched['total_tweets'] = enriched['user.statuses_count']

    # Simple binary feature
    enriched['is_location_available'] = enriched['user.location'].apply(has_location)

    return enriched


# ----------------------------------------------------
# APPLY FEATURE ENGINEERING
# ----------------------------------------------------
# Metadata columns (all produced by extract_features) fed to the classifier,
# in the order they appear in the final feature matrix.
numeric_columns = [
    'tweet_length',
    'word_count',
    'hashtags_count',
    'mentions_count',
    'urls_count',
    'total_tweets',
    'is_location_available',
]

# Enrich both splits with the engineered metadata columns.
X_train_f = extract_features(X_train)
X_test_f = extract_features(X_kaggle)

# Plain numpy matrices, one row per tweet and one column per metadata feature.
metadata_train = X_train_f[numeric_columns].to_numpy()
metadata_test = X_test_f[numeric_columns].to_numpy()


# ----------------------------------------------------
# EMBED TEXT
# ----------------------------------------------------
print("Embedding training tweets with CamemBERT...")
bert_train = embed_text_camembert(X_train_f["full_text"].to_list())

print("Embedding test tweets with CamemBERT...")
bert_test = embed_text_camembert(X_test_f["full_text"].to_list())


# ----------------------------------------------------
# COMBINE FEATURES
# ----------------------------------------------------
# Place the text embeddings and the metadata side by side (column-wise),
# embeddings first.
X_train_combined = np.concatenate((bert_train, metadata_train), axis=1)
X_test_combined = np.concatenate((bert_test, metadata_test), axis=1)

print("Final feature shape:", X_train_combined.shape)


# ----------------------------------------------------
# TRAIN XGBOOST
# ----------------------------------------------------
# Configure the gradient-boosted classifier and fit it on the combined
# (embedding + metadata) training matrix; `fit` returns the fitted estimator.
clf = XGBClassifier(
    eval_metric="logloss",
    learning_rate=0.05,
    max_depth=5,
    n_estimators=300,        # reduced to avoid kernel crash
    subsample=0.8,
    colsample_bytree=0.8,
).fit(X_train_combined, y_train)

# Predicted labels for the Kaggle test set.
y_pred_test = clf.predict(X_test_combined)


# ----------------------------------------------------
# SAVE PREDICTIONS
# ----------------------------------------------------
# Submission table: integer challenge id plus the predicted label.
output = (
    X_test_f[["challenge_id"]]
    .astype(int)
    .rename(columns={"challenge_id": "ID"})
    .assign(Prediction=y_pred_test)
)

output.to_csv("camembert_metadata_submission.csv", index=False)
print("Saved camembert_metadata_submission.csv")

: 