In [None]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
def load_jsonl_flat(path):
    """Load a JSON Lines file and flatten nested structures."""
    with open(path, 'r', encoding='utf-8') as f:
        records = [json.loads(line) for line in f if line.strip()]
    return json_normalize(records)

# --- Load and flatten training data ---
train_data = load_jsonl_flat('../data/raw/train.jsonl')

# --- Load and flatten Kaggle test data ---
kaggle_data = load_jsonl_flat('../data/raw/kaggle_test.jsonl')

# --- Separate features and target for training ---
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']

# --- Features for Kaggle test set ---
X_kaggle = kaggle_data  # Kaggle test set usually has no label

In [3]:
def extract_full_text(tweet):
    text = tweet['text']
    if not pd.isna(tweet['extended_tweet.full_text']):
        text = tweet['extended_tweet.full_text']
    user_description = tweet["user.description"]
    input_text = f"[user description] {user_description}\n\n[tweet content] {text}"
    return input_text

X_train['full_text'] = X_train.apply(lambda tweet: extract_full_text(tweet), axis=1)
X_kaggle['full_text'] = X_kaggle.apply(lambda tweet: extract_full_text(tweet), axis=1)

In [None]:
# drop all na
X_train = X_train.dropna(how='all', axis="columns")

# numerical data
num_columns = X_train.select_dtypes(include=[np.number]).columns.tolist()
X_train[num_columns] = X_train[num_columns].fillna(0).replace([np.inf, -np.inf], 0)

# boolean data
bool_columns = X_train.select_dtypes(include=[np.bool]).columns.tolist()
for column in bool_columns:
    X_train[column] = X_train[column].map({True: 1, False: 0})

# list data
list_columns = []
for col in X_train.columns:
    if X_train[col].apply(lambda x: isinstance(x, list)).any():
        list_columns.append(col)
        X_train[col] = X_train[col].apply(lambda x: len(x) if isinstance(x, list) else 0)

# unuseful data
unuseful_columns = ["lang", "text", "extended_tweet.full_text", "user.description",
 'retweet_count',
 'favorite_count',
 'quote_count',
 'reply_count',
 'retweeted',
 'favorited',
 'user.default_profile_image',
 'user.protected',
 'user.contributors_enabled'
 ]
X_train = X_train.drop(unuseful_columns, axis=1)

In [None]:
meta_scaler = StandardScaler()
meta_scaler.fit_transform(X_train[num_columns])

In [None]:
X_train.drop(list_columns + num_columns + bool_columns + unuseful_columns, axis=1).info()

In [6]:
X_metadata = X_train[list_columns + num_columns + bool_columns]

In [8]:
X_metadata[num_columns]

Unnamed: 0,in_reply_to_status_id,quoted_status_id,retweet_count,in_reply_to_user_id,favorite_count,quote_count,reply_count,challenge_id,quoted_status.in_reply_to_status_id,quoted_status.retweet_count,...,quoted_status.user.friends_count,quoted_status.user.listed_count,quoted_status.user.favourites_count,quoted_status.user.id,quoted_status.user.statuses_count,quoted_status.user.followers_count,user.listed_count,user.favourites_count,user.statuses_count,quoted_status.quoted_status_id
0,0.000000e+00,1.372171e+18,0,0.000000e+00,0,0,0,1,0.0,43.0,...,747.0,2723.0,427.0,492648852.0,23714.0,1338833.0,5,14154,333,0.000000e+00
1,0.000000e+00,1.372171e+18,0,0.000000e+00,0,0,0,3,0.0,46.0,...,747.0,2723.0,427.0,492648852.0,23714.0,1338833.0,1,8582,3028,0.000000e+00
2,1.372164e+18,0.000000e+00,0,1.066820e+18,0,0,0,5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,27,1229,4238,0.000000e+00
3,0.000000e+00,0.000000e+00,0,0.000000e+00,0,0,0,6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,92,19,1152,0.000000e+00
4,1.372165e+18,0.000000e+00,0,1.522354e+09,0,0,0,7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,1375,1252,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154909,0.000000e+00,1.376940e+18,0,0.000000e+00,0,0,0,258289,0.0,218.0,...,848.0,239.0,3700.0,558730532.0,6073.0,27599.0,0,2513,1515,1.376895e+18
154910,0.000000e+00,0.000000e+00,0,0.000000e+00,0,0,0,258290,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,16,1137,0.000000e+00
154911,0.000000e+00,0.000000e+00,0,0.000000e+00,0,0,0,258291,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10,7674,86547,0.000000e+00
154912,0.000000e+00,0.000000e+00,0,0.000000e+00,0,0,0,258292,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,1397,867,0.000000e+00


In [9]:
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Correlation-based
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X_metadata, y_train)
selected_features = X_metadata.columns[selector.get_support()]

# Mutual information (captures non-linear relationships)
mi_scores = mutual_info_classif(X_metadata, y_train)

  f = msb / msw


In [10]:
selected_features

Index(['entities.user_mentions', 'extended_tweet.extended_entities.media',
       'extended_tweet.entities.media', 'in_reply_to_status_id',
       'user.favourites_count', 'user.statuses_count', 'user.geo_enabled',
       'user.profile_background_tile', 'user.profile_use_background_image',
       'user.default_profile'],
      dtype='object')

In [12]:
selector.scores_

array([1.54669246e+02, 1.10740059e-02, 3.84945275e+01, 3.45921783e+00,
       1.16093843e+02, 1.98629048e+02, 3.46566600e+00, 3.14875362e+01,
       1.14293059e-01, 2.20511365e+01, 8.39596779e+02, 6.54396743e+03,
       2.74106861e+01, 1.34030514e+03, 2.43208789e+03, 7.65239958e+02,
       1.94044049e+03, 1.41906544e+01, 1.44712496e+01, 3.03126624e+01,
       2.35703660e+01, 1.41172590e+01, 3.44027967e+03, 3.44027967e+03,
       5.38785998e+00, 5.38785998e+00, 2.63028229e+01, 5.47792429e+02,
       6.55271065e+02, 7.71719902e+00, 4.83849407e-01, 4.83849407e-01,
       5.32354071e+00, 5.32354071e+00, 1.03010673e+01, 3.24759758e+00,
       7.61538325e+03, 5.37880754e+01,            nan, 2.57629779e+03,
                  nan,            nan,            nan, 2.33500253e-01,
       9.56797743e+00, 1.18932911e+01, 1.01102872e+01, 1.15209465e+01,
       5.36540953e+01, 9.41810189e+01, 1.38788139e+00, 3.36100848e+01,
       1.76888095e+00, 1.82936150e+02, 1.63341111e-01, 1.01589244e+01,
      

In [21]:
X_metadata.columns[[38, 40, 41, 42, 61, 63, 65, 67, 69]].to_list()

['retweet_count',
 'favorite_count',
 'quote_count',
 'reply_count',
 'retweeted',
 'favorited',
 'user.default_profile_image',
 'user.protected',
 'user.contributors_enabled']