IMPORTS

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

READ DATASETS

In [2]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [3]:
print(train.shape)
print(f"No target: {train[train["target"] == 0]["text"].values[0]}")
print(f"Target: {train[train["target"] == 1]["text"].values[0]}")

(7613, 5)
No target: What's up man?
Target: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


PREPROCESSING

In [4]:
train['location'].dropna()

31                         Birmingham
32      Est. September 2012 - Bristol
33                             AFRICA
34                   Philadelphia, PA
35                         London, UK
                    ...              
7575                               TN
7577           #NewcastleuponTyne #UK
7579                Vancouver, Canada
7580                          London 
7581                          Lincoln
Name: location, Length: 5080, dtype: object

In [6]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

def text_preprocessing(df):
    df['text_cleaned'] = df['text'].copy()
    # Remove links
    df['text_cleaned'] = df['text_cleaned'].str.replace(r'http\S+|www\.\S+', '', regex=True)
    # Remove mentions
    df['text_cleaned'] = df['text_cleaned'].str.replace(r'@\w+', '', regex=True)
    # Remove special caracters
    df['text_cleaned'] = df['text_cleaned'].str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)
    # Remove trailing sapces
    df['text_cleaned'] = df['text_cleaned'].str.strip()
    # Remove multiple spaces
    df['text_cleaned'] = df['text_cleaned'].str.replace(r'\s+', ' ', regex=True)
    # Lower
    df['text_cleaned'] = df['text_cleaned'].str.lower()
    return df
    
def extract_cities_countries_continents(location):
    continents = ["africa", "europe", "asia", "north america", "south america", "antartica", "oceania", "Nigeria", "Kenya"]
    countries = [
        ('france', 'europe'),
        ('united states', 'north america'),
        ('united kingdom', 'europe'),
        ('germany', 'europe'),
        ('india', 'asia'),
        ('australia', 'oceania'),
        ('canada', 'north america'),
        ('japan', 'asia'),
        ('indonesia', 'asia'),
        ('ireland', 'europe'),
    ]
    cities = [
        ('new york', 'united states', "north america"),
        ('london', 'united kingdom', "europe"),
        ('los angeles', 'united states', "north america"),
        ('mumbai', 'india', "asia"),
        ('washington', 'united states', "north america"),
        ('chicago', 'united states', "north america"),
        ('san francisco', 'united states', "north america"),
        ('toronto', 'canada', "north america"),
        ('seattle', 'united states', "north america"),
        ('atlanta', 'united states', "north america"),
        ('Nashville', 'united states', "north america"),
    ]

    location = location.replace('ny', 'new york')
    location = location.replace('nyc', 'new york')
    location = location.replace('new york city', 'new york')
    location = location.replace('la', 'los angeles')

    location = location.replace('uk', 'united kingdom')
    location = location.replace('usa', 'united states')

    
    res = "unknown"
    for city, country, continent in cities:
        if city in location:
            return pd.Series([city, country, continent])
    for country, continent in countries:
        if country in location:
            return pd.Series(["unknown", country, continent])
    for continent in continents:
        if continent in location:
            return pd.Series(["unknown", "unknown", continent])
        
    return pd.Series(["unknown", "unknown", "unknown"])

def location_preprocessing(df):
    df["location_cleaned"] = df["location"].copy()
    # Lower
    df["location_cleaned"] = df["location_cleaned"].str.lower()
    # Remove trailing sapces
    df['location_cleaned'] = df['location_cleaned'].str.strip()
    # Fill NaN values
    df["location_cleaned"] = df["location_cleaned"].fillna("unknown")
    # Replace everywhere / worlwide by unknown
    df["location_cleaned"] = df["location_cleaned"].str.replace("everywhere", "unknown")
    df["location_cleaned"] = df["location_cleaned"].str.replace("worlwide", "unknown")
    df["location_cleaned"] = df["location_cleaned"].str.replace("earth", "unknown")
    # Extract continents
    df[["city", "country", "continent"]] = df["location_cleaned"].apply(extract_cities_countries_continents)
    # Post
    
    return df

def keyword_preprocessing(df):
    df["keyword_cleaned"] = df["keyword"].copy()
    # Lower
    df["keyword_cleaned"] = df["keyword_cleaned"].str.lower()
    # Remove trailing sapces
    df['keyword_cleaned'] = df['keyword_cleaned'].str.strip()
    # Fill NaN values
    df["keyword_cleaned"] = df["keyword_cleaned"].fillna("unknown")
    # Replace %20 by spaces
    df["keyword_cleaned"] = df["keyword_cleaned"].str.replace("%20", " ")
    # Create a column to indicate if the keyword is relevant
    df["keyword_in_text"] = df.apply(lambda row: row["keyword_cleaned"] in row["text_cleaned"],axis=1)
    
    return df

train = text_preprocessing(train)
test = text_preprocessing(test)

train = location_preprocessing(train)
test = location_preprocessing(test)

train = keyword_preprocessing(train)
test = keyword_preprocessing(test)

In [7]:
train["keyword_in_text"].value_counts()

keyword_in_text
True     6634
False     979
Name: count, dtype: int64

In [8]:
train.dropna(subset=["keyword"])

Unnamed: 0,id,keyword,location,text,target,text_cleaned,location_cleaned,city,country,continent,keyword_cleaned,keyword_in_text
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,wholesale markets ablaze,birmingham,unknown,unknown,unknown,ablaze,True
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,we always try to bring the heavy metal rt,est. september 2012 - bristol,unknown,unknown,unknown,ablaze,False
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,africanbaze breaking newsnigeria flag set abla...,africa,unknown,unknown,africa,ablaze,True
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,crying out for more set me ablaze,"philadelphia, pa",los angeles,united states,north america,ablaze,True
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,on plus side look at the sky last night it was...,"london, uk",london,united kingdom,europe,ablaze,True
...,...,...,...,...,...,...,...,...,...,...,...,...
7578,10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0,and i wrecked you both,unknown,unknown,unknown,unknown,wrecked,True
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0,three days off from work and theyve pretty muc...,"vancouver, canada",unknown,canada,north america,wrecked,True
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0,fx forex trading cramer igers 3 words that wre...,london,london,united kingdom,europe,wrecked,True
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0,great atmosphere at the british lion gig tonig...,lincoln,unknown,unknown,unknown,wrecked,True


In [9]:
tfidf_vetorizer = TfidfVectorizer()
train_vectors = tfidf_vetorizer.fit_transform(train["text_cleaned"])
test_vectors = tfidf_vetorizer.transform(test["text_cleaned"])

In [10]:
train_vectors.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], shape=(7613, 15741))

MODEL SELECTION

In [11]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.svm import LinearSVC

from sklearn. model_selection import cross_val_score

In [12]:
# Define your models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(),
    "Ridge Classifier": RidgeClassifier(),
    "Passive Aggressive": PassiveAggressiveClassifier(),
    "SGD (Log Loss)": SGDClassifier(loss='log_loss'),
}

# Columns
text_col = 'text_cleaned'
city_col = 'city'
country_col = 'country'
continent_col = 'continent'
keyword_col = 'keyword'
keyword_in_text_col = 'keyword_in_text'

for name, model in models.items():
    print(f"\n=== {name} ===")
    
    # Preprocessing pipeline for current model
    preprocessor = ColumnTransformer(transformers=[
        ('tfidf', TfidfVectorizer(), text_col),
        ('city_ohe', OneHotEncoder(handle_unknown='ignore'), [city_col]),
        ('country_ohe', OneHotEncoder(handle_unknown='ignore'), [country_col]),
        ('continent_ohe', OneHotEncoder(handle_unknown='ignore'), [continent_col]),
        ('keyword_ohe', OneHotEncoder(handle_unknown='ignore'), [keyword_col]),
        ('keyword_in_text_ohe', OneHotEncoder(handle_unknown='ignore'), [keyword_in_text_col]),

    ])
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    scores = cross_val_score(pipeline, train, train["target"], cv=5, scoring="f1")
    print(f"{name} F1 scores: {scores}")
    print(f"{name} Mean F1: {scores.mean():.4f}")


=== Logistic Regression ===
Logistic Regression F1 scores: [0.51821862 0.43544304 0.54586808 0.53846154 0.66904932]
Logistic Regression Mean F1: 0.5414

=== Linear SVM ===
Linear SVM F1 scores: [0.56428571 0.44115355 0.57894737 0.56872038 0.67104399]
Linear SVM Mean F1: 0.5648

=== Ridge Classifier ===
Ridge Classifier F1 scores: [0.56505576 0.44217152 0.56735567 0.57027464 0.67204301]
Ridge Classifier Mean F1: 0.5634

=== Passive Aggressive ===
Passive Aggressive F1 scores: [0.53776978 0.4169279  0.56       0.56498873 0.64190981]
Passive Aggressive Mean F1: 0.5443

=== SGD (Log Loss) ===
SGD (Log Loss) F1 scores: [0.57070279 0.42517007 0.55322339 0.54637437 0.68082368]
SGD (Log Loss) Mean F1: 0.5553


LSTM

In [25]:
!python --version

Python 3.13.1


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split


ModuleNotFoundError: No module named 'tensorflow'

ANALYZE ERRORS

In [188]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [219]:
# Split
X_train, X_test, y_train, y_test = train_test_split(train[["keyword", "location", "text_cleaned"]], train["target"], test_size=0.2, random_state=42)

# Columns
text_col = 'text_cleaned'
keyword_col = 'keyword'
location_col = 'location'

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(), text_col),
        ('keyword_ohe', OneHotEncoder(handle_unknown='ignore'), [keyword_col]),
        ('location_ohe', OneHotEncoder(handle_unknown='ignore'), [location_col]),
    ],
    remainder='drop'  # drop other columns
)

# Full pipeline with classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Fit on training data (pandas DataFrame)
pipeline.fit(X_train, y_train)

# Predict on test data
y_pred = pipeline.predict(X_test)

results = pd.DataFrame({
    "keyword": X_test["keyword"],
    "location": X_test["location"],
    "text": X_test["text_cleaned"],
    "true_label": y_test,
    "predicted": y_pred
})

In [220]:
false_positive = results[(results["true_label"] == 0) & (results["predicted"] == 1)]
true_positive = results[(results["true_label"] == 1) & (results["predicted"] == 1)]
false_negative = results[(results["true_label"] == 1) & (results["predicted"] == 0)]
true_negative = results[(results["true_label"] == 0) & (results["predicted"] == 0)]

print(f" false positive: {false_positive.shape}")
print(f" true positive: {true_positive.shape}")
print(f" false negative: {false_negative.shape}")
print(f" true negative: {true_negative.shape}")

 false positive: (139, 5)
 true positive: (463, 5)
 false negative: (186, 5)
 true negative: (735, 5)


In [223]:
false_positive["keyword"].value_counts()

keyword
hazardous        5
crashed          4
flooding         4
trauma           4
floods           3
                ..
evacuation       1
emergency        1
bombed           1
mass%20murder    1
crash            1
Name: count, Length: 77, dtype: int64

In [224]:
true_positive.head()

Unnamed: 0,keyword,location,text,true_label,predicted
5448,police,UK,dt rt the col police can catch a pickpocket in...,1,1
4398,hijacking,"Athens,Greece",the murderous story of americas first hijacking,1,1
1807,crash,In my own world!!!,akilah world news cop pulls man from car to av...,1,1
2164,debris,,malaysia airlines flight 370 that disappeared ...,1,1
3044,earthquake,"Melbourne, Australia",nepal earthquake 3 months on women fear abuse via,1,1


In [None]:
# Texts with links
print(false_positive[false_positive["text"].str.contains("http://")].shape)
print(true_positive[true_positive["text"].str.contains("http://")].shape)

(80, 3)
(350, 3)


RESULTS

In [57]:
sample_submission = pd.read_csv("sample_submission_template.csv")

In [58]:
sample_submission["target"] = clf.predict(test_vectors)

In [60]:
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [61]:
sample_submission.to_csv("submission.csv", index=False)