# EDA

In [600]:
# libraries
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from collections import defaultdict
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

In [601]:
# read train.csv
df = pd.read_csv('./twitter/train.csv')
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## Basic

In [602]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [603]:
df.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [604]:
df.dtypes

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

In [605]:
# check for missing values
missing_per = 100*df.isna().sum()/len(df)
missing_per = round(missing_per,1).astype(str) + '%'
missing_per

id           0.0%
keyword      0.8%
location    33.3%
text         0.0%
target       0.0%
dtype: object

## Targets

In [606]:
# distribution of target variable
df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

## Targets and Independent Variables(Locations and Keywords)


In [607]:
# Locations
print(df['location'].value_counts().head(10))
print(df['location'].value_counts().tail(10))

location
USA                104
New York            71
United States       50
London              45
Canada              29
Nigeria             28
UK                  27
Los Angeles, CA     26
India               24
Mumbai              22
Name: count, dtype: int64
location
Waco, Texas                     1
todaysbigstock.com              1
buenos aires argentina          1
everydaynigerian@gmail.com      1
Surulere Lagos,Home Of Swagg    1
MontrÌ©al, QuÌ©bec              1
Montreal                        1
ÌÏT: 6.4682,3.18287             1
Live4Heed??                     1
Lincoln                         1
Name: count, dtype: int64


In [608]:
# Percentage of disaster tweets with and without location
df_location = df[df['location'].notna()]
df_location_nan = df[df['location'].isna()]
print(f"{round(len(df_location[df_location['target']== 1])/len(df_location),2)} is the percentage of disaster tweets with location")
print(f"{round(len(df_location_nan[df_location_nan['target']== 1])/len(df_location_nan),2)} is the percentage of disaster tweets without location")

0.43 is the percentage of disaster tweets with location
0.42 is the percentage of disaster tweets without location


In [609]:
# Keywords
print(df['keyword'].value_counts().head(10))
print(df['keyword'].value_counts().tail(10))

keyword
fatalities     45
deluge         42
armageddon     42
sinking        41
damage         41
harm           41
body%20bags    41
outbreak       40
evacuate       40
fear           40
Name: count, dtype: int64
keyword
volcano                  27
battle                   26
bush%20fires             25
war%20zone               24
rescue                   22
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, dtype: int64


In [610]:
# Percentage of disaster tweets with and without % in keywords
df_keyword_normal = df[~df['keyword'].str.contains('%', regex=False, na=False)]
df_keyword_special = df[df['keyword'].str.contains('%', regex=False, na=False)]
print(f"{round(len(df_keyword_normal[df_keyword_normal['target']== 1])/len(df_keyword_normal),10)} is the percentage of disaster tweets without % in keyword")
print(f"{round(len(df_keyword_special[df_keyword_special['target']== 1])/len(df_keyword_special),10)} is the percentage of disaster tweets with % in keyword")

0.4018300248 is the percentage of disaster tweets without % in keyword
0.5836909871 is the percentage of disaster tweets with % in keyword


df[per_in_keyword"] = 1 if df[~df['keyword'].str.contains('%', regex=False, na=False)] else 0

# Feature Engineering

In [611]:
contains_percent = df['keyword'].str.contains('%', regex=False, na=False)

# Convert boolean True/False to integer 1/0 and assign to new column
df['per_in_keyword'] = contains_percent.astype(int)

## NLP

### Cleaning 

In [612]:
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace(r'@\w+', '', regex=True)        # Remove @mentions
df['text'] = df['text'].str.replace(r'#\w+', '', regex=True)        # Remove hashtags
df['text'] = df['text'].str.replace(r'http\S+', '', regex=True)     # Remove URLs
df['text'] = df['text'].str.replace(r'[^a-z\s]', '', regex=True)    # Remove punctuation/numbers
df['text'] = df['text'].str.replace(r'[\u00A0\u3000]', ' ', regex=True) # Remove special space types
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True)        # Normalize all whitespace
df['text'] = df['text'].str.strip()                                 # Final trim

In [613]:
# Check for leading/trailing spaces in any row
print(df['text'].apply(lambda x: x.startswith(' ') or x.endswith(' ')).sum())

# Check for multiple spaces inside any row
print(df['text'].apply(lambda x: '  ' in x).sum())

0
0


### Tokenize 

In [614]:
nltk.download('punkt')
df['tokens'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Stopwords

In [615]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [w for w in tokens if w not in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lemitizing

In [616]:
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens]
df['tokens'] = df['tokens'].apply(lemmatize_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Train_Test_Split,Modeling and Evaluation

In [617]:
# Turn list of tokens into text
df['clean_text'] = df['tokens'].apply(lambda x: ' '.join(x))

X = df[['clean_text','per_in_keyword']]
y = df['target']  # or your label column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#TF-IDF

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train['clean_text'])
X_test_tfidf = vectorizer.transform(X_test['clean_text'])

model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"{round(f1,2)} is a f1 score of the base model")

0.79 is a f1 score of the base model


In [618]:
# Try different models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": LinearSVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

my_dict = defaultdict(list)
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"{name} - F1 Score: {f1:.4f}")
    print(str(name))
    my_dict[str(name)].append(f1)

# Convert defaultdict to regular dict (optional)
results = dict(my_dict)

# Find the model with the highest F1 score
best_model = None
best_f1 = -1

for model_name, f1_list in results.items():
    # Assuming each list has one or more scores, take max
    max_f1 = max(f1_list)
    if max_f1 > best_f1:
        best_f1 = max_f1
        best_model = model_name

print(f"Best model: {best_model} with F1 score: {best_f1:.4f}")

Logistic Regression - F1 Score: 0.7913
Logistic Regression
Naive Bayes - F1 Score: 0.7988
Naive Bayes
Random Forest - F1 Score: 0.7763
Random Forest
SVM - F1 Score: 0.7677
SVM


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - F1 Score: 0.7687
XGBoost
Best model: Naive Bayes with F1 score: 0.7988


## Grid Search

In [619]:
# 1. Train base Naive Bayes model
base_nb = MultinomialNB()
base_nb.fit(X_train_tfidf, y_train)
y_pred_base = base_nb.predict(X_test_tfidf)
base_f1 = f1_score(y_test, y_pred_base, average='weighted')
print(f"📦 Base Naive Bayes F1 Score: {base_f1:.4f}")

# 2. GridSearchCV with expanded alpha range
param_grid = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]
}
grid = GridSearchCV(
    MultinomialNB(),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train_tfidf, y_train)

# 3. Best model evaluation on test set
best_nb = grid.best_estimator_
y_pred_grid = best_nb.predict(X_test_tfidf)
grid_f1 = f1_score(y_test, y_pred_grid, average='weighted')
print(f"GridSearchCV Best Params: {grid.best_params_}")
print(f"GridSearch Naive Bayes F1 Score on Test: {grid_f1:.4f}")

📦 Base Naive Bayes F1 Score: 0.7988
Fitting 5 folds for each of 7 candidates, totalling 35 fits
GridSearchCV Best Params: {'alpha': 1.0}
GridSearch Naive Bayes F1 Score on Test: 0.7988


# Test

In [620]:
df_test = pd.read_csv('./twitter/test.csv')
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


## Preprocessing

In [621]:
contains_percent = df_test['keyword'].str.contains('%', regex=False, na=False)

# Convert boolean True/False to integer 1/0 and assign to new column
df_test['per_in_keyword'] = contains_percent.astype(int)

df_test['text'] = df_test['text'].str.lower()
df_test['text'] = df_test['text'].str.replace(r'@\w+', '', regex=True)        # Remove @mentions
df_test['text'] = df_test['text'].str.replace(r'#\w+', '', regex=True)        # Remove hashtags
df_test['text'] = df_test['text'].str.replace(r'http\S+', '', regex=True)     # Remove URLs
df_test['text'] = df_test['text'].str.replace(r'[^a-z\s]', '', regex=True)    # Remove punctuation/numbers
df_test['text'] = df_test['text'].str.replace(r'[\u00A0\u3000]', ' ', regex=True) # Remove special space types
df_test['text'] = df_test['text'].str.replace(r'\s+', ' ', regex=True)        # Normalize all whitespace
df_test['text'] = df_test['text'].str.strip()                                 # Final trim

# Optional checks (can remove later)
print(df_test['text'].apply(lambda x: x.startswith(' ') or x.endswith(' ')).sum())
print(df_test['text'].apply(lambda x: '  ' in x).sum())

# Tokenization
nltk.download('punkt')
df_test['tokens'] = df_test['text'].apply(word_tokenize)

# Stopword removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df_test['tokens'] = df_test['tokens'].apply(lambda tokens: [w for w in tokens if w not in stop_words])

# Lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens]
df_test['tokens'] = df_test['tokens'].apply(lemmatize_tokens)

# Join back into a cleaned string
df_test['clean_text'] = df_test['tokens'].apply(lambda x: ' '.join(x))


0
0


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/suzukikenta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [622]:
# Create a submission file
final_model = models[best_model]

X_final_test = vectorizer.transform(df_test['clean_text'])
y_test_pred = final_model.predict(X_final_test)
submission = df_test[['id']].copy()
submission['target'] = y_test_pred

submission.to_csv('submission.csv', index=False)
print("✅ Predictions saved to 'submission.csv'")

✅ Predictions saved to 'submission.csv'
