In [47]:
# Import necessary libraries for data manipulation, visualization, and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import math
import time

# Sklearn libraries for machine learning and text processing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# NLTK libraries for text processing (lemmatization, stemming, stopwords, POS tagging)
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Set up visualization
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# Download necessary NLTK resources for text processing
nltk.download('wordnet')  # WordNet for lemmatization
nltk.download('omw-1.4')  # Open Multilingual Wordnet
nltk.download('punkt')  # Tokenizer
nltk.download('stopwords')  # Stopwords for text cleaning
nltk.download('averaged_perceptron_tagger')  # POS tagger for part-of-speech tagging
nltk.download('tagsets_json')  # Tagset resource

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets_json to /root/nltk_data...
[nltk_data]   Package tagsets_json is already up-to-date!


True

In [48]:
df_train = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')

In [49]:
df_train.head(70)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
65,93,ablaze,Birmingham,@nxwestmidlands huge fire at Wholesale markets...,1
66,95,ablaze,San Francisco,@ablaze what time does your talk go until? I d...,0
67,96,accident,CLVLND,'I can't have kids cuz I got in a bicycle acci...,0
68,97,accident,"Nashville, TN",Accident on I-24 W #NashvilleTraffic. Traffic ...,1


In [50]:
df_train.shape

(7613, 5)

In [51]:
df_train = df_train.drop(columns=['location'])

In [52]:
df_train.shape

(7613, 4)

In [53]:
df_train['keyword'] = df_train['keyword'].fillna('')

In [54]:
# Combine 'keyword' and 'text' for better feature representation
df_train['combined_text'] = df_train['keyword'] + ' ' + df_train['text']
df_train.head()

Unnamed: 0,id,keyword,text,target,combined_text
0,1,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake ...
1,4,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are...
3,6,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation o..."
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as...


In [55]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
import string
from nltk.corpus import stopwords

In [57]:
def text_process(text):
  text = text.lower()
  if not isinstance(text, str):
        return ""
  STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
  nopunc = ''.join([char for char in text if char not in string.punctuation])
  return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])


In [58]:
df_train['combined_text'] = df_train['text'].apply(text_process)
df_train.head()

Unnamed: 0,id,keyword,text,target,combined_text
0,1,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us
1,4,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,6,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...


In [59]:
x = df_train['combined_text']
y = df_train['target']

In [60]:
x

Unnamed: 0,combined_text
0,deeds reason earthquake may allah forgive us
1,forest fire near la ronge sask canada
2,residents asked shelter place notified officer...
3,13000 people receive wildfires evacuation orde...
4,got sent photo ruby alaska smoke wildfires pou...
...,...
7608,two giant cranes holding bridge collapse nearb...
7609,ariaahrary thetawniest control wild fires cali...
7610,m194 0104 utc5km volcano hawaii httptcozdtoyd8ebj
7611,police investigating ebike collided car little...


In [61]:
y

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1
...,...
7608,1
7609,1
7610,1
7611,1


In [62]:
x_train, x_val, y_train, y_val= train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

In [63]:
x_train

Unnamed: 0,combined_text
6234,sassy city girl country hunk stranded smoky mo...
326,gods kingdom heavenly govt rule people earth a...
997,mopheme bigstar johnson problem game body bagg...
7269,vixmeldrew sounds like whirlwind life
2189,malaysia confirms plane debris washed reunion ...
...,...
3386,ûïa voluntary evacuation recommended timeû ...
3280,rt calestous tanzania elephant population decl...
305,pbban temporary300 russaky89 armageddon kill f...
1648,petition heartless owner whipped horse collaps...


In [64]:
x_val

Unnamed: 0,combined_text
4863,theeconomist step one get mass murderers portr...
1370,ted cruz fires back jeb amp bush ûïwe lose re...
3521,ûïlittle boyû affected people hiroshima ûò...
178,ambulance sprinter automatic frontline vehicle...
5859,cause play like symphony play till fingers ble...
...,...
6939,princeoffencing frickin summer humidity buildi...
2074,emmerdale ross really dead askcharley
3186,bc costs less sick people using emergency room...
4297,hellfireev jackperu1 one


In [65]:
y_train

Unnamed: 0,target
6234,1
326,0
997,0
7269,0
2189,1
...,...
3386,1
3280,1
305,0
1648,0


In [66]:
vectorizer = TfidfVectorizer(
    max_features = 20000,
    ngram_range = (1,2),
    min_df = 2,
    max_df = 0.95
)

In [67]:
X_train_tfidf = vectorizer.fit_transform(x_train)
X_val_tfidf = vectorizer.transform(x_val)

print("\n TF-IDF Vectorizer vocabulary size: ", len(vectorizer.get_feature_names_out()))
print("Sample features: ", vectorizer.get_feature_names_out()[:20])


 TF-IDF Vectorizer vocabulary size:  9741
Sample features:  ['001116' '001116 utc20150805' '005225' '005225 utc20150805' '0104'
 '0104 utc5km' '010401' '010401 utc20150805' '02' '02 scene' '05'
 '05 1038pm' '06' '06 ani' '075' '0800' '09' '0day' '0day bug' '10']


In [68]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [69]:
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
clf.fit(X_train_tfidf, y_train)
#vectorizer = CountVectorizer(binary=True)
#x_train_dtm = vectorizer.fit_transform(x_train)
#x_val_dtm = vectorizer.transform(x_val)
#print("Shape of x_train_dtm:", x_train_dtm.shape)
#print("Type of x_train_dtm:", type(x_train_dtm))



In [70]:
#clf = make_pipeline(StandardScaler(with_mean= False),
# SGDClassifier(max_iter=1000, tol=1e-3))
#clf.fit(x_train, y_train)

In [71]:
clf.predict(X_val_tfidf)

array([0, 0, 1, ..., 1, 0, 0])

In [72]:
# prompt: evaluate using metric and give a percentage score over 100

score = clf.score(X_val_tfidf, y_val)
percentage_score = score * 100
print(f"Model score: {percentage_score:.2f}%")
#percentage_score = score * 100
#print(f"Model score: {percentage_score:.2f}%")

Model score: 80.50%
