In [27]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.pipeline import Pipeline
warnings.filterwarnings("ignore")


In [28]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [29]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [30]:
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [31]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [32]:
df_train.isnull().sum(), df_test.isnull().sum() 

(id             0
 keyword       61
 location    2533
 text           0
 target         0
 dtype: int64,
 id             0
 keyword       26
 location    1105
 text           0
 dtype: int64)

In [33]:
df_train.drop(['location','keyword','id'],axis=1,inplace=True)
df_test.drop(['location','keyword','id'],axis=1,inplace=True)

In [34]:
df_train

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [35]:
df_test

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


#### First level of text cleanup removing special characters and unwanted symbols and lowering every word using Regex.

In [36]:
def remove_symbols_low(dfcolumns):
    dfcolumns = dfcolumns.str.replace(r'\W',' ')
    dfcolumns = dfcolumns.str.lower()
    return dfcolumns

In [37]:
df_test.text = remove_symbols_low(df_test.text)
df_test

Unnamed: 0,text
0,just happened a terrible car crash
1,heard about earthquake is different cities s...
2,there is a forest fire at spot pond geese are...
3,apocalypse lighting spokane wildfires
4,typhoon soudelor kills 28 in china and taiwan
...,...
3258,earthquake safety los angeles ûò safety faste...
3259,storm in ri worse than last hurricane my city...
3260,green line derailment in chicago http t co u...
3261,meg issues hazardous weather outlook hwo htt...


In [38]:
df_train.text = remove_symbols_low(df_train.text)
df_train

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake m...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are ...,1
3,13 000 people receive wildfires evacuation or...,1
4,just got sent this photo from ruby alaska as ...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,aria_ahrary thetawniest the out of control w...,1
7610,m1 94 01 04 utc 5km s of volcano hawaii htt...,1
7611,police investigating after an e bike collided ...,1


In [39]:
df_train.loc[5 ,"text"]

' rockyfire update    california hwy  20 closed in both directions due to lake county fire    cafire  wildfires'

In [43]:
def rem_non(data):
    data = re.sub('\n',' ',data)
    data = re.sub('\t',' ',data)
    data = re.sub('https?://\S+|www\.\S+',' ',data)
    return data
remnon = lambda x: rem_non(x)

In [44]:
df_test.text = df_test.text.apply(remnon)
df_train.text =df_train.text.apply(remnon)

In [46]:
df_train.loc[5 ,"text"]

' rockyfire update    california hwy  20 closed in both directions due to lake county fire    cafire  wildfires'

#### Second level of cleaning

###### Cleaning digits 

In [47]:
def num_remove(data):
    data = re.sub('[^a-zA-Z]',' ',data)
    data = data.split()
    return data
df_test.text = df_test.text.apply(num_remove)
df_test

Unnamed: 0,text
0,"[just, happened, a, terrible, car, crash]"
1,"[heard, about, earthquake, is, different, citi..."
2,"[there, is, a, forest, fire, at, spot, pond, g..."
3,"[apocalypse, lighting, spokane, wildfires]"
4,"[typhoon, soudelor, kills, in, china, and, tai..."
...,...
3258,"[earthquake, safety, los, angeles, safety, fas..."
3259,"[storm, in, ri, worse, than, last, hurricane, ..."
3260,"[green, line, derailment, in, chicago, http, t..."
3261,"[meg, issues, hazardous, weather, outlook, hwo..."


In [48]:
#apply it on train test aswell
df_train['text'] = df_train['text'].apply(num_remove)
df_train

Unnamed: 0,text,target
0,"[our, deeds, are, the, reason, of, this, earth...",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[all, residents, asked, to, shelter, in, place...",1
3,"[people, receive, wildfires, evacuation, order...",1
4,"[just, got, sent, this, photo, from, ruby, ala...",1
...,...,...
7608,"[two, giant, cranes, holding, a, bridge, colla...",1
7609,"[aria, ahrary, thetawniest, the, out, of, cont...",1
7610,"[m, utc, km, s, of, volcano, hawaii, http, t, ...",1
7611,"[police, investigating, after, an, e, bike, co...",1


In [49]:
df_train.loc[5 ,"text"]

['rockyfire',
 'update',
 'california',
 'hwy',
 'closed',
 'in',
 'both',
 'directions',
 'due',
 'to',
 'lake',
 'county',
 'fire',
 'cafire',
 'wildfires']

###### through the above method we have tokenize the sentences too.

### importing important specific natural language libraries

In [50]:
import nltk
import gensim
import normalise
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer ,CountVectorizer

In [53]:
#Calling instances of each APIs.
stem = PorterStemmer()
Wordnetlem = WordNetLemmatizer()
countVect = CountVectorizer()
tfidfVect = TfidfVectorizer()

In [84]:
# Removing stop words
sw = stopwords.words('english')
wordsetexc = ['asap','gonna','wanna','bro','lit','more','most','far','cheeky',
              'affair','uff','ceeya','on','in','and','wow','whoah',
              'wooww','astounding','no','yes','km','mile','god','allah','jesus','jesuschrist',
             'oh','oh god','etc','damn','needless','fine','http','www','website','dollar','co',
             'ltd','unit','union','got','heard','nope','little','us','we','our','almighty',
             'might','ought','buy','sell','sent','invite']
for i in wordsetexc:
    sw.append(i)
def remove_sw(data):
    data = [word for word in data if word not in sw]
    return data

In [85]:
df_test.text = df_test.text.apply(lambda x: remove_sw(x))
df_train.text = df_train.text.apply(lambda x: remove_sw(x))

In [86]:
df_test

Unnamed: 0,text
0,"[happened, terrible, car, crash]"
1,"[earthquake, different, city, stay, safe, ever..."
2,"[forest, fire, spot, pond, goose, fleeing, acr..."
3,"[apocalypse, lighting, spokane, wildfire]"
4,"[typhoon, soudelor, kill, china, taiwan]"
...,...
3258,"[earthquake, safety, los, angeles, safety, fas..."
3259,"[storm, ri, worse, last, hurricane, city, amp,..."
3260,"[green, line, derailment, chicago, utbxlcbiuy]"
3261,"[meg, issue, hazardous, weather, outlook, hwo,..."


In [87]:
df_train

Unnamed: 0,text,target
0,"[deed, reason, earthquake, may, forgive]",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[resident, asked, shelter, place, notified, of...",1
3,"[people, receive, wildfire, evacuation, order,...",1
4,"[photo, ruby, alaska, smoke, wildfire, pours, ...",1
...,...,...
7608,"[two, giant, crane, holding, bridge, collapse,...",1
7609,"[aria, ahrary, thetawniest, control, wild, fir...",1
7610,"[utc, volcano, hawaii, zdtoyd, ebj]",1
7611,"[police, investigating, e, bike, collided, car...",1


#### Now moving into some intermediate level of feature engineering part in NLP.

#### Lemmatization of words to convert it into its root words.

In [88]:
def lemmatize(data):
    data = [Wordnetlem.lemmatize(word) for word in data]
    return data

In [89]:
df_train["text"] = df_train["text"].apply(lambda x: lemmatize(x))

In [90]:
df_train

Unnamed: 0,text,target
0,"[deed, reason, earthquake, may, forgive]",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[resident, asked, shelter, place, notified, of...",1
3,"[people, receive, wildfire, evacuation, order,...",1
4,"[photo, ruby, alaska, smoke, wildfire, pours, ...",1
...,...,...
7608,"[two, giant, crane, holding, bridge, collapse,...",1
7609,"[aria, ahrary, thetawniest, control, wild, fir...",1
7610,"[utc, volcano, hawaii, zdtoyd, ebj]",1
7611,"[police, investigating, e, bike, collided, car...",1


In [91]:
df_test["text"] = df_test["text"].apply(lambda x: lemmatize(x))
df_test

Unnamed: 0,text
0,"[happened, terrible, car, crash]"
1,"[earthquake, different, city, stay, safe, ever..."
2,"[forest, fire, spot, pond, goose, fleeing, acr..."
3,"[apocalypse, lighting, spokane, wildfire]"
4,"[typhoon, soudelor, kill, china, taiwan]"
...,...
3258,"[earthquake, safety, los, angeles, safety, fas..."
3259,"[storm, ri, worse, last, hurricane, city, amp,..."
3260,"[green, line, derailment, chicago, utbxlcbiuy]"
3261,"[meg, issue, hazardous, weather, outlook, hwo,..."


#### Tokenizing words in the data set

### Creating a data processing check point

In [92]:
traindata = df_train.copy()
testdata = df_test.copy()

In [93]:
def joinwords(data):
    combined_text = ' '.join(data)
    return combined_text
combined = lambda x: joinwords(x)

In [94]:
traindata["text"] = traindata["text"].apply(combined)

In [95]:
testdata["text"] = testdata["text"].apply(combined)

In [96]:
traindata

Unnamed: 0,text,target
0,deed reason earthquake may forgive,1
1,forest fire near la ronge sask canada,1
2,resident asked shelter place notified officer ...,1
3,people receive wildfire evacuation order calif...,1
4,photo ruby alaska smoke wildfire pours school,1
...,...,...
7608,two giant crane holding bridge collapse nearby...,1
7609,aria ahrary thetawniest control wild fire cali...,1
7610,utc volcano hawaii zdtoyd ebj,1
7611,police investigating e bike collided car portu...,1


In [97]:
testdata

Unnamed: 0,text
0,happened terrible car crash
1,earthquake different city stay safe everyone
2,forest fire spot pond goose fleeing across str...
3,apocalypse lighting spokane wildfire
4,typhoon soudelor kill china taiwan
...,...
3258,earthquake safety los angeles safety fastener ...
3259,storm ri worse last hurricane city amp others ...
3260,green line derailment chicago utbxlcbiuy
3261,meg issue hazardous weather outlook hwo x rbqjhn


### Converting text to Vectors /// Embedding.

##### seperating features and Target label.

In [98]:
X = traindata["text"]
X

0                      deed reason earthquake may forgive
1                   forest fire near la ronge sask canada
2       resident asked shelter place notified officer ...
3       people receive wildfire evacuation order calif...
4           photo ruby alaska smoke wildfire pours school
                              ...                        
7608    two giant crane holding bridge collapse nearby...
7609    aria ahrary thetawniest control wild fire cali...
7610                        utc volcano hawaii zdtoyd ebj
7611    police investigating e bike collided car portu...
7612    latest home razed northern california wildfire...
Name: text, Length: 7613, dtype: object

In [99]:
y = df_train.target
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

##### Introducing testing set as X-test.

In [100]:
X_test = testdata.text
X_test

0                             happened terrible car crash
1            earthquake different city stay safe everyone
2       forest fire spot pond goose fleeing across str...
3                    apocalypse lighting spokane wildfire
4                      typhoon soudelor kill china taiwan
                              ...                        
3258    earthquake safety los angeles safety fastener ...
3259    storm ri worse last hurricane city amp others ...
3260             green line derailment chicago utbxlcbiuy
3261     meg issue hazardous weather outlook hwo x rbqjhn
3262    cityofcalgary activated municipal emergency pl...
Name: text, Length: 3263, dtype: object

#### importing Sklearn Model_selection API

In [101]:
from sklearn.model_selection import train_test_split ,cross_val_score

In [102]:
# We can intially use train_test_split method.
X_train, X_val, y_train, y_val = train_test_split(X,y ,test_size=0.3)

#### Model with Naive-Bayes MultiNomial NB its the most basic ML algo approach for this sort of task.


In [103]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [104]:
multNB = MultinomialNB()

#### Initially we can deploy a Pipeline and if further model improvemnt is needed we can move onto some specific stepwise precision approach.

In [125]:
model = Pipeline([
     ('vectorizer',TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))),
     ('classifier',MultinomialNB())
])

In [126]:
model.fit(X_train,y_train)
score = model.score(X_val,y_val)
score

0.7981611208406305

In [127]:
pred = model.predict(X_val)

In [128]:
pred

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [129]:
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score

In [130]:
confusion_matrix(pred,y_val)

array([[1173,  364],
       [  97,  650]], dtype=int64)

In [131]:
f1_score(pred,y_val)

0.7382169222032936

### As the Accuracy is bit low we can move on to threshold adjustment or AUC_ROC correwction method

In [132]:
predproba = model.predict_proba(X_val)     #[:,1]
predproba

array([[0.71890925, 0.28109075],
       [0.86924234, 0.13075766],
       [0.8665356 , 0.1334644 ],
       ...,
       [0.8467681 , 0.1532319 ],
       [0.90121052, 0.09878948],
       [0.28253625, 0.71746375]])

In [133]:
threshold = model.predict_proba(X_val)[:,1]
threshold

array([0.28109075, 0.13075766, 0.1334644 , ..., 0.1532319 , 0.09878948,
       0.71746375])

###### reviewing accuracy

In [134]:
accuracy_ls = []
for thres in threshold:
    y_pred_review = np.where(threshold > thres ,1,0)
    score = accuracy_score(y_pred_review,y_val ,normalize=True)
    accuracy_ls.append(score)
thres_series = pd.Series(threshold)
accuracy_series = pd.Series(accuracy_ls)
fin_accuracy = pd.concat((thres_series,accuracy_series) ,axis=1)
fin_accuracy.columns = ['threshold','accuracy']
fin_accuracy.sort_values(by='accuracy',ascending=False,inplace=True)

In [135]:
fin_accuracy

Unnamed: 0,threshold,accuracy
1881,0.452984,0.807793
94,0.451963,0.807356
1686,0.451905,0.806918
623,0.453431,0.806918
2257,0.453431,0.806918
...,...,...
1910,0.013775,0.446147
457,0.013775,0.446147
1127,0.008560,0.445271
2228,0.007720,0.444834


#### Final Adjusted prediction

In [136]:
final_adj_pred = np.where(threshold > 0.523374 ,1,0)
final_adj_pred

array([0, 0, 0, ..., 0, 0, 1])

In [137]:
confusion_matrix(final_adj_pred,y_val)

array([[1195,  393],
       [  75,  621]], dtype=int64)

In [138]:
f1_score(final_adj_pred,y_val)

0.7263157894736842

In [139]:
scoreadj = accuracy_score(final_adj_pred,y_val)
scoreadj

0.7950963222416813

##### Eventhough its only a slight improvement in the score its satisfialbe for prediction.

In [140]:
final_test_data_pred = model.predict_proba(X_test)[:,1]
fin_adj_pred = (final_test_data_pred > 0.523374 ).astype(int)
fin_adj_pred

array([1, 0, 1, ..., 1, 1, 1])

### Creating a submission CSV file

In [141]:
dfsubmtest = pd.read_csv("test.csv")

In [142]:
submission =pd.DataFrame({'id':dfsubmtest.id,
                         'target':fin_adj_pred})
submission.set_index('id',inplace=True)

In [143]:
submission

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,0
3,1
9,1
11,1
...,...
10861,1
10865,0
10868,1
10874,1


In [144]:
submission.to_csv("submission.csv")

## This is initial Submit more model improvement and feature engineering is on the way.

In [146]:
s = tfidfVect.fit_transform(df_train)
