In [1]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import string
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kandr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


### Preprocessing - 1:
    1. convert sentence to lower case.
    2. Remove numbers if any.
    3. Remove HTML tags
    4. Remove URLS in a sentence.
    5. Remove emojis and other symbols is any.
    6. Remove Punctuation marks.
    
   Apply the function preprocess for both train and test data.

In [4]:
def preprocess1(text):
    text=str(text).lower() #Converts text to lowercase
    text=re.sub('\d+', '', text) #removes numbers
    text=re.sub('\[.*?\]', '', text) #removes HTML tags
    text=re.sub('https?://\S+|www\.\S+', '', text) #removes url
    text=re.sub(r"["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", "", text) #removes emojis
    text=re.sub('[%s]' % re.escape(string.punctuation),'',text) #removes punctuation
    return text

In [5]:
df['clean_text']=df['text'].apply(preprocess1)
df_test['clean_text']=df_test['text'].apply(preprocess1)

df.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


### Stop Words:

One of the major forms of pre-processing is to filter out useless data. In natural language processing, useless words (data), are referred to as stop words.
Stop words contains words such as a, an, the, is, he, she etc.

In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop=set(stopwords.words('english'))
stop.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming:

Conversion of words to their root words.

Eg: closeness, closing -> close
    updated, updating -> updat
    
If you observe the root word 'updat' doesn't have meaning. The correct root word for updated might be 'update' for perfect meaning. So, to convert a word to meaningful root word, Lemmatization is used.

In [7]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
ps = PorterStemmer()

In [8]:
def stemming(text):
    stem_strings=list(map(lambda y: [ps.stem(word) for word in word_tokenize(y) if word not in stop],df['clean_text']))
    return stem_strings

In [9]:
stemming(df['clean_text'])

[['deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'us'],
 ['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada'],
 ['resid',
  'ask',
  'shelter',
  'place',
  'notifi',
  'offic',
  'evacu',
  'shelter',
  'place',
  'order',
  'expect'],
 ['peopl', 'receiv', 'wildfir', 'evacu', 'order', 'california'],
 ['got',
  'sent',
  'photo',
  'rubi',
  'alaska',
  'smoke',
  'wildfir',
  'pour',
  'school'],
 ['rockyfir',
  'updat',
  'california',
  'hwi',
  'close',
  'direct',
  'due',
  'lake',
  'counti',
  'fire',
  'cafir',
  'wildfir'],
 ['flood',
  'disast',
  'heavi',
  'rain',
  'caus',
  'flash',
  'flood',
  'street',
  'manit',
  'colorado',
  'spring',
  'area'],
 ['im', 'top', 'hill', 'see', 'fire', 'wood'],
 ['there', 'emerg', 'evacu', 'happen', 'build', 'across', 'street'],
 ['im', 'afraid', 'tornado', 'come', 'area'],
 ['three', 'peopl', 'die', 'heat', 'wave', 'far'],
 ['haha',
  'south',
  'tampa',
  'get',
  'flood',
  'hah',
  'wait',
  'second',
  'live',
  

### Preprocess - 2
    Lemmatize the words to their root words with the below function. The root words will have a proper meaning.

In [10]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kandr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
lemma=WordNetLemmatizer()
def preprocess2(text):
    final_text=text.apply(lambda x: ' '.join(lemma.lemmatize(word) for word in x.split(' ') if word not in stop))
    return final_text

In [12]:
df['final']=preprocess2(df['clean_text'])
df_test['final']=preprocess2(df_test['clean_text'])

In [13]:
df.head()

Unnamed: 0,id,keyword,location,text,target,clean_text,final
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...,people receive wildfire evacuation order cali...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,got sent photo ruby alaska smoke wildfire pour...


In Data Frame, df['final'] has the pre-processed text which can be used for converting words to vectors.

#### Words to vectors: 
In this notebook, I am gonna implement 

1. Bag of Words.

2. TF-IDF: Term Frequency- Inverse Document Frequnecy.

3. Word2Vec


#### Converting Word to frequency count
1. Consider all disaster and non-disaster tweets separately, count the frequencies of words and convert to dictionary.

   Created 2 variables dis_freq and ndis_freq.


2. create a function (create_vector) that sums all the frequencies of words using both dis_freq and ndis_freq dictionaries.

    suppose dis_freq={A: 1, man: 1}, ndis_freq={happy:3}. Consider a word 'A happy man' 
    The vector formed is [1+0+1, 0+3+0] = [2,3]


In [14]:
global dis_freq, ndis_freq
dis_freq=df.loc[df['target']==1, 'final'].str.split(expand=True).stack().value_counts().to_dict()
ndis_freq=df.loc[df['target']==0, 'final'].str.split(expand=True).stack().value_counts().to_dict()

In [15]:
def create_vector(tweet):
    total_dis =0
    total_ndis =0
    for word in tweet.split(' '):
        total_dis+=dis_freq[word] if word in dis_freq.keys() else 0
        total_ndis+=ndis_freq[word] if word in ndis_freq.keys() else 0 
    return [total_dis, total_ndis]


In [16]:
vector=df['final'].apply(create_vector)
vector2=df_test['final'].apply(create_vector)

In [17]:
df1 = pd.DataFrame(vector.values.tolist()).add_prefix('data')
df2 = pd.DataFrame(vector2.values.tolist()).add_prefix('data')
print(df1)

      data0  data1
0       220    217
1       392    119
2       159     72
3       366    126
4       192    208
...     ...    ...
7608    318    118
7609    567    210
7610     60     11
7611    295    181
7612    604    118

[7613 rows x 2 columns]


In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score

In [58]:
def train_model(model,X,y, test):
    X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)    
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(classification_report(y_test,y_pred))
    print('precision',precision_score(y_test,y_pred))
    print('recall',recall_score(y_test,y_pred))
    
    return model.predict(test)

In [20]:
X=df1
y=df['target']

In [21]:
lr = LogisticRegression()
y_pred=train_model(lr,X,y,df2)

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1326
           1       0.79      0.70      0.74       958

    accuracy                           0.80      2284
   macro avg       0.80      0.78      0.79      2284
weighted avg       0.80      0.80      0.79      2284



### Important Terminilogies.
1. Corpus: Corpus is considered as entire Dataset.
2. Document: It is a particular sentence or text. Consider as a rows of text in the data set.
3. Vocabulary: Total unique words in the data set.

### Bag of Words (BOW):
1. It creates a vector with length of vocabulary and arrange in a descending order.
2. Places 1 for the word in that document else 0.

Eg: Consider a corpus containing a data after preprocessing:

D1: Good girl

D2: Good boy

D3: good boy girl

Vocabulary : 3 (good boy girl)

Frequencies: good-3, boy-2, girl-2. So the vector is arranged as:

    f1(good)   f2(boy)  f3(girl)
    
D1:   1          0          1 => [1,0,1]

D2:   1          1          0 => [1,1,0]

D3:   1          1          1 => [1,1,1]

Disadvantages: 
1. No semantic meaning is caputred.

2. Creates a sparse matrix. If length of vocabulary increases, the vector sizes increases.

3. Ordering of words are changed. Since the vector forms based on high frequency to low frequency.




In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features = 2500, binary=True)
# Max-features - vector length
X = cv.fit_transform(df['final']).toarray()
X_test = cv.transform(df_test['final']).toarray()

In [23]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)

In [24]:
y_pred=train_model(lr,X,y,X_test)

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      1326
           1       0.78      0.69      0.74       958

    accuracy                           0.79      2284
   macro avg       0.79      0.78      0.78      2284
weighted avg       0.79      0.79      0.79      2284



### TF-IDF (Term Frequency - Inverse Document Frequency):
The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. 

In general, TF - captures a word and given more weightage if it occurs rarely in the corpus. IDF - Gives less preference to the word that occurs in almost in every document of the corpus.

    TF of sentence= No.of repeated words in a sentence /Total no.of words in sentence.
    IDF of word = log(No.of sentences/no. of sentences that word contain)
Vector is calculated as TF*IDF
    
Consider the above example:

        f1(good) f2(boy) f3(girl)
 good girl D1:    [1/2*log(3/3)  0/2*log(3/2)  1/2*log(3/2)]
 
good boy D2:      [1/2*log(3/3)  1/2*log(3/2)   0/2*log(3/2)]

good boy girl D3: [1/2*log(3/3)  1/3*log(3/2)   1/2*log(3/2)]

If you observe above vector, The word 'good' is present in all documents and hence the value given is 0. It is given the less preference.

Disadvantages:
Semantic meaning is not yet captured.
Sparsity is still present.

In [25]:
#TD_IDF
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X_tdidf = cv.fit_transform(df['final'])
X_tdidf_test = cv.transform(df_test['final'])

In [26]:
X_tdidf[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [60]:
y_pred=train_model(lr,X_tdidf,y,X_tdidf_test)

              precision    recall  f1-score   support

           0       0.78      0.92      0.84      1326
           1       0.85      0.64      0.73       958

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.79      2284
weighted avg       0.81      0.80      0.79      2284

precision 0.8505586592178771
recall 0.6356993736951984


#### Word2Vector:

In [28]:
# !pip install gensim
from gensim.models import Word2Vec

In [29]:
total_words = list(map(lambda sent: word_tokenize(sent),df['final']))
total_words_test = list(map(lambda sent: word_tokenize(sent),df_test['final']))

In [44]:
model=Word2Vec(total_words, window=5)

In [45]:
# model.wv.index_to_key
model.corpus_count

7613

In [46]:
model.wv.similar_by_word('street')

[('amp', 0.9991817474365234),
 ('u', 0.9991761445999146),
 ('storm', 0.9991061687469482),
 ('day', 0.9990919828414917),
 ('not', 0.9990911483764648),
 ('someone', 0.9990891814231873),
 ('say', 0.999086856842041),
 ('wind', 0.9990864992141724),
 ('via', 0.9990862011909485),
 ('\x89ûò', 0.9990851283073425)]

In [47]:
def avgword2vec(doc):
    lists=[model.wv[word] for word in doc if word in model.wv.index_to_key]
    if len(lists)>0:
        means =np.mean(lists, axis=0)
        return means
    return np.zeros(100)
    


In [48]:
final_vec=list(map(lambda doc: avgword2vec(doc),total_words))

In [49]:
final_vec_test=list(map(lambda doc: avgword2vec(doc),total_words_test))

In [50]:
X_new = np.array(final_vec)
X_new_test = np.array(final_vec_test)

In [51]:
X_new

array([[-0.2861709 ,  0.24945016, -0.02500435, ..., -0.25537044,
        -0.00063601,  0.11543377],
       [-0.28170815,  0.25551376, -0.02511886, ..., -0.24936   ,
        -0.00708965,  0.11432289],
       [-0.13443178,  0.12052635, -0.01578406, ..., -0.11884652,
        -0.00460241,  0.05447289],
       ...,
       [-0.15407756,  0.14342102, -0.01848608, ..., -0.13683493,
         0.00483308,  0.06332649],
       [-0.22255284,  0.19423985, -0.02482181, ..., -0.19720508,
        -0.00419431,  0.0855167 ],
       [-0.30226293,  0.29206857, -0.02407596, ..., -0.27249923,
        -0.01268911,  0.1314811 ]])

In [52]:
y_pred=train_model(lr,X_new,y,X_new_test)

              precision    recall  f1-score   support

           0       0.60      0.96      0.74      1326
           1       0.67      0.12      0.20       958

    accuracy                           0.61      2284
   macro avg       0.64      0.54      0.47      2284
weighted avg       0.63      0.61      0.51      2284



In [72]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
y_pred=train_model(mnb,X_tdidf,y,X_tdidf_test)

              precision    recall  f1-score   support

           0       0.78      0.92      0.84      1326
           1       0.85      0.65      0.74       958

    accuracy                           0.80      2284
   macro avg       0.82      0.78      0.79      2284
weighted avg       0.81      0.80      0.80      2284

precision 0.8485675306957708
recall 0.6492693110647182


In [77]:
from sklearn.ensemble import GradientBoostingClassifier
rf = GradientBoostingClassifier(max_depth=20, min_samples_leaf=3, min_samples_split=3)
y_pred = train_model(rf, X_tdidf,y,X_tdidf_test)

              precision    recall  f1-score   support

           0       0.77      0.89      0.82      1326
           1       0.80      0.63      0.70       958

    accuracy                           0.78      2284
   macro avg       0.78      0.76      0.76      2284
weighted avg       0.78      0.78      0.77      2284

precision 0.8024032042723631
recall 0.627348643006263


In [40]:
submission = df_test[['id']].reset_index(drop=True)
submission['target'] = y_pred

In [41]:
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [42]:
submission.to_csv('submission.csv', index=False)

In [43]:
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
