#### step1: import the proper packages to the notebook environment

In [128]:
#import packages we may need
import pandas as pd
import numpy as np
from pprint import pprint
import re

# Set seed for reproducibility
import random; random.seed(53)

#import some specific NPL packages
import nltk

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

#### step2: load the test and train dataset

In [2]:
data_train=pd.read_csv('./train.csv')
data_test=pd.read_csv('./test.csv')

In [3]:
data_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
data_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


#### step3: descriptive exploration of the 2 datasets

In [5]:
# function to extract the main features
def data_set_exploration(dataset):
    print(dataset.shape)
    print(dataset.columns)
    print('\n')
    print(dataset.isnull().sum())

In [6]:
data_set_exploration(data_train)

(7613, 5)
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [7]:
#there are 7613 tweets in the train database with 61 keyword missing and 2533 location missing

In [8]:
data_set_exploration(data_test)

(3263, 4)
Index(['id', 'keyword', 'location', 'text'], dtype='object')


id             0
keyword       26
location    1105
text           0
dtype: int64


In [9]:
#there are 3263 tweets in the test database with 26 keyword missing and 1103 location missing

In [10]:
#the ratio of missing data in the 2 datasets are roughly the same: good!

In [11]:
# we create a function that fill missing values with most present value
def my_fill_na_function(dataset):
    for my_column in dataset.columns:
        max_value= dataset[my_column].value_counts().index[0]
        dataset[my_column]=dataset[my_column].fillna(max_value)
        print('the max_value of column %s is %s' %(my_column,max_value))

In [12]:
my_fill_na_function(data_test)

the max_value of column id is 10235
the max_value of column keyword is deluged
the max_value of column location is New York
the max_value of column text is 11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...


In [13]:
data_test.isnull().sum()

id          0
keyword     0
location    0
text        0
dtype: int64

In [14]:
my_fill_na_function(data_train)

the max_value of column id is 2047
the max_value of column keyword is fatalities
the max_value of column location is USA
the max_value of column text is 11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...
the max_value of column target is 0


In [15]:
# we create a function that replaces the non alpha characters from the strings in the database befor next steps

def my_remove_nonalpha(dataset):
    for my_col in ['keyword','location','text']:
        my_pattern=re.compile('[^A-Za-z]+')
        dataset[my_col]=[my_pattern.sub(' ',my_text) for my_text in dataset[my_col]]

In [16]:
my_remove_nonalpha(data_train)
my_remove_nonalpha(data_test)

In [17]:
# we concatenate horizonatally the 2 databases
my_data_all=pd.concat([data_train,data_test], axis=0).reset_index()
data_set_exploration(my_data_all)

(10876, 6)
Index(['index', 'id', 'keyword', 'location', 'target', 'text'], dtype='object')


index          0
id             0
keyword        0
location       0
target      3263
text           0
dtype: int64


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [18]:
my_data_all.head()

Unnamed: 0,index,id,keyword,location,target,text
0,0,1,fatalities,USA,1.0,Our Deeds are the Reason of this earthquake Ma...
1,1,4,fatalities,USA,1.0,Forest fire near La Ronge Sask Canada
2,2,5,fatalities,USA,1.0,All residents asked to shelter in place are be...
3,3,6,fatalities,USA,1.0,people receive wildfires evacuation orders in...
4,4,7,fatalities,USA,1.0,Just got sent this photo from Ruby Alaska as s...


In [19]:
my_data_all[my_data_all['target'].isna()==True].head()

Unnamed: 0,index,id,keyword,location,target,text
7613,0,0,deluged,New York,,Just happened a terrible car crash
7614,1,2,deluged,New York,,Heard about earthquake is different cities sta...
7615,2,3,deluged,New York,,there is a forest fire at spot pond geese are ...
7616,3,9,deluged,New York,,Apocalypse lighting Spokane wildfires
7617,4,11,deluged,New York,,Typhoon Soudelor kills in China and Taiwan


#### step4: start the NLP transformation of the database content

In [20]:
# we are going to tokenize and stem the 'keyword' columns of the test and train datasets merge in my_data_all
from nltk.stem import WordNetLemmatizer 
#from nltk.stem import PorterStemmer
#porter=PorterStemmer()
lemma = WordNetLemmatizer()
keyword_split=[my_keyword.split() for my_keyword in my_data_all['keyword']]
new_keyword_split=pd.DataFrame(keyword_split)

my_row_max=new_keyword_split.shape[0]
my_col_max=new_keyword_split.shape[1]
my_count=0
for my_col in range(my_col_max):
    new_keyword_split.iloc[:,my_col].astype(str)
    for my_row in range(my_row_max):
        if new_keyword_split.iloc[my_row,my_col] is not None:
            try: 
                new_keyword_split.iloc[my_row,my_col]=lemma.lemmatize(new_keyword_split.iloc[my_row,my_col])
            except: 
                my_count+=1
print(my_count)

0


In [21]:
# we now have a reduced amount of keywords that have been stemmed and split into 4 columns
new_keyword_split.iloc[:,0].value_counts()

fatality      161
emergency     150
body          150
suicide       150
building      100
             ... 
rescue         33
threat         16
inundation     14
radiation      14
epicentre      13
Name: 0, Length: 201, dtype: int64

In [22]:
new_keyword_split.shape

(10876, 3)

In [23]:
# we join the 4 columns in one sole column
new_keyword_split['final_kw']=new_keyword_split.iloc[:,0:3].apply(lambda x: None if x.isnull().all() else ' '.join(x.dropna()), axis=1)

# we only keep the first column
my_new_keyword=new_keyword_split['final_kw']

#how many individual items?: 166
my_new_keyword.value_counts()

fatality               161
weapon                 100
injury                 100
siren                  100
body bag               100
                      ... 
battle                  33
threat                  16
radiation emergency     14
inundation              14
epicentre               13
Name: final_kw, Length: 211, dtype: int64

In [24]:
# we are going to tokenize and stem the 'text' columns of the test and train datasets merge in my_data_all
from nltk.stem import WordNetLemmatizer 
#from nltk.stem import PorterStemmer
#porter=PorterStemmer()
lemma = WordNetLemmatizer()
text_split=[my_text.split() for my_text in my_data_all['text']]
new_text_split=pd.DataFrame(text_split)

my_row_max=new_text_split.shape[0]
my_col_max=new_text_split.shape[1]
my_count=0
for my_col in range(my_col_max):
    new_text_split.iloc[:,my_col].astype(str)
    for my_row in range(my_row_max):
        if new_text_split.iloc[my_row,my_col] is not None:
            try: 
                new_text_split.iloc[my_row,my_col]=lemma.lemmatize(new_text_split.iloc[my_row,my_col])
            except: 
                my_count+=1
print(my_count)

0


In [25]:
new_text_split.shape

(10876, 33)

In [26]:
# we join all the columns in one sole column
new_text_split['final_text']=new_text_split.apply(lambda x: None if x.isnull().all() else ' '.join(x.dropna()), axis=1)

# we only keep the first column
my_new_text=new_text_split['final_text']

In [27]:
my_new_text.head()

0    Our Deeds are the Reason of this earthquake Ma...
1                Forest fire near La Ronge Sask Canada
2    All resident asked to shelter in place are bei...
3    people receive wildfire evacuation order in Ca...
4    Just got sent this photo from Ruby Alaska a sm...
Name: final_text, dtype: object

In [28]:
my_new_text[7614]

'Heard about earthquake is different city stay safe everyone'

In [29]:
my_new_text.shape

(10876,)

In [32]:
#we create the new dataframe
my_data_all.loc[:,'new_kw']=my_new_keyword
my_data_all.loc[:,'new_text']=my_new_text
my_data_all=my_data_all.drop(['keyword','text'], axis=1)

In [33]:
my_data_all.head()

Unnamed: 0,index,id,location,target,new_kw,new_text
0,0,1,USA,1.0,fatality,Our Deeds are the Reason of this earthquake Ma...
1,1,4,USA,1.0,fatality,Forest fire near La Ronge Sask Canada
2,2,5,USA,1.0,fatality,All resident asked to shelter in place are bei...
3,3,6,USA,1.0,fatality,people receive wildfire evacuation order in Ca...
4,4,7,USA,1.0,fatality,Just got sent this photo from Ruby Alaska a sm...


In [34]:
my_data_all[my_data_all['target'].isna()==True].head()

Unnamed: 0,index,id,location,target,new_kw,new_text
7613,0,0,New York,,deluged,Just happened a terrible car crash
7614,1,2,New York,,deluged,Heard about earthquake is different city stay ...
7615,2,3,New York,,deluged,there is a forest fire at spot pond goose are ...
7616,3,9,New York,,deluged,Apocalypse lighting Spokane wildfire
7617,4,11,New York,,deluged,Typhoon Soudelor kill in China and Taiwan


In [35]:
#we save the tokenized and stemmed database
my_data_all.to_csv('./my_data_all_lemma.csv')

In [94]:
#we load back the database
my_data_all_clean=pd.read_csv('./my_data_all_lemma.csv', index_col=0)

In [95]:
my_data_all_clean.head()

Unnamed: 0,index,id,location,target,new_kw,new_text
0,0,1,USA,1.0,fatality,Our Deeds are the Reason of this earthquake Ma...
1,1,4,USA,1.0,fatality,Forest fire near La Ronge Sask Canada
2,2,5,USA,1.0,fatality,All resident asked to shelter in place are bei...
3,3,6,USA,1.0,fatality,people receive wildfire evacuation order in Ca...
4,4,7,USA,1.0,fatality,Just got sent this photo from Ruby Alaska a sm...


In [96]:
my_data_all_clean[my_data_all_clean['target'].isna()==True].head()

Unnamed: 0,index,id,location,target,new_kw,new_text
7613,0,0,New York,,deluged,Just happened a terrible car crash
7614,1,2,New York,,deluged,Heard about earthquake is different city stay ...
7615,2,3,New York,,deluged,there is a forest fire at spot pond goose are ...
7616,3,9,New York,,deluged,Apocalypse lighting Spokane wildfire
7617,4,11,New York,,deluged,Typhoon Soudelor kill in China and Taiwan


In [97]:
my_data_all_clean["new_text"]=[my_text.lower() for my_text in my_data_all_clean["new_text"]]

In [98]:
my_data_all_clean.head()

Unnamed: 0,index,id,location,target,new_kw,new_text
0,0,1,USA,1.0,fatality,our deeds are the reason of this earthquake ma...
1,1,4,USA,1.0,fatality,forest fire near la ronge sask canada
2,2,5,USA,1.0,fatality,all resident asked to shelter in place are bei...
3,3,6,USA,1.0,fatality,people receive wildfire evacuation order in ca...
4,4,7,USA,1.0,fatality,just got sent this photo from ruby alaska a sm...


In [184]:
# we create a function that build new features from a text string
def string_features(my_string):
    my_length = len(my_string)
    my_words = len(my_string.split())
    largest_word=len(max(my_string.split(),key=len))
    mean_word_size=round(my_length/my_words,0)
    return (my_length, my_words, largest_word,mean_word_size)

In [185]:
my_data_all_clean["my_length"]=[string_features(my_string)[0] for my_string in my_data_all_clean["new_text"]]
my_data_all_clean["word_counts"]=[string_features(my_string)[1] for my_string in my_data_all_clean["new_text"]]
my_data_all_clean["largest_word"]=[string_features(my_string)[2] for my_string in my_data_all_clean["new_text"]]
my_data_all_clean["mean_word_size"]=[string_features(my_string)[3] for my_string in my_data_all_clean["new_text"]]

In [186]:
my_data_all_clean.head()

Unnamed: 0,index,id,location,target,new_kw,new_text,my_length,word_counts,largest_word,mean_word_size
0,0,1,USA,1.0,fatality,our deeds are the reason of this earthquake ma...,67,13,10,5.0
1,1,4,USA,1.0,fatality,forest fire near la ronge sask canada,37,7,6,5.0
2,2,5,USA,1.0,fatality,all resident asked to shelter in place are bei...,127,22,10,6.0
3,3,6,USA,1.0,fatality,people receive wildfire evacuation order in ca...,54,7,10,8.0
4,4,7,USA,1.0,fatality,just got sent this photo from ruby alaska a sm...,83,16,8,5.0


In [187]:
#we create back the train and test databases
my_train_data_clean=my_data_all_clean.dropna(axis=0, subset=['target'])
my_test_data_clean=my_data_all_clean[my_data_all_clean['target'].isna()==True]

In [253]:
# Define a list of stop words
stoplist = set('for a of the and to in to be which some is at that we i who whom show via may my our might as well'.split())

# we create a function that vectorizes the text of the targetted text data (fit_transform of train, transform of test)
# and returns the concatenation of both
def text_vectorizer(train_series, test_series):
    tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, min_df=0.001, max_df=0.9)
    return pd.concat([pd.DataFrame(tfidf_vectorizer.fit_transform(train_series).toarray()),pd.DataFrame(tfidf_vectorizer.transform(test_series).toarray())],axis=0).reset_index()

In [254]:
#tfidf_keyword=text_vectorizer(my_train_data_clean['new_kw'], my_test_data_clean['new_kw'])
tfidf_text=text_vectorizer(my_train_data_clean['new_text'], my_test_data_clean['new_text'])

In [255]:
#my_column_kw=[]
#for my_kw_index in range(tfidf_keyword.shape[1]):
#    my_column_kw.append('kw_'+str(my_kw_index))

In [256]:
my_column_text=[]
for my_text_index in range(tfidf_text.shape[1]-1):
    my_column_text.append('text_'+str(my_text_index))

In [257]:
my_column=['id','target','my_lengths','word_counts','largest_word','mean_word_size']+my_column_text


In [258]:
len(my_column)

1817

In [259]:
# we remove the text, location and keyword columns from the train and test databases and replace by the tfidf columns

my_new_data_all = pd.concat([my_data_all_clean,tfidf_text], axis=1)
#my_new_data_all = pd.concat([my_new_data_all,tfidf_text], axis=1, join='inner')


In [260]:
my_new_data_all.iloc[7613,0:10]

index                                              0
id                                                 0
location                                    New York
target                                           NaN
new_kw                                       deluged
new_text          just happened a terrible car crash
my_length                                         34
word_counts                                        6
largest_word                                       8
mean_word_size                                     6
Name: 7613, dtype: object

In [261]:
# we add a lebellized location column
#my_new_data_all['encoded_location']=my_new_data_all['location'].astype('category').cat.codes
# my_new_data_all=my_new_data_all.drop('keyword', axis=1)

In [262]:
my_new_data_all.columns

Index([         'index',             'id',       'location',         'target',
               'new_kw',       'new_text',      'my_length',    'word_counts',
         'largest_word', 'mean_word_size',
       ...
                   1801,             1802,             1803,             1804,
                   1805,             1806,             1807,             1808,
                   1809,             1810],
      dtype='object', length=1822)

In [263]:
my_new_data_all=my_new_data_all.drop(['index','new_kw','location','new_text'], axis=1)

In [264]:
my_new_data_all.shape

(10876, 1817)

In [265]:
#we set the new columns to the my_new_data_all
my_new_data_all.columns=my_column

In [266]:
#my_new_data_all['encoded_location'].value_counts()

In [267]:
my_new_data_all.head()

Unnamed: 0,id,target,my_lengths,word_counts,largest_word,mean_word_size,text_0,text_1,text_2,text_3,...,text_1801,text_1802,text_1803,text_1804,text_1805,text_1806,text_1807,text_1808,text_1809,text_1810
0,1,1.0,67,13,10,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,1.0,37,7,6,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,1.0,127,22,10,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,1.0,54,7,10,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,1.0,83,16,8,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### step5: we train a classifier model

In [268]:
# first we split the train database into some train and test points with split function
my_new_train_data=my_new_data_all.dropna(axis=0, subset=['target'])
my_y=my_new_train_data['target']
my_X=my_new_train_data.drop(['id','target'], axis=1)
scaler = MinMaxScaler()
X_scale=scaler.fit_transform(my_X.values)
X_train, X_test, y_train, y_test = train_test_split(X_scale,my_y, random_state=51,test_size=.02)
print(X_train.shape, X_test.shape)

(7460, 1815) (153, 1815)


In [269]:
# then, we create a LinearSVM model
tfidf_tweet_svc = LinearSVC(penalty='l2', dual=False, max_iter=20000, tol=0.00001, C=0.07).fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_svc_pred = tfidf_tweet_svc.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_svc_score = metrics.accuracy_score(y_test,tfidf_tweet_svc_pred)
print("LinearSVC Score:   %0.3f" % tfidf_tweet_svc_score)



LinearSVC Score:   0.810


In [176]:
# then, we create a NaiveBaies classifier model
tfidf_tweet_nb = MultinomialNB(alpha=1).fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_nb_pred = tfidf_tweet_nb.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_nb_score = metrics.accuracy_score(y_test,tfidf_tweet_nb_pred)
print("MultinomialNB Score:   %0.3f" % tfidf_tweet_nb_score)

# then, we create a logistic regression classifier model
tfidf_tweet_lr = LogisticRegression().fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_lr_pred = tfidf_tweet_lr.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_lr_score = metrics.accuracy_score(y_test,tfidf_tweet_lr_pred)
print("Logistic regression Score:   %0.3f" % tfidf_tweet_nb_score)

#then, # then, we create a gradient boost classifier model
tfidf_tweet_gb = GradientBoostingClassifier(n_estimators=100, random_state=2018).fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_gb_pred = tfidf_tweet_gb.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_gb_score = metrics.accuracy_score(y_test,tfidf_tweet_gb_pred)
print("GradientBoost Score:   %0.3f" % tfidf_tweet_gb_score)

rf = RandomForestClassifier(n_estimators=1000, max_depth=20,min_samples_split=2,random_state=2012)
#then, # then, we create a randomforest classifier model
tfidf_tweet_rf = rf.fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_rf_pred = tfidf_tweet_rf.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_rf_score = metrics.accuracy_score(y_test,tfidf_tweet_rf_pred)
print("RandomForest Score:   %0.3f" % tfidf_tweet_rf_score)

MultinomialNB Score:   0.798
Logistic regression Score:   0.798




GradientBoost Score:   0.769
RandomForest Score:   0.774


In [120]:
# Linear SVC works best

#### Step6: we save the predicted results 

In [270]:
final_X_test=my_new_data_all[my_new_data_all['target'].isna()==True]
my_X=final_X_test.drop(['id','target'], axis=1)
my_X_scale=scaler.transform(my_X.values)
final_X_test.loc[:,'target']=tfidf_tweet_svc.predict(my_X_scale)
final_X_test=final_X_test.loc[:,('id','target')]
final_X_test['id']=final_X_test['id'].astype('int32')
final_X_test=final_X_test.set_index('id')
final_X_test['target']=final_X_test['target'].astype('int32')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [271]:
final_X_test.to_csv('./NLP_submission_JM_6.csv')

In [272]:
#why does the new workflow gives so shitty predictions on the test sample while giving excellent train/test results on the train dataset ?

#### Step7: understanding the differences in predictions on the test matrix

In [273]:
sub_2=pd.read_csv('./NLP_submission_JM_2.csv')
sub_3=pd.read_csv('./NLP_submission_JM_3.csv')
sub_4=pd.read_csv('./NLP_submission_JM_4.csv')
sub_5=pd.read_csv('./NLP_submission_JM_5.csv')
sub_6=pd.read_csv('./NLP_submission_JM_6.csv')

In [274]:
print(sub_2['target'].value_counts())
print(sub_3['target'].value_counts())
print(sub_4['target'].value_counts())
print(sub_5['target'].value_counts())
print(sub_6['target'].value_counts())

0    2104
1    1159
Name: target, dtype: int64
0    2117
1    1146
Name: target, dtype: int64
0    2172
1    1091
Name: target, dtype: int64
0    2107
1    1156
Name: target, dtype: int64
0    2076
1    1187
Name: target, dtype: int64


In [275]:
compar_vector=[1 if sub_5.iloc[i,1]==sub_6.iloc[i,1] else 0 for i in range(sub_2.shape[0])]

In [276]:
round(sum(compar_vector)/len(compar_vector)*100,1)

92.3