#### step1: import the proper packages to the notebook environment

In [215]:
#import packages we may need
import pandas as pd
import numpy as np
from pprint import pprint
import re

# Set seed for reproducibility
import random; random.seed(53)

#import some specific NPL packages
import nltk

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

#### step2: load the test and train dataset

In [216]:
data_train=pd.read_csv('./train.csv')
data_test=pd.read_csv('./test.csv')

#### step3: descriptive exploration of the 2 datasets

In [217]:
# function to extract the main features
def data_set_exploration(dataset):
    print(dataset.shape)
    print(dataset.columns)
    print('\n')
    print(dataset.isnull().sum())

In [4]:
data_set_exploration(data_train)

(7613, 5)
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [5]:
#there are 7613 tweets in the train database with 61 keyword missing and 2533 location missing

In [6]:
data_set_exploration(data_test)

(3263, 4)
Index(['id', 'keyword', 'location', 'text'], dtype='object')


id             0
keyword       26
location    1105
text           0
dtype: int64


In [7]:
#there are 3263 tweets in the test database with 26 keyword missing and 1103 location missing

In [8]:
#the ratio of missing data in the 2 datasets are roughly the same: good!

In [9]:
# we create a function that fill missing values with mean value
def my_fill_na_function(dataset):
    for my_column in dataset.columns:
        max_value= dataset[my_column].value_counts().index[0]
        dataset[my_column]=dataset[my_column].fillna(max_value)
        print('the max_value of column %s is %s' %(my_column,max_value))

In [10]:
my_fill_na_function(data_test)

the max_value of column id is 10235
the max_value of column keyword is deluged
the max_value of column location is New York
the max_value of column text is 11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...


In [11]:
data_test.isnull().sum()

id          0
keyword     0
location    0
text        0
dtype: int64

In [12]:
my_fill_na_function(data_train)

the max_value of column id is 2047
the max_value of column keyword is fatalities
the max_value of column location is USA
the max_value of column text is 11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...
the max_value of column target is 0


In [13]:
# we create a function that replaces the non alpha characters from the strings in the database befor next steps

def my_remove_nonalpha(dataset):
    for my_col in ['keyword','location','text']:
        my_pattern=re.compile('[^A-Za-z]+')
        dataset[my_col]=[my_pattern.sub(' ',my_text) for my_text in dataset[my_col]]

In [14]:
my_remove_nonalpha(data_train)
my_remove_nonalpha(data_test)

In [15]:
# we concatenate horizonatally the 2 databases
my_data_all=pd.concat([data_train,data_test], axis=0)
data_set_exploration(my_data_all)

(10876, 5)
Index(['id', 'keyword', 'location', 'target', 'text'], dtype='object')


id             0
keyword        0
location       0
target      3263
text           0
dtype: int64


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


#### step4: start the NLP transformation of the database content

In [16]:
# we are going to tokenize and stem the 'keyword' columns of the test and train datasets merge in my_data_all
from nltk.stem import PorterStemmer
porter=PorterStemmer()
keyword_split=[my_keyword.split() for my_keyword in my_data_all['keyword']]
new_keyword_split=pd.DataFrame(keyword_split)

my_row_max=new_keyword_split.shape[0]
my_col_max=new_keyword_split.shape[1]
my_count=0
for my_col in range(my_col_max):
    new_keyword_split.iloc[:,my_col].astype(str)
    for my_row in range(my_row_max):
        if new_keyword_split.iloc[my_row,my_col] is not None:
            try: 
                new_keyword_split.iloc[my_row,my_col]=porter.stem(new_keyword_split.iloc[my_row,my_col])
            except: 
                my_count+=1
print(my_count)

0


In [17]:
# we now have a reduced amount of keywords that have been stemmed and split into 3 columns
new_keyword_split.iloc[:,0].value_counts()

fatal       211
scream      150
suicid      150
obliter     150
emerg       150
           ... 
war          35
battl        33
threat       16
radiat       14
epicentr     13
Name: 0, Length: 159, dtype: int64

In [18]:
# we join the 3 columns in one sole column
new_keyword_split['final_kw']=new_keyword_split.iloc[:,0:2].apply(lambda x: None if x.isnull().all() else ' '.join(x.dropna()), axis=1)

# we only keep the first column
my_new_keyword=new_keyword_split['final_kw']

#how many individual items?: 166
my_new_keyword.value_counts()

fatal           211
flood           150
hijack          150
drown           150
scream          150
               ... 
war zone         35
battl            33
threat           16
radiat emerg     14
epicentr         13
Name: final_kw, Length: 166, dtype: int64

In [19]:
# we are going to tokenize and stem the 'text' columns of the test and train datasets merge in my_data_all
from nltk.stem import PorterStemmer
porter=PorterStemmer()
text_split=[my_text.split() for my_text in my_data_all['text']]
new_text_split=pd.DataFrame(text_split)

my_row_max=new_text_split.shape[0]
my_col_max=new_text_split.shape[1]
my_count=0
for my_col in range(my_col_max):
    new_text_split.iloc[:,my_col].astype(str)
    for my_row in range(my_row_max):
        if new_text_split.iloc[my_row,my_col] is not None:
            try: 
                new_text_split.iloc[my_row,my_col]=porter.stem(new_text_split.iloc[my_row,my_col])
            except: 
                my_count+=1
print(my_count)

0


In [20]:
new_text_split.shape

(10876, 33)

In [21]:
# we join all the columns in one sole column
new_text_split['final_text']=new_text_split.apply(lambda x: None if x.isnull().all() else ' '.join(x.dropna()), axis=1)

# we only keep the first column
my_new_text=new_text_split['final_text']

In [22]:
my_new_text.head()

0    our deed are the reason of thi earthquak may a...
1                 forest fire near La rong sask canada
2    all resid ask to shelter in place are be notif...
3       peopl receiv wildfir evacu order in california
4    just got sent thi photo from rubi alaska as sm...
Name: final_text, dtype: object

In [23]:
#we create the new dataframe
my_data_all['new_kw']=my_new_keyword
my_data_all['new_text']=my_new_text
my_data_all=my_data_all.drop(['keyword','text'], axis=1)

In [24]:
my_data_all.head()

Unnamed: 0,id,location,target,new_kw,new_text
0,1,USA,1.0,fatal,our deed are the reason of thi earthquak may a...
1,4,USA,1.0,fatal,forest fire near La rong sask canada
2,5,USA,1.0,fatal,all resid ask to shelter in place are be notif...
3,6,USA,1.0,fatal,peopl receiv wildfir evacu order in california
4,7,USA,1.0,fatal,just got sent thi photo from rubi alaska as sm...


In [109]:
#we save the tokenized and stemmed database
my_data_all.to_csv('./my_data_all_stemmed.csv')

In [146]:
#we load back the database
my_data_all_clean=pd.read_csv('./my_data_all_stemmed.csv', index_col=0)

In [149]:
#we create back the train and test databases
my_train_data_clean=my_data_all_clean.dropna(axis=0, subset=['target'])
my_test_data_clean=my_data_all_clean[my_data_all_clean['target'].isna()==True]

In [184]:
# Define a list of stop words
stoplist = set('for a of the and to in to be which some is at that we i who whom show via may my our might as well'.split())

# we create a function that vectorizes the text of the targetted text data (fit_transform of train, transform of test)
# and returns the concatenation of both
def text_vectorizer(train_series, test_series):
    tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, min_df=0.005, max_df=0.9)
    return pd.concat([pd.DataFrame(tfidf_vectorizer.fit_transform(train_series).toarray()),pd.DataFrame(tfidf_vectorizer.transform(test_series).toarray())],axis=0)

In [185]:
tfidf_keyword=text_vectorizer(my_train_data_clean['new_kw'], my_test_data_clean['new_kw'])
tfidf_text=text_vectorizer(my_train_data_clean['new_text'], my_test_data_clean['new_text'])

In [186]:
my_column_kw=[]
for my_kw_index in range(tfidf_keyword.shape[1]):
    my_column_kw.append('kw_'+str(my_kw_index))

In [187]:
my_column_text=[]
for my_text_index in range(tfidf_text.shape[1]):
    my_column_text.append('text_'+str(my_text_index))

In [188]:
my_column=['id','target']+my_column_kw+my_column_text+['encoded_location']


In [189]:
len(my_column)

482

In [190]:
# we remove the text, location and keyword columns from the train and test databases and replace by the tfidf columns

my_new_data_all = pd.concat([my_data_all_clean,tfidf_keyword], axis=1)
my_new_data_all = pd.concat([my_new_data_all,tfidf_text], axis=1, join='inner')


In [191]:
# we add a lebellized location column
my_new_data_all['encoded_location']=my_new_data_all['location'].astype('category').cat.codes
# my_new_data_all=my_new_data_all.drop('keyword', axis=1)

In [192]:
my_new_data_all=my_new_data_all.drop(['new_kw','location','new_text'], axis=1)

In [193]:
#we set the new columns to the my_new_data_all
my_new_data_all.columns=my_column

In [194]:
my_new_data_all['encoded_location'].value_counts()

3413    2674
2312    1214
0        140
3441      65
1910      58
        ... 
21         1
2068       1
4111       1
13         1
2043       1
Name: encoded_location, Length: 4312, dtype: int64

#### step5: we train a classifier model

In [208]:
# first we split the train database into some train and test points with split function
my_new_train_data=my_new_data_all.dropna(axis=0, subset=['target'])
my_y=my_new_train_data['target']
my_X=my_new_train_data.drop(['id','target','encoded_location'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(my_X,my_y, random_state=49,test_size=.33)

In [209]:
# then, we create a LinearSVM model
tfidf_tweet_svc = LinearSVC(penalty='l2', dual=False, max_iter=10000, tol=0.001, C=0.1).fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_svc_pred = tfidf_tweet_svc.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_svc_score = metrics.accuracy_score(y_test,tfidf_tweet_svc_pred)
print("LinearSVC Score:   %0.3f" % tfidf_tweet_svc_score)

# then, we create a NaiveBaies classifier model
tfidf_tweet_nb = MultinomialNB(alpha=1).fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_nb_pred = tfidf_tweet_nb.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_nb_score = metrics.accuracy_score(y_test,tfidf_tweet_nb_pred)
print("MultinomialNB Score:   %0.3f" % tfidf_tweet_nb_score)

# then, we create a logistic regression classifier model
tfidf_tweet_lr = LogisticRegression().fit(X_train,y_train)
# we run predict on the X_test to get predictions
tfidf_tweet_lr_pred = tfidf_tweet_lr.predict(X_test)
# we Calculate accuracy using the metrics module
tfidf_tweet_lr_score = metrics.accuracy_score(y_test,tfidf_tweet_lr_pred)
print("MultinomialNB Score:   %0.3f" % tfidf_tweet_nb_score)

LinearSVC Score:   0.774
MultinomialNB Score:   0.761
MultinomialNB Score:   0.761




In [168]:
# Linear SVC works best

#### Step6: we save the predicted results 

In [210]:
final_X_test=my_new_data_all[my_new_data_all['target'].isna()==True]
my_X=final_X_test.drop(['id','target','encoded_location'], axis=1)
final_X_test.loc[:,'target']=tfidf_tweet_svc.predict(my_X)
final_X_test=final_X_test.loc[:,('id','target')]
final_X_test['id']=final_X_test['id'].astype('int32')
final_X_test=final_X_test.set_index('id')
final_X_test['target']=final_X_test['target'].astype('int32')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [211]:
final_X_test.to_csv('./NLP_submission_JM_4.csv')

In [212]:
#why does the new workflow gives so shitty predictions on the test sample while giving excellent train/test results on the train dataset ?

#### Step7: understanding the differences in predictions on the test matrix

In [213]:
sub_2=pd.read_csv('./NLP_submission_JM_2.csv')
sub_3=pd.read_csv('./NLP_submission_JM_3.csv')
sub_4=pd.read_csv('./NLP_submission_JM_4.csv')

In [214]:
print(sub_2['target'].value_counts())
print(sub_3['target'].value_counts())
print(sub_4['target'].value_counts())

0    2104
1    1159
Name: target, dtype: int64
0    2117
1    1146
Name: target, dtype: int64
0    2172
1    1091
Name: target, dtype: int64
