In [51]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

In [52]:
import pandas
disaster_tweets = pandas.read_csv("../data/train.csv")
disaster_tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [53]:
disaster_tweets.shape

(7613, 5)

# 1. Cleaning et preparation du texte

## 1.1 Tokenisation

In [54]:
print(disaster_tweets.loc[2]['text'])

All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected


In [55]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

table = str.maketrans('', '', string.punctuation)

def tokenizer(df):
    df['text_tokenized'] = df['text'].apply(lambda text: str(text).lower())
    df['text_tokenized'] = df['text_tokenized'].str.replace("\r", " ")
    df['text_tokenized'] = df['text_tokenized'].str.replace("\n", " ")
    df['text_tokenized'] = df['text_tokenized'].apply(lambda text: re.split(r'\W+', text)) 
    df['text_tokenized'] = df['text_tokenized'].apply(lambda words: [w.translate(table) for w in words]) 

tokenizer(disaster_tweets)
print(disaster_tweets.loc[2]['text_tokenized'])

['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']


## 1.2 Stop words

In [56]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

disaster_tweets['text_tokenized_no_stopwords'] = disaster_tweets['text_tokenized'].apply(lambda x: remove_stopwords(x))
print(disaster_tweets.loc[2]['text_tokenized_no_stopwords'])

['residents', 'asked', 'shelter', 'place', 'notified', 'officers', 'evacuation', 'shelter', 'place', 'orders', 'expected']


## 1.3 Lemmatization

In [57]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize_text(text):
    return [wnl.lemmatize(word, pos='v') for word in text]

disaster_tweets['text_lemmatized'] = disaster_tweets['text_tokenized_no_stopwords'].apply(lambda x: lemmatize_text(x))
disaster_tweets["text_lemmatized"] = disaster_tweets["text_lemmatized"].apply(lambda x: " ".join(x))
print(disaster_tweets.loc[2]['text_lemmatized'])

residents ask shelter place notify officer evacuation shelter place order expect


In [58]:

tfidf_vectorizer = TfidfVectorizer()
tfidf_vector = tfidf_vectorizer.fit_transform(disaster_tweets["text_lemmatized"])

tfidf_vectorizer.vocabulary_

{'deeds': 4988,
 'reason': 14274,
 'earthquake': 5766,
 'may': 11018,
 'allah': 1773,
 'forgive': 6888,
 'us': 17843,
 'forest': 6882,
 'fire': 6706,
 'near': 11951,
 'la': 10065,
 'ronge': 14773,
 'sask': 15096,
 'canada': 3536,
 'residents': 14482,
 'ask': 2194,
 'shelter': 15448,
 'place': 13334,
 'notify': 12236,
 'officer': 12496,
 'evacuation': 6257,
 'order': 12722,
 'expect': 6346,
 '13': 176,
 '000': 1,
 'people': 13133,
 'receive': 14291,
 'wildfires': 18606,
 'california': 3497,
 'get': 7323,
 'send': 15306,
 'photo': 13239,
 'ruby': 14882,
 'alaska': 1724,
 'smoke': 15754,
 'pour': 13513,
 'school': 15176,
 'rockyfire': 14728,
 'update': 17797,
 'hwy': 8464,
 '20': 343,
 'close': 4082,
 'directions': 5273,
 'due': 5641,
 'lake': 10089,
 'county': 4473,
 'cafire': 3470,
 'flood': 6795,
 'disaster': 5290,
 'heavy': 8026,
 'rain': 14150,
 'cause': 3684,
 'flash': 6765,
 'streets': 16250,
 'manitou': 10873,
 'colorado': 4209,
 'spring': 16019,
 'areas': 2090,
 'top': 17176,
 'h

In [59]:
tfidf_vector.shape

(7613, 19582)

# 2. Modèle

## 2.1 Split en train et test

In [60]:
X = tfidf_vector
Y = disaster_tweets['target']

In [61]:
X.shape

(7613, 19582)

In [22]:
Y.shape

(7613,)

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.todense(), Y)

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.naive_bayes import GaussianNB

In [25]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [26]:
clf = GaussianNB().fit(X_train, y_train)

In [45]:
X_test.shape

(1904, 19582)

In [27]:
y_pred = clf.predict(X_test)
y_pred

array([1, 1, 0, ..., 1, 1, 0])

In [28]:
summarize_classification(y_test,y_pred)

Length of testing data:  1904
accuracy_count :  1171
accuracy_score :  0.6150210084033614
precision_score :  0.64790435798662
recall_score :  0.6150210084033614


In [29]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[527, 537],
       [196, 644]])

In [30]:
pred_results = pd.DataFrame({'y_test': pd.Series(y_test),
                             'y_pred': pd.Series(y_pred)})

pred_results.sample(20)

Unnamed: 0,y_test,y_pred
1303,,1.0
1095,1.0,1.0
252,,1.0
1741,1.0,1.0
6820,0.0,
1620,1.0,1.0
1805,,1.0
623,,1.0
4608,0.0,
947,,1.0


# Generating submission file

In [39]:
submission_example_df = pandas.read_csv("../data/sample_submission.csv")
submission_example_df.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [40]:
test_df = pandas.read_csv("../data/test.csv")
print(test_df.head())
print(test_df.shape)

   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan
(3263, 4)


In [62]:
def prepare_dataframe(df):
    tokenizer(df)
    df['text_prepared'] = df['text_tokenized'].apply(lambda x: remove_stopwords(x))
    df['text_lemmatized'] = df['text_prepared'].apply(lambda x: lemmatize_text(x))
    df['text_lemmatized'] = df['text_prepared'].apply(lambda x: " ".join(x))

prepare_dataframe(test_df)
tfidf_vector_sample = tfidf_vectorizer.transform(test_df["text_lemmatized"])
X_to_submit = tfidf_vector_sample.todense()
X_to_submit.shape

(3263, 19582)

In [63]:
X_to_submit.shape

(3263, 19582)

In [64]:
y_pred_sample = clf.predict(X_to_submit)
y_pred_sample

array([1, 1, 1, ..., 1, 1, 1])

In [65]:
y_pred_sample.shape

(3263,)

In [66]:
submission_sample = test_df["id"]

In [67]:
submission_sample.head

<bound method NDFrame.head of 0           0
1           2
2           3
3           9
4          11
        ...  
3258    10861
3259    10865
3260    10868
3261    10874
3262    10875
Name: id, Length: 3263, dtype: int64>

In [83]:
dataset = pd.DataFrame({'target': y_pred_sample})
dataset

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1
...,...
3258,1
3259,1
3260,1
3261,1


In [84]:
dataset["id"] = test_df["id"]

In [87]:
dataset[["id","target"]].to_csv("../data/submission_1.csv", index=False)