In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

In [8]:
import pandas
disaster_tweets = pandas.read_csv("../data/train.csv")
disaster_tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
disaster_tweets.shape

(7613, 5)

# 1. Cleaning et preparation du texte

## 1.1 Tokenisation

In [16]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

table = str.maketrans('', '', string.punctuation)

disaster_tweets['text_tokenized'] = disaster_tweets['text'].apply(lambda text: str(text).lower())
disaster_tweets['text_tokenized'] = disaster_tweets['text_tokenized'].str.replace("\r", " ")
disaster_tweets['text_tokenized'] = disaster_tweets['text_tokenized'].str.replace("\n", " ")
disaster_tweets['text_tokenized'] = disaster_tweets['text_tokenized'].apply(lambda text: re.split(r'\W+', text)) 
disaster_tweets['text_tokenized'] = disaster_tweets['text_tokenized'].apply(lambda words: [w.translate(table) for w in words]) 
print(disaster_tweets.loc[0]['text_tokenized'])

['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']


## 1.2 Stop words

In [18]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words
disaster_tweets['text_tokenized_no_stopwords'] = disaster_tweets['text_tokenized'].apply(lambda x: remove_stopwords(x))
print(disaster_tweets.loc[0]['text_tokenized_no_stopwords'])

['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']


## 1.3 Lemmatization

In [None]:
nlp = spacy.load('it_core_news_sm')

def lemmatize_text(text):
    doc = nlp(" ".join(text))
    return [word.lemma_ for word in doc]

df['body_3'] = df['body_2'].apply(lambda x: lemmatize_text(x))
df['body_cleaned'] = df['body_3'].apply(lambda x: " ".join(x))
print(df.loc[0]['body_3'])

In [11]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vector2 = tfidf_vectorizer.fit_transform(disaster_tweets["text"])

tfidf_vectorizer.vocabulary_

{'our': 14003,
 'deeds': 5490,
 'are': 2192,
 'the': 18669,
 'reason': 15678,
 'of': 13681,
 'this': 18777,
 'earthquake': 6379,
 'may': 12141,
 'allah': 1852,
 'forgive': 7661,
 'us': 19774,
 'all': 1851,
 'forest': 7652,
 'fire': 7439,
 'near': 13122,
 'la': 11091,
 'ronge': 16266,
 'sask': 16611,
 'canada': 3843,
 'residents': 15940,
 'asked': 2312,
 'to': 18971,
 'shelter': 17022,
 'in': 9718,
 'place': 14612,
 'being': 2900,
 'notified': 13423,
 'by': 3698,
 'officers': 13701,
 'no': 13335,
 'other': 13987,
 'evacuation': 6909,
 'or': 13919,
 'orders': 13936,
 'expected': 7014,
 '13': 176,
 '000': 1,
 'people': 14389,
 'receive': 15699,
 'wildfires': 20607,
 'california': 3797,
 'just': 10550,
 'got': 8364,
 'sent': 16870,
 'photo': 14500,
 'from': 7823,
 'ruby': 16379,
 'alaska': 1800,
 'as': 2280,
 'smoke': 17394,
 'pours': 14821,
 'into': 9947,
 'school': 16700,
 'rockyfire': 16219,
 'update': 19722,
 'hwy': 9399,
 '20': 343,
 'closed': 4462,
 'both': 3346,
 'directions': 5819,

In [14]:
tfidf_vector2.shape

(7613, 21637)