In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

In [26]:
import pandas
disaster_tweets = pandas.read_csv("../data/train.csv")
disaster_tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [27]:
disaster_tweets.shape

(7613, 5)

# 1. Cleaning et preparation du texte

## 1.1 Tokenisation

In [28]:
print(disaster_tweets.loc[2]['text'])

All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected


In [29]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

table = str.maketrans('', '', string.punctuation)

def tokenizer(df):
    df['text_tokenized'] = df['text'].apply(lambda text: str(text).lower())
    df['text_tokenized'] = df['text_tokenized'].str.replace("\r", " ")
    df['text_tokenized'] = df['text_tokenized'].str.replace("\n", " ")
    df['text_tokenized'] = df['text_tokenized'].apply(lambda text: re.split(r'\W+', text)) 
    df['text_tokenized'] = df['text_tokenized'].apply(lambda words: [w.translate(table) for w in words]) 

tokenizer(disaster_tweets)
print(disaster_tweets.loc[2]['text_tokenized'])

['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']


## 1.2 Stop words

In [30]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

disaster_tweets['text_tokenized_no_stopwords'] = disaster_tweets['text_tokenized'].apply(lambda x: remove_stopwords(x))
print(disaster_tweets.loc[2]['text_tokenized_no_stopwords'])

['residents', 'asked', 'shelter', 'place', 'notified', 'officers', 'evacuation', 'shelter', 'place', 'orders', 'expected']


## 1.3 Lemmatization

In [31]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize_text(text):
    return [wnl.lemmatize(word, pos='v') for word in text]

disaster_tweets['text_lemmatized'] = disaster_tweets['text_tokenized_no_stopwords'].apply(lambda x: lemmatize_text(x))
print(disaster_tweets.loc[2]['text_lemmatized'])

['residents', 'ask', 'shelter', 'place', 'notify', 'officer', 'evacuation', 'shelter', 'place', 'order', 'expect']


In [32]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vector2 = tfidf_vectorizer.fit_transform(disaster_tweets["text_lemmatized"])

tfidf_vectorizer.vocabulary_

AttributeError: 'list' object has no attribute 'lower'

In [None]:
tfidf_vector2.shape

# 2. Modèle

## 2.1 Split en train et test

In [None]:
X = tfidf_vector2
Y = disaster_tweets['target']

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.todense(), Y)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.naive_bayes import GaussianNB

In [None]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [None]:
clf = GaussianNB().fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
y_pred

In [None]:
summarize_classification(y_test,y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
pred_results = pd.DataFrame({'y_test': pd.Series(y_test),
                             'y_pred': pd.Series(y_pred)})

pred_results.sample(20)

# Generating submission file

In [22]:
submission_example_df = pandas.read_csv("../data/sample_submission.csv")
submission_example_df.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [23]:
test_df = pandas.read_csv("../data/test.csv")
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [24]:
prepare_dataframe(df):
    tokenizer(df)
    df['text_prepared'] = df['text_tokenized'].apply(lambda x: remove_stopwords(x))
    df['text_lemmatized'] = df['text_prepared'].apply(lambda x: lemmatize_text(x))
    tfidf_vector_sample = tfidf_vectorizer.fit_transform(df["text_lemmatized"])
    tfidf_vectorizer.vocabulary_
    return clf.predict(X_to_submit.todense())



SyntaxError: invalid syntax (<ipython-input-24-9580793630a9>, line 1)

In [None]:
y_pred = clf.predict()
y_pred