# Imports

In [136]:
import re
import string
import numpy as np 
import random
import pandas as pd 



import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
#import plotly.express as px
#import plotly.figure_factory as ff
#from collections import Counter

#from PIL import Image
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#from tqdm import tqdm
#import os
#import spacy
#from spacy.util import compounding
#from spacy.util import minibatch

#from collections import defaultdict
#from collections import Counter

import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM, 
                          Embedding, 
                          BatchNormalization,
                          Dense, 
                          TimeDistributed, 
                          Dropout, 
                          Bidirectional,
                          Flatten, 
                          GlobalMaxPool1D)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
#from keras.optimizer import Adam

from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report,
    accuracy_score
)

In [239]:
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

# 1. Loading Data

In [138]:
df = pd.read_csv("./data/spam.csv", encoding="latin-1")

df = df.dropna(how="any", axis=1)
df.columns = ['target', 'message']

df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [139]:
df['message_len'] = df['message'].apply(lambda x: len(x.split(' ')))
df.head()

Unnamed: 0,target,message,message_len
0,ham,"Go until jurong point, crazy.. Available only ...",20
1,ham,Ok lar... Joking wif u oni...,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,ham,U dun say so early hor... U c already then say...,11
4,ham,"Nah I don't think he goes to usf, he lives aro...",13


In [140]:
max(df['message_len'])

171

# 2. EDA 

In [146]:
# agg = Aggregate using one or more operations over the specified axis.
balance_counts = df.groupby('target')['target'].agg('count').values
balance_counts

array([4825,  747], dtype=int64)

In [28]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=['ham'],
    y=[balance_counts[0]],
    name='ham',
    text=[balance_counts[0]],
    textposition='auto',
))
fig.add_trace(go.Bar(
    x=['spam'],
    y=[balance_counts[1]],
    name='spam',
    text=[balance_counts[1]],
    textposition='auto',
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()

# 3. Data Pre-processing

- Now we are going to engineering the data to make it easier for the model to clasiffy.
- This section is very important to reduce the dimensions of the problem.

### 3.1 Cleaning the text

In [30]:
# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [33]:
df['message_clean'] = df['message'].apply(clean_text)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,11,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah i dont think he goes to usf he lives aroun...


In [36]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text
    
df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry wkly comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goes usf lives around though


### 3.2 Stemming 

In [37]:
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [38]:
df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


### 3.3 All together 

In [39]:
def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    
    return text

In [40]:
df['message_clean'] = df['message_clean'].apply(preprocess_data)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say ear hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


### 3.4 Target encoding

In [41]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['target'])

df['target_encoded'] = le.transform(df['target'])
df.head()

Unnamed: 0,target,message,message_len,message_clean,target_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...,0
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...,1
3,ham,U dun say so early hor... U c already then say...,11,dun say ear hor alreadi say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though,0


# 4. Tokens visualization 

# 5. Vectorization

In [70]:
# how to define X and y (from the SMS data) for use with CountVectorizer
x = df['message_clean']
y = df['target_encoded']


print(len(x), "," ,len(y))

5572 , 5572


In [71]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

4179 4179
1393 1393


In [72]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer conta quantos tokens, de cada tipo, aparecem em cada mensagem e criam um vetor para cada mensagem.
vect = CountVectorizer()
vect.fit(x_train)

CountVectorizer()

In [73]:
# Use the trained to create a document-term matrix from train and test sets
x_train_dtm = vect.transform(x_train)
x_test_dtm = vect.transform(x_test)

print(x_train_dtm.shape)
print(x_test_dtm.shape)

(4179, 5684)
(1393, 5684)


### 5.1 Tunning CountVectorizer

1. stop_words: Stop words are just a list of words you don’t want to use as features. (stop_words=’english’ to use a built-in list. Alternatively you can set stop_words equal to some custom list)


2. ngram_range: Set the parameter ngram_range=(a,b) where a is the minimum and b is the maximum size of ngrams you want to include in your features.


3. min_df, max_df: These are the minimum and maximum document frequencies words/n-grams must have to be used as features. (If either is set to a float, that number will be interpreted as a frequency rather than a numerical limit. min_df defaults to 1 (int) and max_df defaults to 1.0 (float))


4. max_features: This parameter is pretty self-explanatory. The CountVectorizer will choose the words/features that occur most frequently to be in its’ vocabulary and drop everything else.

In [68]:
vect_tunned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=10, max_df=1.0, max_features=300)
vect_tunned.fit(x_train)

CountVectorizer(max_features=300, min_df=10, ngram_range=(1, 2),
                stop_words='english')

In [69]:
# Use the trained to create a document-term matrix from train and test sets
x_train_dtm_tunned = vect_tunned.transform(x_train)
x_test_dtm_tunned = vect_tunned.transform(x_test)

print(x_train_dtm_tunned.shape)
print(x_test_dtm_tunned.shape)

(4179, 300)
(1393, 300)


### 5.2 TF-IDF

In [75]:
# Matriz documento-termo que considera a frequencia que os termos aparecem nos documentos.
from sklearn.feature_extraction.text import TfidfTransformer

TF = TfidfTransformer()
TF.fit(x_train_dtm)

TfidfTransformer()

In [149]:
x_train_tfidf = TF.transform(x_train_dtm)
x_test_tfidf = TF.transform(x_test_dtm)

print(x_train_tfidf.shape)
print(x_test_tfidf.shape)

(4179, 5684)
(1393, 5684)


### 5.3 Word Embeddings: GloVe

In [103]:
texts = df['message_clean']
target = df['target_encoded']

In [150]:
# Tokenizer is from Keras.preprocessing.text
# Calculate the length of our vocabulary

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

6726

In [105]:
def embed(text): 
    return word_tokenizer.texts_to_sequences(text)

longest_train = max(texts, key=lambda sentence: len(word_tokenize(sentence)))
#longest_train
length_long_sentence = len(word_tokenize(longest_train))
#length_long_sentence

##### 5.3.1 Pad_sequences
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

In [153]:
# Fazendo uma matriz onde cada linha é a tokenização de um texto.
list_of_lists = embed(texts)

train_padded_sentences = pad_sequences(
    list_of_lists, 
    length_long_sentence, 
    padding='post'
)

train_padded_sentences.shape

(5572, 80)

##### 5.3.2 GloVe

- GloVe method is built on an important idea:
    - You can derive semantic relationships between words from the co-occurrence matrix.



In [123]:
embeddings_dictionary = dict()
embedding_dim = 100

# Load GloVe 100D embeddings
with open('./data/glove.6B.100d.txt', encoding="utf8") as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

#embeddings_dictionary

In [151]:
# Now we will load embedding vectors of those words that appear in the
# Glove dictionary. Others will be initialized to 0.

embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix.shape

(6726, 100)

# 6. Modeling

In [163]:
# Create a Multinomial Naive Bayes model

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

# Treinando o modelo com os textos vetorizados pelo CountVectorizer
nb.fit(x_train_tfidf, y_train)

MultinomialNB()

### 6.1 Naive Bayes DTM

In [164]:
# Make class and probability predictions
y_pred_class = nb.predict(x_test_dtm)
y_pred_prob = nb.predict_proba(x_test_dtm)[:, 1]

y_pred_class.shape

(1393,)

In [165]:
# calculate accuracy of class predictions
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

metrics.confusion_matrix(y_test, y_pred_class)

0.9748743718592965


array([[1199,    3],
       [  32,  159]], dtype=int64)

In [None]:
0.9784637473079684
array([[1191,   11],
       [  19,  172]], dtype=int64)

In [158]:
# Calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.974296765425861

### 6.2 Naive Bayes

In [159]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', MultinomialNB())])

In [160]:
# Fit the pipeline with the data
pipe.fit(x_train, y_train)

y_pred_class = pipe.predict(x_test)

print(metrics.accuracy_score(y_test, y_pred_class))

metrics.confusion_matrix(y_test, y_pred_class)

0.9597989949748744


array([[1202,    0],
       [  56,  135]], dtype=int64)

### 6.3 XGBoost

In [161]:
import xgboost as xgb

pipe = Pipeline([
    ('bow', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc',
        # colsample_bytree=0.8,
        # subsample=0.7,
        # min_child_weight=5,
    ))
])

In [162]:
# Fit the pipeline with the data
pipe.fit(x_train, y_train)

y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)

print('Train: {}'.format(metrics.accuracy_score(y_train, y_pred_train)))
print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))

metrics.confusion_matrix(y_test, y_pred_class)

Train: 0.9834888729361091
Test: 0.9662598707824839


array([[1198,    4],
       [  43,  148]], dtype=int64)

# 7. LSTM (Deep learning model)

In [166]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences, 
    target, 
    test_size=0.25
)

In [168]:
# Model from https://www.kaggle.com/mariapushkareva/nlp-disaster-tweets-with-glove-and-lstm/data

def glove_lstm():
    model = Sequential()
    
    model.add(Embedding(
        input_dim=embedding_matrix.shape[0], 
        output_dim=embedding_matrix.shape[1], 
        weights = [embedding_matrix], 
        input_length=length_long_sentence
    ))
    
    model.add(Bidirectional(LSTM(
        length_long_sentence, 
        return_sequences = True, 
        recurrent_dropout=0.2
    )))
    
    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(length_long_sentence, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(length_long_sentence, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = glove_lstm()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 100)           672600    
                                                                 
 bidirectional (Bidirectiona  (None, 80, 160)          115840    
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 160)              0         
 lMaxPooling1D)                                                  
                                                                 
 batch_normalization (BatchN  (None, 160)              640       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 160)               0         
                                                        

In [169]:
# Load the model and train!!

model = glove_lstm()

checkpoint = ModelCheckpoint(
    'model.h5', 
    monitor = 'val_loss', 
    verbose = 1, 
    save_best_only = True
)
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss', 
    factor = 0.2, 
    verbose = 1, 
    patience = 5,                        
    min_lr = 0.001
)
history = model.fit(
    X_train, 
    y_train, 
    epochs = 7,
    batch_size = 32,
    validation_data = (X_test, y_test),
    verbose = 1,
    callbacks = [reduce_lr, checkpoint]
)

Epoch 1/7
Epoch 00001: val_loss improved from inf to 0.40072, saving model to model.h5
Epoch 2/7
Epoch 00002: val_loss improved from 0.40072 to 0.16010, saving model to model.h5
Epoch 3/7
Epoch 00003: val_loss improved from 0.16010 to 0.10229, saving model to model.h5
Epoch 4/7
Epoch 00004: val_loss improved from 0.10229 to 0.09631, saving model to model.h5
Epoch 5/7
Epoch 00005: val_loss did not improve from 0.09631
Epoch 6/7
Epoch 00006: val_loss did not improve from 0.09631
Epoch 7/7
Epoch 00007: val_loss did not improve from 0.09631


In [175]:
y_preds = (model.predict(X_test) > 0.5).astype("int32")

print(metrics.accuracy_score(y_test, y_preds))
metrics.confusion_matrix(y_test, y_preds)

0.9798994974874372


array([[1195,   10],
       [  18,  170]], dtype=int64)

- Melhor modelo até o momento

# 8. NLP: Disaster Tweets

In [212]:
df = pd.read_csv("./data/train.csv", encoding="latin-1")
test_df = pd.read_csv("./data/test.csv", encoding="latin-1")

df = df.dropna(how="any", axis=1)
df['text_len'] = df['text'].apply(lambda x: len(x.split(' ')))

df.head()

Unnamed: 0,id,text,target,text_len
0,1,Our Deeds are the Reason of this #earthquake M...,1,13
1,4,Forest fire near La Ronge Sask. Canada,1,7
2,5,All residents asked to 'shelter in place' are ...,1,22
3,6,"13,000 people receive #wildfires evacuation or...",1,9
4,7,Just got sent this photo from Ruby #Alaska as ...,1,17


### 8.1 EDA

In [213]:
balance_counts = df.groupby('target')['target'].agg('count').values
balance_counts

array([4342, 3271], dtype=int64)

In [214]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=['Fake'],
    y=[balance_counts[0]],
    name='Fake',
    text=[balance_counts[0]],
    textposition='auto',
))
fig.add_trace(go.Bar(
    x=['Real disaster'],
    y=[balance_counts[1]],
    name='Real disaster',
    text=[balance_counts[1]],
    textposition='auto',
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()

### 8.2 Data preprocessing

In [215]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)

# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
        '', 
        text
    )
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    text = remove_url(text)
    text = remove_emoji(text)
    text = remove_html(text)
    
    return text

In [216]:
# Test emoji removal
remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [269]:
# Stops words

stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords




# Stemming

stemmer = nltk.SnowballStemmer("english")


def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords and Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' ') if word not in stop_words)

    return text

In [260]:
test_df['text_clean'] = test_df['text'].apply(preprocess_data)

df['text_clean'] = df['text'].apply(preprocess_data)
df.head()

Unnamed: 0,id,text,target,text_len,text_clean
0,1,Our Deeds are the Reason of this #earthquake M...,1,13,deeds reason earthquake may allah forgive us
1,4,Forest fire near La Ronge Sask. Canada,1,7,forest fire near la ronge sask canada
2,5,All residents asked to 'shelter in place' are ...,1,22,residents asked shelter place notified officer...
3,6,"13,000 people receive #wildfires evacuation or...",1,9,people receive wildfires evacuation orders ca...
4,7,Just got sent this photo from Ruby #Alaska as ...,1,17,got sent photo ruby alaska smoke wildfires pou...


### 8.3 WordCloud

In [261]:
def create_corpus_df(tweet, target):
    corpus=[]
    
    # Sem o ".str", o x é uma serie e não possui o metodos ".split"
    for x in tweet[tweet['target']==target]['text_clean'].str.split():
        for i in x:
            corpus.append(i)
            break
    return corpus

- Real disasters

In [262]:
corpus_disaster_tweets = create_corpus_df(df, 1)

dic={}
for word in corpus_disaster_tweets:
    if word in dic:
        dic[word]+=1
    else:
        dic[word]=1
        
top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]
top

[('latest', 33),
 ('watch', 30),
 ('wreckage', 27),
 ('police', 25),
 ('families', 25),
 ('suicide', 24),
 ('rt', 23),
 ('years', 23),
 ('news', 22),
 ('california', 22)]

- Fake disasters

In [263]:
corpus_fake_disaster_tweets = create_corpus_df(df, 0)

dic={}
for word in corpus_fake_disaster_tweets:
    if word in dic:
        dic[word]+=1
    else:
        dic[word]=1
        
top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]
top

[('new', 58),
 ('dont', 35),
 ('liked', 34),
 ('rt', 25),
 ('one', 20),
 ('love', 19),
 ('people', 17),
 ('hot', 17),
 ('reddit', 17),
 ('feel', 16)]

### 8.4 Modeling

In [264]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
x = df['text_clean']
y = df['target']

# Split into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

5709 5709
1904 1904


In [265]:
pipe = Pipeline([
    ('count', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='auc',
    ))
])
from sklearn import metrics

# Fit the pipeline with the data
pipe = pipe.fit(x_train, y_train)

In [266]:
from sklearn.metrics import classification_report

y_pred_class = pipe.predict(x_test)

print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))

print(metrics.confusion_matrix(y_test, y_pred_class))

print(classification_report(y_test, y_pred_class, target_names=['fake', 'disaster']))

Test: 0.7715336134453782
[[970 121]
 [314 499]]
              precision    recall  f1-score   support

        fake       0.76      0.89      0.82      1091
    disaster       0.80      0.61      0.70       813

    accuracy                           0.77      1904
   macro avg       0.78      0.75      0.76      1904
weighted avg       0.78      0.77      0.77      1904



- Resultado melhor que o anterior

### Saving and submiting

In [267]:
sample = pd.read_csv("./data/sample_submission.csv", index_col="id")
y_submit = pipe.predict(test_df['text_clean'])
sample['target'] = y_submit
sample.to_csv("third_submission.csv")

### 