In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import nltk    
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer 
import re
import string
stopwords= stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Ujjval
[nltk_data]     Priyadarshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
import tensorflow as tf
import keras

Using TensorFlow backend.


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from nltk.tokenize import word_tokenize

In [6]:
dt= pd.read_csv('train.csv')

In [7]:
dt.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


<h1><b> Data Cleaning </b></h1>
Here we are gonna clean the DF. Specifically, we clean:

stopwords (Kept cause removing them cause drop of performances)
<li>URL</li>
<li>HTML</li>
<li>emoji</li>
<li>punctuation</li>

In [9]:
def clean_df(df):
    def remove_stopwords(text):
        if text is not None:
            tokens = [x for x in word_tokenize(text) if x not in stopwords]
            return " ".join(tokens)
        else:
            return None
       
    df["text"] = df['text'].apply(lambda x : x.lower())
    
    df['text']= df['text'].apply(lambda x : remove_stopwords(x))
    
    
    def remove_URL(text):
        url = re.compile(r'https?://\S+|www\.\S+')
        return url.sub(r'',text)
    
    df['text']=df['text'].apply(lambda x : remove_URL(x))
    
    
    def remove_html(text):
        html=re.compile(r'<.*?>')
        return html.sub(r'',text)
    
    df['text']=df['text'].apply(lambda x : remove_html(x))
    
    
    def remove_emoji(text):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    df['text']=df['text'].apply(lambda x: remove_emoji(x))
    
    
    def remove_punct(text):
        table=str.maketrans('','',string.punctuation)
        return text.translate(table)

    df['text']=df['text'].apply(lambda x : remove_punct(x))
    
    
    df.text = df.text.replace('\s+', ' ', regex=True)
    return df  

In [10]:
df = clean_df(dt)

In [11]:
list_corpus = df["text"].tolist()
list_labels = df["target"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, 
                                                                                random_state=40)

In [13]:
vocab_size = 10000
embedding_dim = 16
max_length = 120
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(X_train)
padded = pad_sequences(sequences,maxlen=max_length, truncating='post')

In [14]:
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

In [15]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_____________________

In [16]:
num_epochs = 10
history = model.fit(padded, y_train, epochs=num_epochs, validation_data=(testing_padded, y_test))

Train on 6090 samples, validate on 1523 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
pred_dp= model.predict(testing_padded)

In [18]:
output = []
for val in pred_dp:
    if val > 0.5:
        output.append(1)
    else:
        output.append(0)

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [20]:
accuracy_score(output, y_test)

0.81483913328956

In [21]:
confusion_matrix(output, y_test)

array([[747, 177],
       [105, 494]], dtype=int64)

In [22]:
dt_test= pd.read_csv('test.csv')

In [23]:
dt_test = clean_df(dt_test)

In [24]:
test_corpus= dt_test['text']

In [25]:
sequences_test = tokenizer.texts_to_sequences(test_corpus)
padded_test = pad_sequences(sequences_test,maxlen=max_length, truncating='post')

In [26]:
pred= model.predict(padded_test)

In [27]:
final_output = []
for val in pred:
    if val > 0.5:
        final_output.append(1)
    else:
        final_output.append(0)

In [28]:
len(dt_test)

3263

In [30]:
s= pd.read_csv("sample_submission.csv")

submission = pd.DataFrame({
        "id": s["id"],
        "target": final_output
    })

submission.to_csv('submission_nlp.csv', index=False)  