# **Importing Prerequisites**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve

import re
import string
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
lemma = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding

# **Loading Dataset**

In [None]:
# Locations of the competition CSV files.
train_csv_path = '../input/nlp-getting-started/train.csv'
test_csv_path = '../input/nlp-getting-started/test.csv'

# Read both splits into DataFrames.
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# **Data Cleaning/Preprocessing**

#### *Removing URLs, HTML tags, emojis and punctuation marks*

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://`` or ``www.`` link deleted.

    A link runs from its prefix up to the next whitespace character; the
    whitespace around it is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only explicit emoji/symbol code-point blocks are matched. A single wide
    range such as U+24C2..U+1F251 must not be used here: it also covers CJK,
    Hangul, box-drawing and many other non-emoji characters and would delete
    ordinary text written in those scripts.
    """
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
        '\U0001F000-\U0001F251'  # tiles, cards, enclosed alphanumerics/ideographs, flags
        '\U00002702-\U000027B0'  # dingbats
        '\U00002600-\U000026FF'  # miscellaneous symbols
        '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
        '\U0000FE00-\U0000FE0F'  # variation selectors attached to emoji
        '\U000024C2'             # circled latin capital letter M
        ']+')
    return emoji_pattern.sub('', text)


def remove_html(text):
    """Strip HTML tags and character entities from ``text``.

    Matches ``<...>`` tags (non-greedy) and entities of the form ``&name;``,
    ``&#123;`` or ``&#x1f;``; entity names and hex digits are matched in
    lower case only.
    """
    markup = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return markup.sub('', text)


def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

# Applying helper functions

# Run the cleaners in order: URLs -> emoji -> HTML -> punctuation.
# URLs go first so that punctuation stripping does not break them apart.
train['clean_text'] = (
    train['text']
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punct)
)

In [None]:
# Split every cleaned tweet into a list of word tokens with NLTK.
train['tokenized'] = [word_tokenize(tweet) for tweet in train['clean_text']]

# Preview the first rows with the new column.
train.head()

In [None]:
# Build the English stop-word set once. Rebuilding it inside the filter would
# re-read the corpus for every single word. It is fetched through
# ``nltk.corpus`` so the lookup does not depend on what the bare name
# ``stopwords`` is bound to when this cell is (re-)run.
english_stopword_set = set(nltk.corpus.stopwords.words('english'))

# Lower-case every token.
train['lower'] = train['tokenized'].apply(
    lambda tokens: [token.lower() for token in tokens])

# Drop tokens that are English stop words.
train['no_stopwords'] = train['lower'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopword_set])

In [None]:
# Collapse each token list into one space-separated string. Rows that are
# already strings are kept as they are, so re-running this cell does not split
# a joined tweet into single characters.
train['no_stopwords'] = [
    tokens if isinstance(tokens, str) else ' '.join(map(str, tokens))
    for tokens in train['no_stopwords']
]

In [None]:
# Show the first rows of the training frame, including the columns added so far.
train.head()

In [None]:
# Keep the word list under its own name. Binding it to ``stopwords`` would
# replace the corpus reader imported at the top of the notebook, and any cell
# calling ``stopwords.words(...)`` would then fail when re-run.
english_stopword_list = nltk.corpus.stopwords.words('english')

In [None]:
# Load pretrained GloVe embeddings into a {word: vector} lookup.
# Each line of the file is a word followed by its vector components.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open('../input/glove6b50dtxt/glove.6B.50d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Word count of the longest cleaned training tweet; every sequence is padded to it.
max_len_tweet = train['no_stopwords'].map(lambda tweet: len(tweet.split())).max()

# Fit the vocabulary on the training tweets and turn them into id sequences.
tok = Tokenizer()
tok.fit_on_texts(train['no_stopwords'])
encoded_tweet = tok.texts_to_sequences(train['no_stopwords'])
padded_tweet = pad_sequences(encoded_tweet, padding='post', maxlen=max_len_tweet)

# One extra row because the ids in word_index start at 1.
vocab_size = len(tok.word_index) + 1

# Row i holds the GloVe vector of the word with id i; words that have no
# GloVe entry keep an all-zero row.
tweet_embedding_matrix = np.zeros((vocab_size, 50))
for token, row in tok.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    tweet_embedding_matrix[row] = glove_vector

# **Creating our model**

In [None]:
# Embedding (initialised from the GloVe matrix, fine-tuned during training)
# -> two stacked 256-unit LSTMs, each followed by dropout and batch norm
# -> dense head ending in a single sigmoid unit.
model = Sequential([
    Embedding(
        vocab_size,
        50,
        input_length=max_len_tweet,
        weights=[tweet_embedding_matrix],
        trainable=True,
    ),
    LSTM(256, return_sequences=True),  # pass the full sequence to the next LSTM
    Dropout(0.2),
    BatchNormalization(),
    LSTM(256),
    Dropout(0.2),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer overview of the network.
model.summary()

# **Compiling our model**

In [None]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'mae'],
)

# **Training our model**

In [None]:
# Train on the padded training sequences against the `target` column.
model.fit(
    x=padded_tweet,
    y=train.target,
    batch_size=32,
    epochs=10,
)

# **Cleaning our test data**

In [None]:
# Clean the test tweets in the same order as the training tweets:
# URLs -> emoji -> HTML -> punctuation.
test['clean_text'] = (
    test['text']
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punct)
)

# Split into word tokens and lower-case them.
test['tokenized'] = test['clean_text'].apply(word_tokenize)
test['lower'] = test['tokenized'].apply(
    lambda tokens: [token.lower() for token in tokens])

# Build the stop-word set once instead of re-reading the corpus for every word.
english_stopword_set = set(nltk.corpus.stopwords.words('english'))
test['no_stopwords'] = test['lower'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopword_set])

# Re-join the remaining tokens into one space-separated string per tweet.
test['no_stopwords'] = [' '.join(map(str, tokens)) for tokens in test['no_stopwords']]

In [None]:
# Longest cleaned test tweet, kept for inspection only; it must not drive the
# padding length because the model was built for `max_len_tweet`.
max_len_test = test.no_stopwords.apply(lambda x: len(x.split())).max()

# Encode with the tokenizer fitted on the TRAINING tweets. A tokenizer fitted
# on the test set would assign different ids to the same words, so the ids
# would no longer match the embedding rows the model was trained with.
# `tok` was created without an oov_token, so words unseen in training are
# dropped from the sequences.
encoded_test = tok.texts_to_sequences(test.no_stopwords)

# Pad (or truncate) to the sequence length used for training.
padded_test = pad_sequences(encoded_test, maxlen=max_len_tweet, padding='post')

# **Making Predictions**

In [None]:
# Run the trained model on the padded test sequences.
preds = model.predict(padded_test)

In [None]:
# Turn each prediction score into a 0/1 label: strictly above 0.5 -> 1, else 0.
pred = (preds.ravel() > 0.5).astype(int).tolist()

In [None]:
# Two-column frame for the submission file: tweet id and predicted label.
submission = pd.DataFrame({
    'id': test['id'].to_list(),
    'target': pred,
})

# **Generate a CSV file of our predictions**

In [None]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv('submission.csv',index=False)