# Data Preparation

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [33]:
# Sentiment140 tweets -- download from https://www.kaggle.com/kazanova/sentiment140
# The file is read with header=None, so the columns get integer labels.
# sample(frac=1) returns every row in random order, i.e. shuffles the data.
df = pd.read_csv(
    'tweets.csv',
    header=None,
    encoding='latin-1',
).sample(frac=1)

In [34]:
# Give the integer-labelled columns (0..5) descriptive names, in file order
df = df.rename(
    columns=dict(enumerate(['target', 'id', 'date', 'query', 'username', 'content']))
)

In [35]:
# Check for nulls. DataFrame.info() prints its report itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600000 entries, 237705 to 633129
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   target    1600000 non-null  int64 
 1   id        1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   username  1600000 non-null  object
 5   content   1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 85.4+ MB
None


In [36]:
# Keep only the label and the tweet text; the other columns are not used below
df = df.drop(columns=['id', 'date', 'query', 'username'])

In [37]:
### the target column encodes unhappy as 0 and happy as 4 ###
### map 4 -> 1 so the labels become a plain binary 0/1 ###
df['target'] = df['target'].replace({4: 1})

In [38]:
import re
import nltk
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
#stop-words
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahmo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:

# NOTE(review): stop_words is built here but is not referenced anywhere else in the visible notebook
stop_words = set(nltk.corpus.stopwords.words('english'))
stemmer = SnowballStemmer('english')
# Raw string: "\S" is not a valid Python string escape, so the non-raw form only
# worked by accident and emits a warning on recent Python versions.
# The pattern itself is unchanged: mentions, http(s) links and any run of
# non-alphanumeric characters.
regex = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
# The regex above detects mentions but cannot detect URLs that start with "www.".
# The alternative below detects URLs that start with "www." but cannot detect mentions.
# regex => (http://)[^ ]*|(https://)[^ ]*|(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})

def preprocess(content, stem=False):
  """Lower-case, clean and stem one piece of text.

  The input is converted with str(), lower-cased, every match of the
  module-level `regex` is replaced by a space, and the result is split on
  whitespace. Each token is passed through the module-level `stemmer` and
  the stems are joined back with single spaces.

  Args:
    content: the text to clean (any object; str() is applied first).
    stem: accepted but not consulted by the body -- tokens are always stemmed.

  Returns:
    The cleaned, stemmed text as a single space-separated string.
  """
  cleaned = re.sub(regex, ' ', str(content).lower()).strip()
  return " ".join(stemmer.stem(word) for word in cleaned.split())


In [40]:
# Clean and stem every tweet, then preview the first seven rows
df['content'] = df['content'].apply(preprocess)
df.head(7)

Unnamed: 0,target,content
237705,0,hug take it easi
450282,0,i use to have eo manip but i threw them away w...
820399,1,yea i just need to put my dress on and stuff
1049909,1,i like those word of wisdom
646622,0,i want corona
623656,0,164 with ship he is awesom just suck i ve had ...
368004,0,not look forward to tmw at present to open my ...


# train test split

In [None]:
# Hold out 10% of the tweets for testing; random_state fixes the split itself.
# The prepared frame is named `df` -- `data` is never defined in this notebook.
train, test = train_test_split(df, test_size=0.1, random_state=44)

In [None]:
# Report the (rows, columns) shape of each split, one per line
print(
    'Train dataset shape: {}'.format(train.shape),
    'Test dataset shape: {}'.format(test.shape),
    sep='\n',
)

Train dataset shape: (1440000, 2)
Test dataset shape: (160000, 2)


# Tokenization

In [None]:
def getMaxWordLength(content):
    """Return the largest length found in an iterable of documents.

    For every document both the character length of each whitespace-separated
    token and the character length of the whole document are considered, and
    the overall maximum is returned.

    NOTE(review): because len(doc) is included, the result is effectively the
    character length of the longest document -- confirm that this is the
    intended meaning of "max word length".

    Args:
        content: iterable of strings (documents).

    Returns:
        The maximum length as an int; 0 for an empty iterable.
    """
    max_len = 0
    # iterate over the argument itself, not over a global named `data`
    for doc in content:
        longest_token = max((len(token) for token in doc.split()), default=0)
        max_len = max(max_len, longest_token, len(doc))
    return max_len

In [None]:
# Build the word -> integer index from the training tweets only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['content'])
# one more than the number of indexed words
vocab_size = 1 + len(tokenizer.word_index)
# length every sequence is padded/truncated to
max_length = 50

In [None]:
# Labels for both splits as plain arrays
y_train = train['target'].values
y_test = test['target'].values

# Tweets -> lists of integer word ids, using the tokenizer fitted above
sequences_train = tokenizer.texts_to_sequences(train['content'])
sequences_test = tokenizer.texts_to_sequences(test['content'])

# Bring every sequence to max_length, padding at the end
X_train = pad_sequences(
    sequences_train,
    maxlen=max_length,
    padding='post',
)
X_test = pad_sequences(
    sequences_test,
    maxlen=max_length,
    padding='post',
)

# Word Embedding using Glove

In [None]:
def getMaxWordLength(data):
    """Return the largest length seen across an iterable of documents.

    Considers the character length of every whitespace-separated token and of
    every whole document, and returns the maximum of all of them (0 when
    `data` is empty).
    """
    longest = 0
    for doc in data:
        # token lengths first, then the length of the full document
        candidates = [len(word) for word in doc.split()]
        candidates.append(len(doc))
        longest = max(longest, *candidates)
    return longest

In [None]:
# Map each GloVe word to its pre-trained vector.
embeddings_dictionary = dict()

### the file is large so you can download it from https://nlp.stanford.edu/projects/glove/ or search for it on google
# Open with an explicit UTF-8 encoding (the platform default, e.g. cp1252 on
# Windows, can fail or mis-decode the file) and use a context manager so the
# handle is closed even if parsing raises.
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # each line: the word followed by its vector components
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

embedding_dim = 100 # embeddings_dictionary[any Existent word].shape[0]
# One row per tokenizer index; words without a GloVe vector keep an all-zero row.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

# Model Creation

In [45]:
import numpy as np
#numpy
import matplotlib.pyplot as plt
#matplotlib
import seaborn as sns
#seaborn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
#sklearn
import tensorflow as tf
#tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
#keras

In [None]:
# Training hyper-parameters
num_epochs = 10
batch_size = 1000

# Embedding layer initialised with the GloVe matrix and kept frozen during training
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    weights=[embeddings_matrix],
    input_length=max_length,
    trainable=False,
)

In [None]:
# Two stacked bidirectional LSTMs on top of the frozen embeddings,
# followed by a small dense head with a single sigmoid output.
model = Sequential([
    embedding_layer,
    # first recurrent layer returns the full sequence for the next LSTM
    tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    tf.keras.layers.Bidirectional(LSTM(128)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Draw the layer graph, annotated with tensor shapes
utils.plot_model(model, show_shapes=True)

In [None]:
# Binary cross-entropy matches the single sigmoid output unit
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# The test split is passed as validation data, so the val_* curves describe the test set
history = model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=2,
)

# Evaluate the model

In [None]:
# Raw outputs of the final sigmoid unit for every test sequence
y_pred = model.predict(x=X_test)

In [None]:
# y_pred holds the outputs of the sigmoid unit (continuous values), while
# classification_report expects hard class labels -- threshold at 0.5 and
# flatten to a 1-D 0/1 array before scoring.
y_pred_labels = (np.asarray(y_pred) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred_labels))

In [None]:
# Training curves. Both panels live in one figure so that the overall title
# ("Accuracy and loss ...") really covers both plots; with two separate
# figures plt.suptitle only labels the figure that is current when it is called.
fig, (ax_acc, ax_loss) = plt.subplots(2, 1, figsize=(10, 10))

# History for accuracy
ax_acc.plot(history.history['accuracy'])
ax_acc.plot(history.history['val_accuracy'])
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['Train accuracy', 'Test accuracy'], loc='lower right')

# History for loss
ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['Train loss', 'Test loss'], loc='upper right')

fig.suptitle('Accuracy and loss for second model')
# keep the panel titles and axis labels from overlapping
fig.tight_layout()
plt.show()