# Libraries

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re


#NLTK
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

#Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

#SKLEARN, Wordcloud...
from sklearn.model_selection import train_test_split

# Files 

## Dataset load

In [2]:
# Read the raw tweet CSV. header=None makes pandas treat the first line as
# data, so the columns are positional until they are named further down.
dataset = pd.read_csv(
    "/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/Dataset/training_dataset.csv",
    header=None,
    encoding="latin",
)

# Show some examples

In [3]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

# Setting headers

In [4]:
# Names for the six positional columns, in file order.
column_names = "target id date query username content".split()
dataset.columns = column_names

## Result 

In [5]:
# Same preview, now with the column names applied (head() defaults to n=5).
dataset.head()

# Explore data

## Check for missing values

In [6]:
# Per-column count of missing entries; the bare name on the last line makes
# the resulting Series the cell's displayed output.
missing_values = dataset.isnull().sum()
missing_values



```
 There are 0 missing values within this dataset
```



## Target distribution 

### Negative samples : *target = 0*

#### Display some examples

In [7]:
# Rows whose target equals 0, followed by a five-row preview.
negative_samples = dataset.loc[dataset["target"].eq(0)]
negative_samples.head()

#### Show count

In [8]:
# Row count of the negative subset.
negative_samples_count = negative_samples.shape[0]
print(f"Number of negative samples : {negative_samples_count}")

### Neutral samples : *target=2*

#### Display some examples

In [9]:
# Rows whose target equals 2, followed by a five-row preview.
neutral_samples = dataset.loc[dataset["target"].eq(2)]
neutral_samples.head()

#### Show count

In [10]:
# Row count of the neutral subset.
neutral_samples_count = neutral_samples.shape[0]
print(f"Number of neutral samples : {neutral_samples_count}")



```
There are 0 neutral samples: the dataset only contains negative (0) and positive (4) targets.
```



### Positive samples

#### Display some results 

In [11]:
# Rows whose target equals 4, followed by a five-row preview.
positive_samples = dataset.loc[dataset["target"].eq(4)]
positive_samples.head()

#### Show count

In [12]:
# Row count of the positive subset. The message uses the same
# "<label> : <count>" layout as the negative and neutral counts.
positive_samples_count = len(positive_samples)
print(f"Number of positive samples : {positive_samples_count}")

## Target distribution plot

#### Defining data

In [13]:
# Wedge labels and their sizes for the pie chart, in matching order.
labels = ["Negative", "Neutral", "Positive"]
data = [
    negative_samples_count,
    neutral_samples_count,
    positive_samples_count,
]

#### Define color palette

In [14]:
# First three colours of seaborn's "pastel" palette, one per class label.
colors = list(sns.color_palette("pastel"))[:3]

#### Create pie chart

In [15]:
# Draw the class balance as a pie chart.
# Classes with a zero count are left out: a zero-width wedge would still get
# its label and a "0%" caption, and those overlap the neighbouring text.
present = [
    (count, label, color)
    for count, label, color in zip(data, labels, colors)
    if count > 0
]

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title("Target distribution")
if present:
    # Transpose the (count, label, color) triples back into parallel tuples.
    counts, names, palette = zip(*present)
    ax.pie(counts, labels=names, colors=palette, autopct="%.0f%%")
plt.show()

# Preparing data 

## Drop unnecessary columns

In [16]:
# Keep only "target" and "content".
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# makes the cell safe to re-run: a second run finds the columns already gone
# and simply does nothing instead of raising KeyError.
dataset = dataset.drop(
    columns=["id", "date", "query", "username"],
    errors="ignore",
)

### Result

In [17]:
# Preview the frame after dropping columns (head() defaults to n=5).
dataset.head()

## Replace target value 4 with 1 (positive)

In [18]:
# Recode target 4 as 1 so the labels are 0/1; other values are left as-is.
dataset["target"] = dataset["target"].replace({4: 1})

### Result

In [19]:
# Preview the frame after recoding the target (head() defaults to n=5).
dataset.head()

In [20]:
# Display the rows that now carry target 1.
dataset.loc[dataset["target"].eq(1)]

## Removing mentions, links, extra spaces from tweets

In [21]:
# Pattern whose matches are replaced by a space when cleaning a tweet.
# Alternatives, tried left to right at each position:
#   @\S*             a mention (or a lone "@")
#   https?:\S+       an http/https link
#   [^A-Za-z0-9@]+   any other run of non-alphanumeric characters
# "@" is excluded from the last class on purpose: otherwise the run " @" in
# "hi @user" is consumed by the catch-all before the mention alternative is
# tried, and "user" survives as an ordinary word.
# A raw string is used so "\S" is not treated as a string escape. The former
# "http?:\S" alternative was a redundant typo of the link pattern.
regex = r"@\S*|https?:\S+|[^A-Za-z0-9@]+"

## Stemming

### Define stopwords and stemmers

In [22]:
# English stop words as a set: the cleaning function tests every token for
# membership, and a set lookup avoids scanning the whole list each time.
english_stopwords = set(stopwords.words("english"))
# Snowball stemmer configured for English.
stemmer = SnowballStemmer("english")

### Define stemming function

In [23]:
def preprocess(content, stem=True):
  """Clean one tweet and return it as a space-joined string of tokens.

  The input is converted with ``str`` and lower-cased, every match of the
  module-level ``regex`` is replaced by a space, and the result is split on
  whitespace. Tokens found in ``english_stopwords`` are dropped.

  Args:
    content: Raw tweet; any object is accepted because it goes through
      ``str`` first.
    stem: When true (the default) each kept token is passed through the
      module-level ``stemmer``; when false the tokens are kept as written.
      The flag used to be ignored and every token was stemmed, so the
      default is True to keep ``preprocess(x)`` returning stemmed text.

  Returns:
    The kept tokens joined by single spaces ("" when nothing is kept).
  """
  cleaned = re.sub(regex, ' ', str(content).lower()).strip()
  kept = (token for token in cleaned.split() if token not in english_stopwords)
  if stem:
    return " ".join(stemmer.stem(token) for token in kept)
  return " ".join(kept)

### Apply stemming

In [24]:
# Clean every tweet with preprocess() using its default arguments.
dataset["content"] = dataset["content"].apply(preprocess)

### Show results 

In [25]:
# Preview the first ten cleaned rows.
dataset.head(n=10)

## Train/test split

In [26]:
# Hold out 10% of the rows; the fixed seed keeps the partition reproducible.
train, test = train_test_split(
    dataset,
    random_state=44,
    test_size=0.1,
)



```
The dataset is large enough to proceed to a 90-10 split. 
```



### Display results 

In [27]:
# Report the size of each partition in millions of rows.
for split_name, split in (("Training", train), ("Test", test)):
  print(f"{split_name} set length: {len(split)/1e6}M examples")

## Tokenization

### Define the tokenizer

In [28]:
# Build the vocabulary from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train["content"])
# word_index is 1-based in Keras (index 0 is never assigned to a word),
# hence one extra slot.
vocab_size = 1 + len(tokenizer.word_index)

### Get the max length

In [29]:
# Cleaned tweets of the whole dataset (train and test together).
content = dataset.content

In [30]:
# Longest tweet, measured in whitespace-separated tokens.
max_length = max(len(tweet.split()) for tweet in content)

print(f"Maximum token length: {max_length}")

### Define sequences

In [31]:
# Map each tweet to the list of integer ids given by the fitted tokenizer.
sequences_train = tokenizer.texts_to_sequences(train["content"])
sequences_test = tokenizer.texts_to_sequences(test["content"])

### Define X_train, X_test, y_train, y_test

In [32]:
# Labels as plain NumPy arrays.
y_train = train["target"].to_numpy()
y_test = test["target"].to_numpy()

# Pad every id sequence at the end ('post') up to max_length.
X_train = pad_sequences(sequences_train, padding='post', maxlen=max_length)
X_test = pad_sequences(sequences_test, padding='post', maxlen=max_length)

print(X_train.shape)

## Word Embeddings (GloVe)

### Setup

In [33]:
# word -> embedding vector, filled while the embeddings file is read.
embeddings_dictionary = {}
# Width of each embedding vector / of the embedding matrix.
embedding_dim = 100
# Open with an explicit encoding so decoding does not depend on the
# platform's default locale. The handle is left open on purpose: it is read
# and closed by the cell that parses the file.
glove_file = open(
    "/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/Dataset/word_embeddings.txt",
    encoding="utf-8",
)

### Apply word embedding

In [34]:
# Parse the embeddings file: each line is "<word> <v1> <v2> ...".
# `with` closes the handle even when a line fails to parse, so the file is
# never left open by an exception.
with glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line: nothing to index, and records[0] would raise.
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# One row per tokenizer id; ids whose word has no pre-trained vector keep
# their all-zero row.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

# Define the model

## Set training parameters

In [35]:
# Mini-batch size and the upper bound on training epochs used by model.fit.
batch_size = 1000
num_epochs = 50

## Create the model

### Define the embedding layer

In [36]:
# Embedding layer initialised from the pre-built matrix; trainable=False
# keeps those vectors fixed during training.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embeddings_matrix],
    input_length=max_length,
    trainable=False,
)

### Define early stopping as the model callback

In [37]:
# Stop once validation accuracy has not improved for 10 epochs and roll the
# model back to the weights of its best epoch.
stop_early = EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    restore_best_weights=True,
    verbose=2,
)

### Create the model

In [38]:
# Two stacked bidirectional LSTMs over the embeddings, each followed by
# dropout, then a dense head ending in a single sigmoid unit.
recurrent_stack = [
    tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    tf.keras.layers.Bidirectional(LSTM(128)),
    Dropout(0.2),
]
classifier_head = [
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential([embedding_layer, *recurrent_stack, *classifier_head])

## Compile the model

In [39]:
# Binary cross-entropy matches the single sigmoid output; accuracy is the
# metric tracked during training.
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

## Model summary 

In [40]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [41]:
# Render the model graph with tensor shapes shown on each layer.
tf.keras.utils.plot_model(model, show_shapes=True)

# Train the model

In [42]:
# Train with early stopping and keep the per-epoch metrics in `history`.
# NOTE(review): the arrays named *_test double as the validation data that
# drives early stopping, so no untouched hold-out remains for a final score;
# consider carving a separate validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    callbacks=[stop_early],
    verbose=1,
)

# Save the model

In [43]:
# Persist the trained model to a file in the working directory.
model.save("TSA_model_v1.h5")

# Plot the results

In [47]:
# One loss value is recorded per completed epoch, so the length of the loss
# curve is the number of epochs actually run.
number_of_epochs = len(history.history["loss"])
print("Number of epochs: {}".format(number_of_epochs))

In [49]:
# Per-epoch curves recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Number the epochs from 1 and derive their count from the curve itself, so
# the cell does not depend on a variable computed in another cell.
epochs_range = range(1, len(loss) + 1)

# Two side-by-side panels; a wide figure keeps titles and legends readable.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.set(title='Training and Validation Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax_acc.legend(loc='lower right')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.set(title='Training and Validation Loss', xlabel='Epoch', ylabel='Loss')
ax_loss.legend(loc='upper right')

fig.tight_layout()
plt.show()

In [50]:
# Highest validation accuracy reached over all recorded epochs.
best_accuracy = max(history.history["val_accuracy"])
print("Best validation accuracy : {}".format(best_accuracy))