# Libraries

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re


#NLTK
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

#Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

#SKLEARN, Wordcloud...
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Files 

## Dataset load

In [None]:
# Load the raw tweets CSV. The file is read without a header row, so the
# columns get default integer labels at this point.
dataset = pd.read_csv(
    "/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/Dataset/training_dataset.csv",
    header=None,
    encoding="latin",
)

# Show some examples

In [None]:
# Preview the first five raw rows.
dataset.head(n=5)

# Setting headers

In [None]:
# Human-readable labels for the six columns of the header-less CSV.
column_names = [
    "target",
    "id",
    "date",
    "query",
    "username",
    "content",
]
dataset.columns = column_names

## Result 

In [None]:
# Preview the first five rows with the new column labels.
dataset.head(n=5)

# Explore data

## Check for missing values

In [None]:
# Per-column count of missing (NaN/None) entries.
missing_values = dataset.isnull().sum(axis=0)
missing_values



```
 There are 0 missing values within this dataset
```



## Target distribution 

### Negative samples : *target = 0*

#### Display some examples

In [None]:
# Rows whose label is 0 (negative sentiment), followed by a short preview.
negative_samples = dataset.loc[dataset["target"] == 0]
negative_samples.head(n=5)

#### Show count

In [None]:
# Row count of the negative subset.
negative_samples_count = negative_samples.shape[0]
print(f"Number of negative samples : {negative_samples_count}")

### Neutral samples : *target=2*

#### Display some examples

In [None]:
# Rows whose label is 2 (neutral sentiment), followed by a short preview.
neutral_samples = dataset.loc[dataset["target"] == 2]
neutral_samples.head(n=5)

#### Show count

In [None]:
# Row count of the neutral subset.
neutral_samples_count = neutral_samples.shape[0]
print(f"Number of neutral samples : {neutral_samples_count}")



```
There are 0 neutral samples in this dataset.
```



### Positive samples : *target = 4*

#### Display some results 

In [None]:
# Rows whose label is 4 (positive sentiment), followed by a short preview.
positive_samples = dataset.loc[dataset["target"] == 4]
positive_samples.head(n=5)

#### Show count

In [None]:
# Row count of the positive subset. The message uses the same " : " separator
# as the negative/neutral count messages so the three outputs line up.
positive_samples_count = len(positive_samples)
print(f"Number of positive samples : {positive_samples_count}")

## Target distribution plot

#### Defining data

In [None]:
# Slice labels and their sizes, kept in matching order for the pie chart.
labels = ["Negative", "Neutral", "Positive"]
data = [
    negative_samples_count,
    neutral_samples_count,
    positive_samples_count,
]

#### Define color palette

In [None]:
# First three colors of seaborn's "pastel" palette, one per label.
colors = sns.color_palette("pastel")[:3]

#### Create pie chart

In [None]:
# Pie chart of the class balance; percentages are rounded to whole numbers.
plt.figure(figsize=(6, 6))
plt.pie(
    data,
    labels=labels,
    colors=colors,
    autopct="%.0f%%",
)
plt.title("Target distribution")
plt.show()

# Preparing data 

## Drop unnecessary columns

In [None]:
# Remove the four metadata columns in place; "target" and "content" remain.
dataset.drop(columns=["id", "date", "query", "username"], inplace=True)

### Result

In [None]:
# Preview the frame after dropping the metadata columns.
dataset.head(n=5)

## Replace 4 by 1 (TRUE)

In [None]:
# Map label 4 to 1 so positive tweets are labelled 1; other labels are untouched.
dataset["target"] = dataset["target"].replace({4: 1})

### Result

In [None]:
# Preview the frame after relabelling.
dataset.head(n=5)

In [None]:
# Show the rows that now carry label 1.
dataset.loc[dataset["target"] == 1]

## Removing mentions, links, extra spaces from tweets

In [None]:
# Pattern of everything replaced by a space during cleaning: @mentions,
# http(s) links, and any run of characters that is not an ASCII letter or digit.
# Written as a raw string so that "\S" reaches the regex engine verbatim instead
# of relying on Python tolerating an invalid string escape.
# NOTE(review): the third alternative ("http?:\S") looks redundant with the
# second one; kept unchanged to preserve the exact pattern.
regex = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

### Apply preprocessing

In [None]:
# Lower-case each tweet, blank out everything matched by `regex`, then collapse
# the resulting runs of spaces into single spaces. Adjacent matches (e.g. a
# mention followed by a space) each become a space, so a plain .strip() would
# leave repeated spaces inside the text; split/join removes leading, trailing
# and repeated whitespace in one step.
dataset.content = dataset.content.apply(
    lambda x: " ".join(re.sub(regex, " ", str(x).lower()).split())
)

### Show results 

In [None]:
# Preview the first ten cleaned tweets.
dataset.head(n=10)

## Train/test split

In [None]:
# Hold out 10% of the rows as the test set; the seed makes the split repeatable.
train, test = train_test_split(
    dataset,
    test_size=0.1,
    random_state=44,
)



```
The dataset is large enough to proceed to a 90-10 split. 
```



### Display results 

In [None]:
# Report both split sizes in millions of rows.
train_size_millions = len(train) / 1e6
test_size_millions = len(test) / 1e6
print(f"Training set length: {train_size_millions}M examples")
print(f"Test set length: {test_size_millions}M examples")

## Tokenization

### Define the tokenizer

In [None]:
# Build the word index from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train["content"])

# One slot more than the number of known words: the embedding matrix built
# later is indexed directly by word id and needs a row for id 0 as well.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Size of the learned vocabulary and its first ten entries.
known_words = list(tokenizer.word_index)
print(f"Word index length: {len(known_words)}")
print(f"Some words: {known_words[0:10]}")

### Get the max length

In [None]:
# Cleaned tweet texts of the whole dataset (train and test rows).
content = dataset.content

In [None]:
# Longest tweet, measured in whitespace-separated tokens. Uses the built-in
# max() over all tweets instead of seeding from content[0], so it no longer
# depends on the Series having an index label 0.
# NOTE(review): `content` covers the full dataset, so test tweets influence
# the padding length as well.
max_length = max(len(tweet.split()) for tweet in content)

print(f"Maximum token length: {max_length}")

### Define sequences

In [None]:
# Convert each split's tweets into lists of word ids using the fitted tokenizer.
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(split.content) for split in (train, test)
)

### Define X_train, X_test, y_train, y_test

In [None]:
# Bring every sequence to max_length, padding at the end ("post").
X_train = pad_sequences(
    sequences_train,
    maxlen=max_length,
    padding='post',
)
X_test = pad_sequences(
    sequences_test,
    maxlen=max_length,
    padding='post',
)

# Labels as plain NumPy arrays, aligned row-for-row with the padded inputs.
y_train, y_test = train.target.values, test.target.values

print(X_train.shape)

## Word Embeddings (GloVe)

### Setup

In [None]:
# word -> embedding vector, filled when the embeddings file is parsed.
embeddings_dictionary = dict()
# Width of each embedding vector; also the width of the embedding matrix.
embedding_dim = 100
# Open with an explicit encoding so decoding does not depend on the platform
# default. The handle is consumed and closed by the parsing cell that follows.
# NOTE(review): assumes the embeddings file is UTF-8 (as GloVe releases are) — confirm.
glove_file = open(
    "/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/Dataset/word_embeddings.txt",
    encoding="utf-8",
)

### Apply word embedding

In [None]:
# Parse the embeddings file: each line is "<word> <v1> <v2> ...".
# `with` guarantees the handle is closed even if a line fails to parse.
with glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line: nothing to parse (would otherwise raise IndexError).
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the pretrained vector of the word with tokenizer id i; words
# missing from the embeddings file (and row 0) stay all-zero.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

# Define the model

## Set training parameters

In [None]:
# Upper bound on training epochs and the mini-batch size passed to fit().
num_epochs, batch_size = 50, 1000

## Create the model

### Define the embedding layer

In [None]:
# Embedding lookup initialised from the pretrained matrix and frozen
# (trainable=False), so the vectors are not updated during training.
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embeddings_matrix],
    input_length=max_length,
    trainable=False,
)

### Define early stopping as the model callback

In [None]:
# Stop when validation accuracy has not improved for 10 epochs and roll the
# model back to the weights of its best epoch.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    restore_best_weights=True,
    verbose=2,
)

### Create the model

In [None]:
# Binary sentiment classifier: frozen embeddings -> three stacked bidirectional
# LSTMs (each followed by 20% dropout) -> dense head ending in a sigmoid.
model = Sequential()
model.add(embedding_layer)

# The first two recurrent layers emit the full sequence for the next LSTM.
model.add(tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.2))

# The last recurrent layer returns only its final output.
model.add(tf.keras.layers.Bidirectional(LSTM(128)))
model.add(tf.keras.layers.Dropout(0.2))

# Classification head; the single sigmoid unit outputs a value in [0, 1].
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

## Compile the model

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# because early stopping monitors "val_accuracy".
model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(learning_rate=0.001),
)

## Model summary 

In [None]:
# Print the layer-by-layer summary of the classifier.
model.summary()

In [None]:
# Render the model graph, including the tensor shapes of each layer.
tf.keras.utils.plot_model(model, show_shapes=True)

# Train the model

In [None]:
# Train with early stopping; `history` keeps the per-epoch metrics.
# NOTE(review): the test split doubles as the validation set here, so early
# stopping and the later test metrics are based on the same data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    callbacks=[stop_early],
    verbose=1,
)

# Save the model

 ## Define text vectorization 

In [None]:
# Words known to the tokenizer, in word_index order.
vocabulary = list(tokenizer.word_index)

In [None]:
# Symbolic input of max_length token ids per example. `shape` must be a tuple:
# "(max_length)" is just a parenthesised int, "(max_length,)" is the 1-tuple.
input_shape = tf.keras.Input(shape=(max_length,))
shape = input_shape.shape
shape

In [None]:
# String -> padded id sequence layer for the exported model, built from the
# tokenizer's vocabulary. Because a static vocabulary is supplied, the layer is
# not adapt()-ed afterwards: adapting would recompute the vocabulary from the
# data (discarding the one passed in) or be rejected, depending on the Keras
# version.
# NOTE(review): TextVectorization reserves ids for padding and OOV tokens, so
# its ids may be offset from the Keras Tokenizer ids the model was trained on —
# verify that both mappings agree before relying on export_model.
# NOTE(review): training text was cleaned with the custom regex (mentions and
# links removed); this layer only lower-cases and strips punctuation — confirm
# raw inputs are pre-cleaned the same way.
vectorize_layer = TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=vocab_size+1,
    output_mode='int',
    output_sequence_length=max_length,
    vocabulary=vocabulary
)

## Define the export model

In [None]:
# End-to-end model for serving: raw string -> token ids -> trained classifier.
# `model` already ends in Dense(1, activation='sigmoid'), so no further
# activation is appended; a second sigmoid would squash the probabilities
# into roughly [0.5, 0.73].
export_model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(1,), dtype=tf.string),
    vectorize_layer,
    model,
])

## Compile

In [None]:
# Compile the export wrapper with the same loss, metric and optimizer settings
# as the classifier it wraps.
export_model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(learning_rate=0.001),
)

## Save

In [None]:
# Persist the end-to-end model under the name "TSA_model_v3".
export_model.save("TSA_model_v3")

# Plot the results

In [None]:
# One loss value is recorded per completed epoch, so its length is the number
# of epochs actually run (early stopping may end training before num_epochs).
losses_per_epoch = history.history["loss"]
number_of_epochs = len(losses_per_epoch)
print(f"Number of epochs: {number_of_epochs}")

In [None]:
# Per-epoch curves recorded by fit().
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs_range = range(number_of_epochs)

plt.figure(figsize=(8, 8))

# Left panel: training vs. validation accuracy.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend(loc='lower right')

# Right panel: training vs. validation loss.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.legend(loc='upper right')

plt.show()

In [None]:
# Highest validation accuracy reached in any epoch.
val_accuracy_curve = history.history["val_accuracy"]
best_accuracy = max(val_accuracy_curve)
print(f"Best validation accuracy : {best_accuracy}")

## Print f1-score, precision and recall

In [None]:
# Turn the sigmoid outputs into hard labels with a 0.5 cut-off (scores below
# 0.5 become 0, everything else 1) and print precision/recall/F1 per class.
probabilities = model.predict(X_test)
predictions = np.where(probabilities<0.5, 0, 1)

print(classification_report(y_test, predictions))

## Test

In [None]:
# Smoke test: run one raw sentence through the end-to-end export model.
examples = [
    "being sick can be really cheap when it hurts too much to eat real food plus your friends make you soup",
]

print("Examples: ", examples, " Type: ", type(examples))

export_model.predict(examples)

# Tune the threshold (experimental)

In [None]:
# Raw model outputs for the test inputs, reused by the threshold search below.
predictions = model.predict(X_test)

# Find the threshold that gives the best F1 score

In [None]:
# Candidate cut-offs from 0 (inclusive) up to 1 (exclusive) in steps of 0.001,
# plus the starting values for the best-threshold / best-F1 search.
thresholds = np.arange(0, 1, 0.001)
thres, f1_score = 0.1, 0.001

## Positive

In [None]:
# Scan every candidate cut-off and keep the first one that yields the highest
# F1 score for class "1". A candidate only replaces the current best when its
# score is strictly greater.
for candidate in thresholds:
    hard_labels = np.where(predictions>candidate, 1, 0)
    metrics_by_class = classification_report(y_test, hard_labels, output_dict=True)
    candidate_f1 = metrics_by_class["1"]["f1-score"]
    if not candidate_f1 > f1_score:
        continue
    f1_score, thres = candidate_f1, candidate

print(f"Best threshold : {thres} \nBest f1_score: {f1_score}")

## Negative

In [None]:
# Same scan as for the positive class, but optimising the F1 score of class "0".
thres_neg, f1_score_neg = 0.1, 0.001
for candidate in thresholds:
    hard_labels = np.where(predictions>candidate, 1, 0)
    metrics_by_class = classification_report(y_test, hard_labels, output_dict=True)
    candidate_f1 = metrics_by_class["0"]["f1-score"]
    if not candidate_f1 > f1_score_neg:
        continue
    f1_score_neg, thres_neg = candidate_f1, candidate

print(f"Best threshold : {thres_neg} \nBest f1_score: {f1_score_neg}")

In [None]:
# Second smoke test of the export model with a raw, uncleaned sentence.
examples = [
    "this is a very good day, don't you think so ?",
]

print("Examples: ", examples, " Type: ", type(examples))

export_model.predict(examples)