In [None]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from sklearn.ensemble import RandomForestClassifier
from nltk import pos_tag, word_tokenize
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt # we only need pyplot
import seaborn as sb
sb.set()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# A helper function for creating Confusion Matrix and displaying the report
def Confusion_Matrix(y_true, y_pred, labels, fold_count):
    """Plot a confusion-matrix heatmap for one fold and print its classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    labels : array-like
        Class labels, in the order used for the heatmap rows/columns and for
        the rows of the printed report.
    fold_count : int
        Fold number shown in the figure title.

    Returns
    -------
    None
        The figure is shown and the report is printed as side effects.
    """
    # Rows are true labels, columns are predicted labels, both ordered by `labels`.
    conf_mat = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(10, 5))
    sb.set(font_scale=1.2)
    sb.heatmap(conf_mat, annot=True, cmap="YlGnBu", fmt='d', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix for fold " + str(fold_count))
    plt.show()

    # Use the same `labels` as the heatmap so both views list the same classes
    # in the same order, even when a class is missing from this fold's data.
    print("Classification Report:\n\n", classification_report(y_true, y_pred, labels=labels))
    print("-"*100)

In [None]:
# Read the pre-processed tweets; the first CSV column is used as the index.
clean_twitter_data = pd.read_csv(
    "dataset/clean_twitter_data.csv",
    index_col=[0],
    encoding="utf-8",
)
clean_twitter_data.info()

In [None]:
# The previous cell already loaded this CSV and printed .info(); re-reading it
# here only repeated the disk I/O. Preview a few rows instead.
clean_twitter_data.head()

In [None]:
# Drop rows with missing values (tweets that became empty during
# pre-processing are presumably read back as NaN), then renumber the rows.
# The previous index is kept as a regular column, exactly as before.
clean_twitter_data = clean_twitter_data.dropna().reset_index()

In [None]:
# Character length of every cleaned tweet, and the length of the longest one.
text_length = clean_twitter_data["clean_tweets"].str.len()
# Series.max() is vectorised and skips NaN, unlike the builtin max(), which
# iterates element by element and gives order-dependent results with NaN.
max_len = int(text_length.max())
max_len

In [None]:
W2V_SIZE = 300     # dimensionality of each word vector
W2V_WINDOW = 2     # maximum distance between the target word and a context word,
                   # i.e. with window = 2 the context is words (n-2), (n-1), (n+1), (n+2)
W2V_EPOCH = 32     # number of passes over the corpus when training word2vec
W2V_MIN_COUNT = 2  # words that occur fewer than 2 times are left out of the vocabulary

# W2V_EPOCH was previously defined but never handed to the model; passing it
# here makes it the model's default number of training epochs.
word2vec_model = Word2Vec(
    vector_size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    epochs=W2V_EPOCH,
    workers=8,
)

In [None]:
# Tokenise every cleaned tweet with NLTK; the resulting list of token lists
# is the corpus handed to word2vec_model.build_vocab.
tokens = [word_tokenize(text) for text in clean_twitter_data.clean_tweets]
# Preview the first few tweets only -- displaying the whole corpus floods the
# cell output and bloats the saved notebook.
tokens[:5]

In [None]:
# Collect the vocabulary (words meeting min_count) from the tokenised tweets.
word2vec_model.build_vocab(tokens)

# build_vocab only allocates and randomly initialises the vectors. Without an
# explicit train() call, every embedding used downstream is untrained noise.
word2vec_model.train(
    tokens,
    total_examples=word2vec_model.corpus_count,
    epochs=W2V_EPOCH,
)

In [None]:
# Hold out 20% of the tweets as the final test set (fixed seed for repeatability).
X, x_test, Y, y_test = train_test_split(
    clean_twitter_data["clean_tweets"],
    clean_twitter_data["class"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap the training texts and labels in single-column DataFrames so the
# K-fold loop can select them by column name.
X, Y = pd.DataFrame(X), pd.DataFrame(Y)

In [None]:
# Fit a Keras tokenizer on every cleaned tweet to build its word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_twitter_data["clean_tweets"])

# One more than the number of indexed words -- presumably to leave room for
# a reserved index 0; confirm against the Keras Tokenizer documentation.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)  # vocabulary size of the dataset

In [None]:
def get_word_embedding(doc, w2v_model, embedding_dim):
    """Return one fixed-size vector representing a whole document.

    The document is split on whitespace and words that are missing from the
    model's vocabulary are discarded. The remaining words are passed to
    ``w2v_model.wv.get_mean_vector``.

    Parameters
    ----------
    doc : str
        Text of the document.
    w2v_model
        Model exposing ``wv.key_to_index`` and ``wv.get_mean_vector``.
    embedding_dim : int
        Length of the zero vector returned when no word is in the vocabulary.

    Returns
    -------
    numpy.ndarray
        The mean vector of the known words, or ``np.zeros(embedding_dim)``
        when the document has none.
    """
    vocabulary = w2v_model.wv.key_to_index
    known_words = [token for token in doc.split() if token in vocabulary]

    # Nothing to average: fall back to an all-zero embedding.
    if not known_words:
        return np.zeros(embedding_dim)

    return w2v_model.wv.get_mean_vector(known_words)
    


In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# One document embedding per held-out test tweet (scaled later, inside each fold).
X_test_embeddings = [
    get_word_embedding(tweet, word2vec_model, embedding_dim=W2V_SIZE)
    for tweet in x_test
]

In [None]:
# Per-fold scores: {fold number: {"val_accuracy": ..., "test_accuracy": ...}}
rf_history = {}
# K-Fold validation
num_classes = 3  # kept so the name stays defined; the forest itself does not need it
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
fold_count = 1

# One fitted forest per fold, filled in as the loop runs.
rf_list = [None] * k

# Fit the label encoder once on every class in the dataset. This keeps the
# label -> integer mapping identical across folds and avoids a failure when a
# fold's training split happens to miss a class.
le = LabelEncoder()
le.fit(clean_twitter_data["class"])
y_test_encoded = le.transform(y_test)

for train_index, val_index in kf.split(X):
    X_train, X_val = X["clean_tweets"].iloc[train_index], X["clean_tweets"].iloc[val_index]

    # RandomForestClassifier handles multi-class integer targets directly.
    # One-hot targets would turn this into a multi-output problem whose
    # predictions can be all-zero rows, which argmax silently maps to class 0.
    y_train = le.transform(Y["class"].iloc[train_index])
    y_val = le.transform(Y["class"].iloc[val_index])

    X_train_embeddings = [get_word_embedding(doc, word2vec_model, embedding_dim=W2V_SIZE) for doc in X_train]
    X_val_embeddings = [get_word_embedding(doc, word2vec_model, embedding_dim=W2V_SIZE) for doc in X_val]

    # Fit the scaler on this fold's training split only, then reuse it for the
    # validation and test embeddings so no statistics leak from them.
    scaler_Word2Vec = MinMaxScaler(feature_range=(0,1))
    X_train_embeddings = scaler_Word2Vec.fit_transform(X_train_embeddings)
    X_val_embeddings = scaler_Word2Vec.transform(X_val_embeddings)
    X_test_scaled = scaler_Word2Vec.transform(X_test_embeddings)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    # fit the model on the training data
    rf_model.fit(X_train_embeddings, y_train)
    rf_list[fold_count-1] = rf_model

    # Score the held-out validation fold (previously computed but never used)
    # alongside the fixed test set.
    y_val_pred = rf_model.predict(X_val_embeddings)
    y_pred = rf_model.predict(X_test_scaled)
    rf_history[fold_count] = {
        "val_accuracy": accuracy_score(y_val, y_val_pred),
        "test_accuracy": accuracy_score(y_test_encoded, y_pred),
    }
    print(f"Fold {fold_count}: {rf_history[fold_count]}")

    # Map integer predictions back to the original class labels for reporting.
    y_pred_labels = le.inverse_transform(y_pred)
    y_test_labels = le.inverse_transform(y_test_encoded)
    Confusion_Matrix(y_test_labels, y_pred_labels, np.unique(clean_twitter_data["class"]), fold_count)
    fold_count += 1