In [96]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
# Read the processed train and validation CSVs from disk.
train_csv_path = "../Data/processed/Train_clean.csv"
val_csv_path = "../Data/processed/Valid_clean.csv"

train_df_clean = pd.read_csv(train_csv_path)
val_df_clean = pd.read_csv(val_csv_path)

In [101]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the validation text.
train_text = train_df_clean['text_cleaned']
val_text = val_df_clean['text_cleaned']

tfidf_vectorizer = TfidfVectorizer()
tfidf_tr = tfidf_vectorizer.fit_transform(train_text)
tfidf_val = tfidf_vectorizer.transform(val_text)

# Left disabled: sketch for viewing the TF-IDF matrix as a dense DataFrame.
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# tfidf_df = pd.DataFrame(values.toarray(), columns = tfidf_feature_names)

In [102]:
# Training split: TF-IDF matrix as features, `label` column as target.
X_train = tfidf_tr
y_train = train_df_clean['label']
(X_train.shape, y_train.shape)

((40000, 186272), (40000,))

In [103]:
# Validation split: TF-IDF matrix as features, `label` column as target.
X_test = tfidf_val
y_test = val_df_clean['label']
(X_test.shape, y_test.shape)

((5000, 186272), (5000,))

## Model - Training

### 1- ML models

`Preprocessing`: Text -> Vectorization (default) -> TF-IDF feature extraction

**NOTE**: Vectorization is not performed explicitly, because sklearn's `TfidfVectorizer` uses its built-in word analyzer to turn the text into a term vector before encoding it. Only the default `ngram_range` of (1, 1) is used.

In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report

In [105]:
# Model definitions
LogReg_model = LogisticRegression()
RandomForestClassifier_model = RandomForestClassifier(max_depth=3, random_state=0)
DecisionTreeClassifier_model = DecisionTreeClassifier(random_state=0)
# SGD shuffles the training data between epochs; pin the seed so that re-running
# the notebook reproduces the same score, like the other seeded estimators here.
SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1', random_state=0)
KNeighborsClassifier_model = KNeighborsClassifier(n_neighbors=3)
MultinomialNB_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

# Single source of truth for (display name -> estimator). `models` and
# `model_names` are both derived from it, so the two lists always stay aligned.
_baseline_models = {
    'LogisticRegression': LogReg_model,
    'RandomForestClassifier': RandomForestClassifier_model,
    'DecisionTreeClassifier': DecisionTreeClassifier_model,
    'SGDClassifier': SGDClassifier_model,
    'KNeighborsClassifier': KNeighborsClassifier_model,
    'MultinomialNB': MultinomialNB_model,
}
model_names = list(_baseline_models)
models = list(_baseline_models.values())

In [106]:
def train_models(X_tr, X_te, y_tr, y_te):
    """Fit every estimator in the global ``models`` list and print its accuracy.

    Parameters
    ----------
    X_tr, y_tr
        Features and labels passed to each estimator's ``fit``.
    X_te, y_te
        Features passed to ``predict`` and the labels those predictions are
        scored against with ``accuracy_score``.

    For each estimator the matching entry of the global ``model_names`` list is
    printed first, then the accuracy. Nothing is returned.
    """
    for idx, estimator in enumerate(models):
        label = model_names[idx]
        print(f"Model: {label}")

        estimator.fit(X_tr, y_tr)
        predictions = estimator.predict(X_te)
        score = accuracy_score(y_te, predictions)
        print('val accuracy %s' % score)
        # print(classification_report(y_te, predictions))  # for further evaluation

        # Blank line between models
        print()

In [107]:
# Fit and score all baseline models on the TF-IDF features.
train_models(X_tr=X_train, X_te=X_test, y_tr=y_train, y_te=y_test)

Model: LogisticRegression
accuracy 0.8858

Model: RandomForestClassifier
accuracy 0.782

Model: DecisionTreeClassifier
accuracy 0.7148

Model: SGDClassifier
accuracy 0.86

Model: KNeighborsClassifier
accuracy 0.7798

Model: MultinomialNB
accuracy 0.862



### 2- DL Models - LSTM

`Preprocessing`: Text -> padded integer token sequences -> embedding layer

In [80]:
import tensorflow as tf
import keras
from keras.models import Sequential
from  tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [81]:
# Display the cleaned training frame (columns: text, label, text_cleaned).
train_df_clean

Unnamed: 0,text,label,text_cleaned
0,I grew up (b. 1965) watching and loving the Th...,0,i grew b watching loving thunderbirds all mate...
1,"When I put this movie in my DVD player, and sa...",0,when i put movie dvd player sat coke chips i e...
2,Why do people who do not know what a particula...,0,why people know particular time past like feel...
3,Even though I have great interest in Biblical ...,0,even though i great interest biblical movies i...
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dads army fan nothing ever change ...
...,...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1,western union something forgotten classic west...
39996,This movie is an incredible piece of work. It ...,1,this movie incredible piece work it explores e...
39997,My wife and I watched this movie because we pl...,0,my wife i watched movie plan visit sicily stro...
39998,"When I first watched Flatliners, I was amazed....",1,when i first watched flatliners i amazed it ne...


In [82]:
# Length of the longest training review, counted in whitespace-separated tokens.
# Count tokens, not characters: len() on the raw string gives the character
# count, which is far longer than any tokenized sequence and would make every
# padded sequence (and each LSTM step count) needlessly large.
# NOTE(review): assumes `text_cleaned` holds no punctuation that the Keras
# tokenizer would split on, so whitespace tokens match tokenizer tokens — confirm.
max_len = int(train_df_clean['text_cleaned'].str.split().str.len().max())

In [83]:
# Build the word -> integer index from the training text; the tokenizer is
# configured with '<OOV>' as its out-of-vocabulary token.
# Uses the `Tokenizer` class already imported from tensorflow.keras, so this cell
# does not depend on the standalone `keras` package exposing
# `keras.preprocessing.text`.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train_df_clean['text_cleaned'])

word_index = tokenizer.word_index
# +1 accounts for index 0, which this notebook uses as the padding value.
vocab_length = len(word_index) + 1

print('vocabulary length', vocab_length, end='\n\n')
# Print only the first few entries; dumping the whole mapping floods the output.
print(dict(list(word_index.items())[:20]))

In [86]:
# Convert the cleaned text of both splits to integer sequences with the fitted tokenizer.
train_texts = train_df_clean['text_cleaned']
val_texts = val_df_clean['text_cleaned']

train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

In [87]:
# Padding settings shared by both splits: pad and truncate at the front of each
# sequence, to length max_len, using 0 as the fill value.
pad_options = dict(
    maxlen=int(max_len),
    padding="pre",
    truncating="pre",
    value=0,
)

# NOTE: X_train / y_train / X_test / y_test are rebound here; from this point on
# they hold the padded sequences instead of the TF-IDF matrices defined earlier.
X_train = pad_sequences(sequences=train_sequences, **pad_options)
X_test = pad_sequences(sequences=val_sequences, **pad_options)

y_train = train_df_clean['label'].copy()
y_test = val_df_clean['label'].copy()

# Total vocabulary size plus one for the padding token
num_features = len(tokenizer.index_word) + 1

In [89]:
embedding_dim = 16

# Seed TensorFlow's global RNG so that weight initialisation is repeatable when
# the notebook is re-run.
tf.random.set_seed(0)

# Embedding -> single LSTM layer -> sigmoid unit for the binary label.
lstm = tf.keras.Sequential([
    # Cast to a plain int, consistent with the int(max_len) used when padding.
    tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=int(max_len)),
    tf.keras.layers.LSTM(units=64, return_sequences=False),
    # tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# opt = tf.keras.optimizers.Adam(learning_rate=0.01)
lstm.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

# Keep the History object so the per-epoch loss/accuracy can be inspected or
# plotted later, instead of letting its repr become the cell output.
history = lstm.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d3eaa538e0>

Next, work on:
- hyperparameter tuning on best models
- build other models (DL or ML)
- solve overfitting issue