# 1. Import required Dependencies

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding



# 2. Prepare Dataset

In [2]:
data = pd.read_csv("DL_Dataset_final.csv")
df = pd.DataFrame(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59176 entries, 0 to 59175
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  59176 non-null  int64 
 1   text        59176 non-null  object
 2   label       59176 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.4+ MB


In [3]:
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59176 entries, 0 to 59175
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    59176 non-null  object
 1   label   59176 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 924.8+ KB


# 4. One hot encoding using tensorflow

In [4]:
# easier to get vocabulary size for large datasets
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
BoW = vect.fit_transform(df['text'])

vocab_size = len(vect.vocabulary_)

In [5]:
# create one-hot encoded text
encoded_text = [one_hot(t, vocab_size) for t in df['text']]
encoded_text

[[17319, 38698, 25321, 26662, 24368, 17319, 25316, 10465, 11637, 35158, 38552],
 [17319, 9528, 17098, 36869, 29321, 19445, 27183, 17028],
 [19445,
  30288,
  13446,
  19025,
  39373,
  15349,
  28605,
  29321,
  19445,
  12573,
  4366,
  4823,
  14802,
  19025,
  30989,
  27183,
  30338,
  19445,
  10680,
  35219,
  16127,
  19025],
 [19445,
  8031,
  16127,
  17098,
  33427,
  35517,
  24355,
  37722,
  21708,
  26494,
  5021,
  17098,
  13367,
  38698,
  30929,
  16797,
  37536,
  7880,
  35294,
  19445,
  957],
 [19445, 13857, 38698, 28778, 16323, 40252, 35294, 19445, 3865, 14350, 37245],
 [19445,
  29995,
  25522,
  2782,
  28778,
  18292,
  28117,
  11043,
  19445,
  10151,
  25316,
  27183,
  30179],
 [19445, 31556, 16443, 35294, 12814, 908, 997, 35517, 28778, 40184, 37605],
 [19445,
  34267,
  31788,
  19445,
  11913,
  30989,
  28778,
  17965,
  16127,
  28778,
  26430,
  24552,
  13857,
  29187,
  26973,
  4142,
  485,
  37722,
  24768,
  38976,
  908,
  3419,
  19445,
  11191

In [6]:
# identify max length of a text input
max_len = 0
for s in encoded_text:
    if len(s) > max_len:
        max_len = len(s)

max_len

66

In [7]:
# pad text to match max length
padded_text = pad_sequences(encoded_text, maxlen= max_len, padding= 'post')
padded_text

array([[17319, 38698, 25321, ...,     0,     0,     0],
       [17319,  9528, 17098, ...,     0,     0,     0],
       [19445, 30288, 13446, ...,     0,     0,     0],
       ...,
       [  378, 17906, 19941, ...,     0,     0,     0],
       [ 5692,  5313, 35158, ...,     0,     0,     0],
       [14106, 12367, 15349, ...,     0,     0,     0]])

# 5. Create Model

In [8]:
embedded_vector_size = 20

model = Sequential()
model.add(Embedding(vocab_size, embedded_vector_size, input_length= max_len, name= 'embedding'))
model.add(Flatten())
model.add(Dense(1, activation= 'sigmoid'))

In [9]:
model.compile(optimizer= 'adam', loss= 'categorical_crossentropy', metrics= ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 66, 20)            806680    
                                                                 
 flatten (Flatten)           (None, 1320)              0         
                                                                 
 dense (Dense)               (None, 1)                 1321      
                                                                 
Total params: 808001 (3.08 MB)
Trainable params: 808001 (3.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
X = padded_text
Y = df['label']


In [11]:
model.fit(X, Y, epochs= 10)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x12dc68f6910>

In [12]:
loss, accuracy = model.evaluate(X, Y)



# 6. Getting Word Embeddings

In [13]:
weights = model.get_layer('embedding').get_weights()[0]
weights

array([[ 2.0052792e+01, -2.0058672e+01,  2.0022631e+01, ...,
         2.0054735e+01, -2.0001045e+01, -2.0065039e+01],
       [ 3.9281318e+00, -3.9286718e+00,  3.8625839e+00, ...,
         3.8420968e+00, -3.8876445e+00, -3.8860219e+00],
       [-1.4659084e-02,  4.5746412e-02, -3.2461420e-02, ...,
        -1.9287873e-02,  1.5542507e-03, -4.1920889e-02],
       ...,
       [ 2.2351329e+00, -2.2399757e+00,  2.2184296e+00, ...,
         2.2685623e+00, -2.2948678e+00, -2.2522943e+00],
       [-3.8568951e-02,  4.4065330e-02,  3.5681043e-02, ...,
        -1.1042941e-02,  3.2672871e-02, -4.6136893e-02],
       [ 3.2726396e-02, -1.7507516e-02, -1.5118789e-02, ...,
        -7.2188601e-03, -2.1317229e-03,  2.0327423e-02]], dtype=float32)

In [14]:
print(len(weights))

40334


# 7. Create Machine Learning Model for prediction

In [15]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [16]:
# split into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size= 0.8, random_state= 2)

In [17]:
print("X train shape : {} \nX test shape: {} \nY train shape: {} \nY test shape: {}\n".format(X_train.shape, X_test.shape, y_train.shape, y_test.shape))

X train shape : (47340, 66) 
X test shape: (11836, 66) 
Y train shape: (47340,) 
Y test shape: (11836,)



In [18]:
# Creating NAIVE BAYES MODEL for Sentiment predicting
model1 = MultinomialNB()
model1.fit(X_train, y_train)

MultinomialNB()

In [19]:
# testing model 
y_pred = model1.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.26      0.02      0.03      2718
           1       0.44      0.35      0.39      4590
           2       0.07      0.00      0.01      1078
           3       0.14      0.13      0.14      1301
           4       0.19      0.61      0.30      2149

    accuracy                           0.26     11836
   macro avg       0.22      0.22      0.17     11836
weighted avg       0.29      0.26      0.23     11836



In [20]:
# creating possible hyperparameter values for SVM GridSearchCV
grid = {
    'kernel' : ["linear", "rbf", "sigmoid"],
    'C' : [0.01, 0.1, 1, 10],
    'gamma' : [0.01, 1]
}

In [21]:
model6 = SVC() 
model6.fit(X_train, y_train)

SVC()

In [24]:
score6 = model6.score(X_test, y_test)
print("Model 6 score : ", score6)

Model 6 score :  0.38906725245015206


In [21]:
# Creating SVM CLASSIFICATION MODEL 1 for Sentiment predicting
model2 = SVC(kernel= 'linear')
model2.fit(X_train, y_train)  

In [None]:
# Creating SVM CLASSIFICATION MODEL 2 for Sentiment predicting
model3 = SVC(kernel= 'polynomial')
model3.fit(X_train, y_train)  

In [27]:
# Creating SVM CLASSIFICATION MODEL 3 for Sentiment predicting
model4 = SVC(kernel= 'rbf')
model4.fit(X_train, y_train)  

SVC()

In [28]:
score4 = model4.score(X_test, y_test)
print("Model 4 score : ", score4)

Model 4 score :  0.38906725245015206


In [29]:
# Creating SVM CLASSIFICATION MODEL 4 for Sentiment predicting (SCRAP)
model5 = SVC(kernel= 'sigmoid') 
model5.fit(X_train, y_train)  

SVC(kernel='sigmoid')

In [30]:
score5 = model5.score(X_test, y_test)
print("Model 5 score : ", score5)

Model 5 score :  0.2636025684352822


In [None]:
# check the hyperparameters tuned by the GridSearchCV
#print(tuned_model2.best_params_)