# **Gensim Encoding**

Perform Gensim encoding on the given dataset and train several ML models on the resulting vectors.


Gensim is the most used Machine Learning library to download and manage text encoders. All text encoders you are going to use are pre-trained, meaning that they are neural networks that have already been tuned on gigabytes of data by renowned experts. It is very common to use pre-trained models in NLP. Pre-trained Gensim models such as word2vec-google-news-300 generate vectors of size 300; note that the code below instead trains its own Word2Vec model on the captions with a vector size of 100.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled caption CSV and preview its first rows.
df = pd.read_csv("/content/drive/MyDrive/nlp/spacy_preprocessed_labeledtext.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,File Name,Caption,LABEL
0,0,1.txt,feel today legday jelly ache gym,negative
1,1,10.txt,absolute disgrace carriage Bangor half way sta...,negative
2,2,100.txt,Valentine 1 nephew elated little thing big goo...,positive
3,3,1000.txt,betterfeelingfilm RT Instagram day film powerl...,neutral
4,4,1001.txt,Zoe love rattle,positive


In [None]:
# Integer-encode the sentiment labels so the classifiers can consume them.
label_to_id = {'neutral': 0, 'positive': 1, 'negative': 2}
df['label_num'] = df['LABEL'].map(label_to_id)

In [None]:
# Summarise column dtypes and non-null counts (reveals missing captions).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4869 entries, 0 to 4868
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4869 non-null   int64 
 1   File Name   4869 non-null   object
 2   Caption     4770 non-null   object
 3   LABEL       4869 non-null   object
 4   label_num   4869 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 190.3+ KB


In [None]:
# Replace missing captions with an empty string before casting to str.
# A bare astype(str) would turn NaN into the literal text "nan", which would
# later be tokenized and learned as if it were a real word.
df['Caption'] = df['Caption'].fillna('').astype(str)

In [None]:
# Display the frame (the notebook renders the cell's last expression).
df

Unnamed: 0.1,Unnamed: 0,File Name,Caption,LABEL,label_num
0,0,1.txt,feel today legday jelly ache gym,negative,2
1,1,10.txt,absolute disgrace carriage Bangor half way sta...,negative,2
2,2,100.txt,Valentine 1 nephew elated little thing big goo...,positive,1
3,3,1000.txt,betterfeelingfilm RT Instagram day film powerl...,neutral,0
4,4,1001.txt,Zoe love rattle,positive,1
...,...,...,...,...,...
4864,4864,995.txt,OMG Eskom Man die LoadShedding powerless,positive,1
4865,4865,996.txt,Feelin love ValentinesDay care,positive,1
4866,4866,997.txt,blue eye beat,neutral,0
4867,4867,998.txt,LA CHUCHA LOUUU TE CHUPO LOS OJOS,neutral,0


In [None]:
# Train a Word2Vec model on the preprocessed text data
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import numpy as np


# Dimensionality of the word vectors learned below.
num_features = 100

# Whitespace-tokenise every caption into a list of words.
sentences = [caption.split() for caption in df.Caption.values]

# Train Word2Vec on the tokenised captions.
model = Word2Vec(sentences=sentences, vector_size=num_features)
# For each caption, convert the sequence of words into a fixed-length vector representation:
# look up each word's vector in the Word2Vec model trained above and average all the vectors

def make_feature_vec(words, model, num_features):
    """Return the mean embedding of the in-vocabulary tokens in ``words``.

    Args:
        words: Iterable of tokens belonging to one document.
        model: Object exposing ``wv.key_to_index`` (vocabulary membership)
            and ``wv.get_vector(token)``.
        num_features: Length of the returned vector; expected to equal the
            length of the vectors returned by ``get_vector``.

    Returns:
        The element-wise average of the vectors of every token found in the
        vocabulary, or an all-zero float32 vector of length ``num_features``
        when no token is known.
    """
    # Tokens missing from the vocabulary are ignored entirely.
    known_tokens = [tok for tok in words if tok in model.wv.key_to_index]

    total = np.zeros((num_features,), dtype="float32")
    for tok in known_tokens:
        total = total + model.wv.get_vector(tok)

    # Nothing to average: hand back the zero vector untouched.
    if not known_tokens:
        return total
    return total / len(known_tokens)

def get_avg_feature_vecs(reviews, model, num_features):
    """Build a feature matrix holding one averaged vector per document.

    Args:
        reviews: Sized sequence of token lists, one entry per document.
        model: Embedding model forwarded to ``make_feature_vec``.
        num_features: Number of columns of the resulting matrix.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose
        i-th row is ``make_feature_vec(reviews[i], model, num_features)``.
    """
    feature_matrix = np.zeros((len(reviews), num_features), dtype="float32")
    for row, tokens in enumerate(reviews):
        feature_matrix[row] = make_feature_vec(tokens, model, num_features)
    return feature_matrix

# Convert every caption of the dataset into a fixed-length feature vector
# (the train/test split is performed afterwards on this matrix).
data_vecs = get_avg_feature_vecs(sentences, model, num_features)

In [None]:
# Inspect the feature matrix: one row per caption, num_features columns.
data_vecs

array([[-0.15448783,  0.11440889,  0.01428595, ..., -0.09813878,
         0.02429086,  0.02707865],
       [-0.10925192,  0.07789978,  0.01502175, ..., -0.07376174,
         0.01908051,  0.01755406],
       [-0.18403938,  0.13830374,  0.01962229, ..., -0.12113741,
         0.02688454,  0.04041118],
       ...,
       [-0.14426686,  0.11323059,  0.02144067, ..., -0.09570388,
         0.02024932,  0.02946019],
       [-0.01349925,  0.01572196, -0.00301715, ..., -0.01204671,
         0.00750873,  0.00751788],
       [-0.076717  ,  0.05523541,  0.01115435, ..., -0.05374796,
         0.01068009,  0.01099399]], dtype=float32)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the captions for evaluation. A fixed random_state keeps the
# split identical across runs, so the classifiers below are always compared
# on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    data_vecs, df.label_num, test_size=0.2, random_state=0
)

In [None]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler

# Materialise the split features as stacked arrays, one row per caption.
X_train_2d, X_test_2d = np.stack(X_train), np.stack(X_test)

# Min-max scale each feature. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

# Gaussian Naive Bayes on the scaled embeddings, scored on the held-out rows.
clf = GaussianNB().fit(scaled_train_embed, y_train)
y_pred = clf.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.25      0.35       377
           1       0.39      0.48      0.43       311
           2       0.35      0.52      0.42       286

    accuracy                           0.41       974
   macro avg       0.44      0.42      0.40       974
weighted avg       0.46      0.41      0.40       974



In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the scaled embeddings. random_state is fixed (same value
# as the SVC cells) so the forest is rebuilt identically on every run.
clf = RandomForestClassifier(random_state=0)
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

# Score the forest on the held-out embeddings.
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.49      0.51       377
           1       0.45      0.62      0.52       311
           2       0.48      0.35      0.40       286

    accuracy                           0.49       974
   macro avg       0.49      0.49      0.48       974
weighted avg       0.50      0.49      0.48       974



In [None]:
from sklearn.svm import SVC
# Support-vector classifier with a polynomial kernel, fitted on the scaled
# training embeddings and scored on the held-out ones.
classifier = SVC(kernel='poly', random_state=0).fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.35      0.42       377
           1       0.42      0.72      0.53       311
           2       0.40      0.27      0.32       286

    accuracy                           0.44       974
   macro avg       0.45      0.45      0.42       974
weighted avg       0.46      0.44      0.43       974



In [None]:
# Support-vector classifier with a linear kernel, fitted on the scaled
# training embeddings and scored on the held-out ones.
classifier = SVC(kernel='linear', random_state=0).fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.52      0.48       377
           1       0.41      0.61      0.49       311
           2       0.46      0.11      0.18       286

    accuracy                           0.43       974
   macro avg       0.44      0.42      0.38       974
weighted avg       0.44      0.43      0.40       974



In [None]:
# Support-vector classifier with an RBF kernel, fitted on the scaled
# training embeddings and scored on the held-out ones.
classifier = SVC(kernel='rbf', random_state=0).fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.40      0.44       377
           1       0.36      0.73      0.48       311
           2       0.53      0.08      0.15       286

    accuracy                           0.41       974
   macro avg       0.46      0.40      0.36       974
weighted avg       0.46      0.41      0.37       974



In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 7 closest training rows.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(scaled_train_embed, y_train)

In [None]:
# Score the fitted KNN model on the held-out embeddings.
y_pred = knn.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.46      0.48       377
           1       0.41      0.58      0.48       311
           2       0.39      0.27      0.32       286

    accuracy                           0.44       974
   macro avg       0.44      0.44      0.43       974
weighted avg       0.44      0.44      0.43       974



**Observations**:



*   Gensim encoding gave lower accuracy than BOW and TF-IDF.

*   Random forest classifier gave the maximum accuracy of 0.49.
*   Support vector classifier with linear kernel gave an accuracy of 0.43.
*   Accuracy with Naive Bayes is 0.41, with SVM poly kernel is 0.44, with SVM rbf kernel is 0.41, and with KNN is 0.44.