# CBOW and Skip gram encoding

Applying CBOW encoding to the dataset and perform different Machine Learning Models on the dataset.


*  The CBOW architecture includes a deep learning classification model that uses context words as input (X) to predict our target word, Y. Considering the following scenario: Have a wonderful day.


*   Let the word "excellent" be the input to the Neural Network. It's important to note that we're attempting to predict a target word (day) from a single context input word, unique. More specifically, we compare the output error of the one-hot encoding of the input word to the one-hot encoding of the target word (day)
.

*   The context words are predicted in the skip-gram model given a target (center) word. Consider the following sentence: "Word2Vec uses a deep learning model in the backend." Given the center word 'learning' and a context window size of 2, the model tries to predict ['deep,' 'model'], and so on.

In [None]:
# For Data Preprocessing
import pandas as pd

# Gensim Libraries
import gensim
from gensim.models import Word2Vec,KeyedVectors

# For visualization of word2vec model
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv("/content/drive/MyDrive/nlp/spacy_preprocessed_labeledtext.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,File Name,Caption,LABEL
0,0,1.txt,feel today legday jelly ache gym,negative
1,1,10.txt,absolute disgrace carriage Bangor half way sta...,negative
2,2,100.txt,Valentine 1 nephew elated little thing big goo...,positive
3,3,1000.txt,betterfeelingfilm RT Instagram day film powerl...,neutral
4,4,1001.txt,Zoe love rattle,positive


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df['label_num'] = df['LABEL'].map({'neutral' : 0, 'positive': 1,'negative':2})

In [None]:
df['Caption']= df['Caption'].astype(str)

# Train CBOW Word2Vec Model



In [None]:
sentences = []
for review in df.Caption.values:
    sentences.append(review.split())

In [None]:
num_features=100
model_cbow = Word2Vec(sentences, sg=0, min_count=10, workers=4, window =3, epochs = 20, vector_size=num_features)

In [None]:
import numpy as np
def make_feature_vec(words, model,num_features):
    # Function to average all of the word vectors in a given paragraph
    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in model.wv.key_to_index:
            feature_vec = np.add(feature_vec, model.wv.get_vector(word))
            nwords += 1
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

def get_avg_feature_vecs(reviews, model, num_features):
    # Function to generate vectors for all movie reviews in a dataset
    counter = 0
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        counter += 1
    return review_feature_vecs

# Convert the training and test data into fixed-length feature vectors
data_vecs = get_avg_feature_vecs(sentences, model_cbow, num_features)

In [None]:
model_cbow.wv.most_similar("good")

[('big', 0.9955771565437317),
 ('girl', 0.9953746795654297),
 ('energetic', 0.9949219822883606),
 ('week', 0.9948364496231079),
 ('guy', 0.9947118759155273),
 ('man', 0.994564950466156),
 ('today', 0.9945285320281982),
 ('friend', 0.994317352771759),
 ('get', 0.9942103624343872),
 ('tell', 0.9937557578086853)]

In [None]:
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.key_to_index:

        tokens.append(model.wv[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_vecs, df.label_num, test_size=0.2)

Training ml models

In [None]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf = GaussianNB()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.18      0.28       343
           1       0.42      0.09      0.15       350
           2       0.32      0.90      0.47       281

    accuracy                           0.36       974
   macro avg       0.46      0.39      0.30       974
weighted avg       0.46      0.36      0.29       974



In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.63      0.59       343
           1       0.63      0.62      0.63       350
           2       0.56      0.48      0.52       281

    accuracy                           0.58       974
   macro avg       0.58      0.58      0.58       974
weighted avg       0.58      0.58      0.58       974



In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel='poly', random_state=0)
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.61      0.56       343
           1       0.66      0.53      0.59       350
           2       0.49      0.51      0.50       281

    accuracy                           0.55       974
   macro avg       0.56      0.55      0.55       974
weighted avg       0.56      0.55      0.55       974



In [None]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.83      0.56       343
           1       0.67      0.46      0.54       350
           2       0.55      0.12      0.20       281

    accuracy                           0.49       974
   macro avg       0.55      0.47      0.44       974
weighted avg       0.55      0.49      0.45       974



In [None]:
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.84      0.57       343
           1       0.64      0.48      0.55       350
           2       0.61      0.07      0.13       281

    accuracy                           0.49       974
   macro avg       0.56      0.47      0.41       974
weighted avg       0.55      0.49      0.43       974



In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(scaled_train_embed, y_train)

In [None]:
y_pred = knn.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.52      0.51       343
           1       0.57      0.55      0.56       350
           2       0.41      0.41      0.41       281

    accuracy                           0.50       974
   macro avg       0.49      0.49      0.49       974
weighted avg       0.50      0.50      0.50       974



**Observations**:

*   Random forest classifier gave 0.58 accuracy.
*   Support vector classifier with linear kernel gave accuracy of 0.55 .
*   Accuracy with Naive bayes is 0.36 , with SVM poly kernel is 0.49, with SVM rbf kernel is 0.49, with knn is 0.50

# Train Skip-Gram Word2Vec Model

In [None]:
num_features=100
model_skipgram = Word2Vec(sentences, sg=1, min_count=10, workers=4, window =3, epochs = 20, vector_size=num_features)

In [None]:
model_skipgram.wv.most_similar("good")

[('friend', 0.9679284691810608),
 ('girl', 0.9649432301521301),
 ('big', 0.964033305644989),
 ('energetic', 0.9636240005493164),
 ('man', 0.9571497440338135),
 ('baby', 0.9571191668510437),
 ('awesome', 0.9563152194023132),
 ('look', 0.9561206698417664),
 ('make', 0.955333411693573),
 ('work', 0.9525425434112549)]

In [None]:
import numpy as np

In [None]:
import numpy as np
def make_feature_vec(words, model,num_features):
    # Function to average all of the word vectors in a given paragraph
    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in model.wv.key_to_index:
            feature_vec = np.add(feature_vec, model.wv.get_vector(word))
            nwords += 1
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

def get_avg_feature_vecs(reviews, model, num_features):
    # Function to generate vectors for all movie reviews in a dataset
    counter = 0
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        counter += 1
    return review_feature_vecs

# Convert the training and test data into fixed-length feature vectors
data_vecs_sg = get_avg_feature_vecs(sentences, model_cbow, num_features)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_vecs_sg, df.label_num, test_size=0.2)

In [None]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf = GaussianNB()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.14      0.24       367
           1       0.35      0.09      0.14       308
           2       0.34      0.94      0.50       299

    accuracy                           0.37       974
   macro avg       0.46      0.39      0.29       974
weighted avg       0.48      0.37      0.29       974



In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.60      0.59       367
           1       0.54      0.64      0.59       308
           2       0.57      0.43      0.49       299

    accuracy                           0.56       974
   macro avg       0.56      0.56      0.56       974
weighted avg       0.56      0.56      0.56       974



In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel='poly', random_state=0)
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.61      0.57       367
           1       0.55      0.56      0.55       308
           2       0.52      0.42      0.47       299

    accuracy                           0.53       974
   macro avg       0.53      0.53      0.53       974
weighted avg       0.53      0.53      0.53       974



In [None]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.75      0.58       367
           1       0.52      0.56      0.54       308
           2       0.66      0.12      0.20       299

    accuracy                           0.50       974
   macro avg       0.55      0.48      0.44       974
weighted avg       0.54      0.50      0.45       974



In [None]:
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.68      0.57       367
           1       0.45      0.64      0.53       308
           2       0.81      0.07      0.13       299

    accuracy                           0.48       974
   macro avg       0.58      0.46      0.41       974
weighted avg       0.57      0.48      0.42       974



In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(scaled_train_embed, y_train)

In [None]:
y_pred = knn.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.59      0.54       367
           1       0.48      0.56      0.52       308
           2       0.51      0.33      0.40       299

    accuracy                           0.50       974
   macro avg       0.50      0.49      0.49       974
weighted avg       0.50      0.50      0.49       974



**Observations**:

*   Random forest classifier gave 0.56 accuracy.
*   Support vector classifier with linear kernel gave accuracy of 0.53 .
*   Accuracy with Naive bayes is 0.37 , with SVM poly kernel is 0.50, with SVM rbf kernel is 0.48, with knn is 0.50