<a href="https://colab.research.google.com/github/JapiKredi/word2vec_movie_review/blob/main/Classification_using_word2vect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Train word2vec model


In [None]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Let rendered frames show up to 100 characters per text cell.
pd.options.display.max_colwidth = 100

# Load the review CSV and keep only the review text and its sentiment tag.
messages = pd.read_csv(
    '/content/movie_review.csv',
    encoding='latin-1',
)[["text", "tag"]]
messages.head()

Unnamed: 0,text,tag
0,"films adapted from comic books have had plenty of success , whether they're about superheroes ( ...",pos
1,"for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a...",pos
2,to say moore and campbell thoroughly researched the subject of jack the ripper would be like say...,pos
3,"the book ( or "" graphic novel , "" if you will ) is over 500 pages long and includes nearly 30 mo...",pos
4,"in other words , don't dismiss this film because of its source .",pos


In [None]:
# Tokenise every review with gensim's built-in cleaner; each row of
# `text_clean` becomes a list of word tokens.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)
messages.head()

Unnamed: 0,text,tag,text_clean
0,"films adapted from comic books have had plenty of success , whether they're about superheroes ( ...",pos,"[films, adapted, from, comic, books, have, had, plenty, of, success, whether, they, re, about, s..."
1,"for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a...",pos,"[for, starters, it, was, created, by, alan, moore, and, eddie, campbell, who, brought, the, medi..."
2,to say moore and campbell thoroughly researched the subject of jack the ripper would be like say...,pos,"[to, say, moore, and, campbell, thoroughly, researched, the, subject, of, jack, the, ripper, wou..."
3,"the book ( or "" graphic novel , "" if you will ) is over 500 pages long and includes nearly 30 mo...",pos,"[the, book, or, graphic, novel, if, you, will, is, over, pages, long, and, includes, nearly, mor..."
4,"in other words , don't dismiss this film because of its source .",pos,"[in, other, words, don, dismiss, this, film, because, of, its, source]"


In [None]:
# Class balance check: number of reviews per sentiment tag.
messages.loc[:, "tag"].value_counts()

pos    32937
neg    31783
Name: tag, dtype: int64

In [None]:
# Encode the sentiment tag as an integer label: 'pos' -> 1, 'neg' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them a second execution would map the already-encoded labels to NaN.
TAG_TO_LABEL = {'pos': 1, 'neg': 0, 1: 1, 0: 0}
encoded_tag = messages['tag'].map(TAG_TO_LABEL)

# Validate before overwriting the column so bad input never corrupts `messages`.
if encoded_tag.isna().any():
    raise ValueError("Unexpected value in 'tag' column; expected only 'pos' or 'neg'.")
messages['tag'] = encoded_tag

In [None]:
# Split data into train and test sets (20% of the rows are held out for testing).
# A fixed random_state keeps the split, and every metric computed from it,
# the same across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'],
    messages['tag'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn word embeddings from the tokenised training reviews.
w2v_model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=100,  # length of each learned word vector
    window=5,         # context window size
    min_count=2,      # skip words that occur fewer than 2 times
)

In [None]:
# `index_to_key` lists every word that our Word2Vec model learned a vector for.
# Put another way, it is every word that appeared in the training data at least
# twice (the model was trained with min_count=2).
w2v_model.wv.index_to_key

In [None]:
# Ten nearest neighbours of "king" in the embedding space learned by our model.
w2v_model.wv.most_similar(positive=['king'], topn=10)

[('master', 0.9348936676979065),
 ('captain', 0.9299657940864563),
 ('british', 0.9282823801040649),
 ('english', 0.9257248640060425),
 ('george', 0.924160897731781),
 ('jeff', 0.9209091067314148),
 ('shelmikedmu', 0.9206836819648743),
 ('patrick', 0.9201596975326538),
 ('ray', 0.9201411604881287),
 ('kennedy', 0.9182192087173462)]

In [None]:
# Generate per-sentence word-vector matrices: replace each word in a review
# with its learned word vector, skipping words the model has no vector for.
words = set(w2v_model.wv.index_to_key)


def sentences_to_word_vectors(sentences, keyed_vectors, vocabulary):
    """Look up the word vectors for every tokenised sentence.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenised sentences.
    keyed_vectors : mapping from word to vector
        Indexed with ``keyed_vectors[word]``.
    vocabulary : container of str
        Words that have a vector; any other token is skipped.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per sentence. Each entry is the array
        built from that sentence's known-word vectors (an empty array when no
        token is in ``vocabulary``).
    """
    per_sentence = [
        np.array([keyed_vectors[token] for token in tokens if token in vocabulary])
        for tokens in sentences
    ]
    # Sentences contain different numbers of known words, so the result is
    # ragged. Passing a ragged list straight to np.array() emits a
    # VisibleDeprecationWarning on older NumPy and raises ValueError on
    # NumPy >= 1.24, so fill an object array element by element instead.
    ragged = np.empty(len(per_sentence), dtype=object)
    for position, matrix in enumerate(per_sentence):
        ragged[position] = matrix
    return ragged


X_train_vect = sentences_to_word_vectors(X_train, w2v_model.wv, words)
X_test_vect = sentences_to_word_vectors(X_test, w2v_model.wv, words)

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [None]:
# Token count of each training sentence next to the number of word vectors kept
# for it. The two differ whenever a sentence contains words that are not in the
# model vocabulary, because those words were skipped during the lookup.
for tokens, vectors in zip(X_train, X_train_vect):
    print(len(tokens), len(vectors))

In [None]:
# Average the word vectors for each sentence (and assign a vector of zeros if the
# model did not learn any of the words in the text message during training).
def average_word_vectors(sentence_vectors, vector_size):
    """Collapse each sentence's word vectors into one fixed-length vector.

    Parameters
    ----------
    sentence_vectors : iterable of numpy.ndarray
        One array of word vectors per sentence; an array may be empty.
    vector_size : int
        Length of the zero vector used for sentences with no word vectors.

    Returns
    -------
    list of numpy.ndarray
        One vector per sentence: the mean over axis 0 of its word vectors, or
        ``np.zeros(vector_size)`` when the sentence has none.
    """
    averaged = []
    for vectors in sentence_vectors:
        if vectors.size:
            averaged.append(vectors.mean(axis=0))
        else:
            averaged.append(np.zeros(vector_size, dtype=float))
    return averaged


# Read the dimensionality from the trained model so the zero-vector fallback
# cannot drift out of sync with the `vector_size` used for training.
X_train_vect_avg = average_word_vectors(X_train_vect, w2v_model.wv.vector_size)
X_test_vect_avg = average_word_vectors(X_test_vect, w2v_model.wv.vector_size)

In [None]:
# Sanity check: print each training sentence's token count next to the length
# of its averaged vector, to confirm the vector lengths are consistent.
for tokens, sentence_vector in zip(X_train, X_train_vect_avg):
    print(len(tokens), len(sentence_vector))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
9 100
9 100
20 100
44 100
18 100
16 100
5 100
39 100
2 100
21 100
26 100
6 100
24 100
30 100
16 100
35 100
31 100
18 100
25 100
48 100
25 100
22 100
17 100
15 100
36 100
21 100
26 100
40 100
20 100
12 100
10 100
20 100
42 100
8 100
37 100
54 100
8 100
24 100
15 100
5 100
14 100
33 100
4 100
15 100
33 100
38 100
14 100
16 100
44 100
15 100
15 100
7 100
3 100
64 100
13 100
41 100
21 100
8 100
17 100
18 100
12 100
27 100
11 100
11 100
28 100
38 100
19 100
20 100
6 100
25 100
7 100
22 100
4 100
15 100
13 100
27 100
13 100
43 100
22 100
14 100
40 100
13 100
9 100
0 100
5 100
41 100
28 100
11 100
6 100
15 100
15 100
10 100
26 100
7 100
21 100
25 100
41 100
5 100
21 100
5 100
19 100
15 100
9 100
22 100
20 100
23 100
20 100
45 100
20 100
45 100
13 100
37 100
1 100
28 100
4 100
18 100
13 100
23 100
26 100
22 100
29 100
25 100
17 100
28 100
34 100
28 100
35 100
15 100
34 100
6 100
16 100
26 100
14 100
20 100
7 100
16 100
6 100
2 10

## Fit LogisticRegression On Top Of Word Vectors

In [None]:
# Instantiate and fit a basic logistic regression model on top of the averaged
# sentence vectors.
from sklearn.linear_model import LogisticRegression

# The default iteration budget (max_iter=100) is exhausted before the solver
# converges on these features, so allow more iterations.
lr = LogisticRegression(max_iter=1000)
lr_model = lr.fit(X_train_vect_avg, y_train.values.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Use the trained model to make predictions on the test data: one predicted
# label per averaged test-sentence vector.
y_pred = lr_model.predict(X_test_vect_avg)

In [None]:
# Score the holdout predictions: precision, recall and plain accuracy.
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy is the share of test rows whose prediction equals the true label.
accuracy = (y_pred == y_test).sum() / len(y_pred)

report = 'Precision: {} / Recall: {} / Accuracy: {}'
print(report.format(*(round(metric, 3) for metric in (precision, recall, accuracy))))

Precision: 0.563 / Recall: 0.66 / Accuracy: 0.57


In [None]:
import transformers
import torch

# Load the pre-trained tokenizer and encoder used to embed sentences.
model_name = 'bert-base-uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
bert_model = transformers.AutoModel.from_pretrained(model_name)
bert_model.eval()  # inference only: make sure dropout is disabled

# Keep the original global name bound for any code that still refers to it.
# get_sentence_embedding() deliberately uses `bert_model`, so rebinding `model`
# later in the notebook (e.g. to a classifier) cannot break the embedding function.
model = bert_model


def get_sentence_embedding(sentence):
    """Return a fixed-length embedding for one piece of text.

    The text is tokenised, passed through the pre-trained encoder, and the
    last hidden states are mean-pooled over the token dimension.

    Parameters
    ----------
    sentence : str
        Text to embed. Inputs longer than the tokenizer's maximum length are
        truncated rather than causing the encoder to fail.

    Returns
    -------
    numpy.ndarray
        1-D array whose length is the encoder's hidden size.
    """
    # return_tensors='pt' yields a (1, seq_len) tensor, i.e. a batch of one.
    input_ids = tokenizer.encode(sentence, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = bert_model(input_ids)
    # Average over tokens (dim=1), then drop the batch dimension.
    embeddings = output.last_hidden_state.mean(dim=1).squeeze(0)
    return embeddings.numpy()


In [None]:
# Embed every raw review text, one review at a time.
embeddings = list(map(get_sentence_embedding, messages['text']))

In [None]:
# Split the sentence embeddings and their labels into train and test sets.
# NOTE: this reuses the names X_train / X_test / y_train / y_test, so the
# word2vec splits from earlier in the notebook are no longer available afterwards.
# A fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    messages['tag'],
    test_size=0.2,
    random_state=42,
)

# Train a logistic regression model to predict the sentiment label.
# max_iter is raised above the default of 100 to give the solver room to converge.
# NOTE: `model` is rebound here from the pre-trained encoder to the fitted classifier.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# Score the holdout predictions: precision, recall and plain accuracy.
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy is the share of test rows whose prediction equals the true label.
accuracy = (y_pred == y_test).sum() / len(y_pred)

report = 'Precision: {} / Recall: {} / Accuracy: {}'
print(report.format(*(round(metric, 3) for metric in (precision, recall, accuracy))))

In [None]:
# Embed two unseen reviews and print the classifier's predicted labels for them.
new_reviews = ['This movie was great!', 'This movie was terrible!']
new_embeddings = list(map(get_sentence_embedding, new_reviews))
X_new = pd.DataFrame(new_embeddings)  # one row per review, one column per embedding dimension
predicted = model.predict(X_new)
print(predicted)