# This project aims to implement a simple NLP task using Word2Vec

**Steps**
1. Importing the data
2. Data preprocessing
3. Modeling
4. Training
5. Examples


In [3]:
!pip install gensim
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.1 (from python-Levenshtein)
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)
  Downloading rapidfuzz-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.1 python-Levenshtein-0.25.1 rapidfuzz-3.9.4


In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gensim

In [6]:
# Mount Google Drive into the Colab filesystem so the dataset stored there
# can be read by the cells below.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [8]:
# Path of the review dump on the mounted Drive; it is read with lines=True,
# i.e. as one JSON record per line.
reviews_path = '/content/drive/MyDrive/NLP-Word2Vec/reviews_Sports_and_Outdoors_5.json'
df = pd.read_json(reviews_path, lines=True)

# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [9]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

(296337, 9)

# Preprocessing and Tokenization

For NLP, we usually convert the text to lowercase, split it into words, and remove punctuation and unnecessary whitespace.

In [10]:
# Tokenize every review with gensim's simple_preprocess; each review text
# becomes a list of lowercase word tokens.
tokenize = gensim.utils.simple_preprocess
review_text = df["reviewText"].apply(tokenize)

# Display the resulting Series of token lists.
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [11]:
# Inspect the token list produced for the first review (index label 0).
review_text.loc[0]

['this',
 'came',
 'in',
 'on',
 'time',
 'and',
 'am',
 'veru',
 'happy',
 'with',
 'it',
 'haved',
 'used',
 'it',
 'already',
 'and',
 'it',
 'makes',
 'taking',
 'out',
 'the',
 'pins',
 'in',
 'my',
 'glock',
 'very',
 'easy']

In [12]:
# Show the raw text of the same review, for comparison with its tokens.
df.loc[0, "reviewText"]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

**Above we confirmed that our customer comments were split into separate, lowercased words (tokens such as "I" and "32" were dropped as well)**

# Now, it's time to train the Word2Vec Model

In [13]:
# Word2Vec hyperparameters gathered in one place so they are easy to tune.
# No corpus is passed to the constructor: the vocabulary is built and the
# model is trained in separate steps further down.
w2v_params = {
    "window": 10,
    "min_count": 2,
    "workers": 4,
}
model = gensim.models.Word2Vec(**w2v_params)

In [14]:
# Build the model's vocabulary from the tokenized reviews.
# NOTE(review): progress_per=1000 presumably sets how often progress is
# reported while scanning the corpus — confirm against the gensim docs.
model.build_vocab(review_text, progress_per=1000)

In [15]:
# Train on the tokenized reviews, reusing the example count and the number
# of epochs already stored on the model object.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91339652, 121496535)

**Saving the model for later use**

In [16]:
# Persist the trained model to disk so it can be reloaded without retraining.
# NOTE(review): the path is relative to the current working directory; on
# Colab that is presumably the temporary runtime disk rather than the mounted
# Drive — confirm this is the intended location.
model.save("./word2vec-reviews_Sports_and_Outdoors_5.model")

**Let's now check how our model works to find the similarities between the words**

In [17]:
# Show the words the trained model ranks as most similar to 'good',
# together with their similarity scores.
model.wv.most_similar('good')

[('decent', 0.8653570413589478),
 ('great', 0.7789862751960754),
 ('nice', 0.7312660217285156),
 ('fantastic', 0.6986916065216064),
 ('excellent', 0.6245532035827637),
 ('terrific', 0.6245023608207703),
 ('reasonable', 0.5997759699821472),
 ('awesome', 0.5843641757965088),
 ('outstanding', 0.5733535885810852),
 ('wonderful', 0.5729143023490906)]

In [18]:
# Show the words the trained model ranks as most similar to 'bad',
# together with their similarity scores.
model.wv.most_similar('bad')

[('horrible', 0.6774658560752869),
 ('terrible', 0.6755551695823669),
 ('shabby', 0.6435626745223999),
 ('funny', 0.5697972178459167),
 ('upset', 0.5354256629943848),
 ('good', 0.5232346653938293),
 ('poor', 0.5149167776107788),
 ('greatest', 0.4909208416938782),
 ('awful', 0.4909031391143799),
 ('stupid', 0.4882829189300537)]

In [20]:
# Similarity score between the learned vectors of two specific words.
model.wv.similarity("great", "excellent")

0.6697843

# Conclusion

As we can see, computers can capture the relationships between words once we represent the text in a numeric format.