In [9]:
from gensim.models.word2vec import Word2Vec

In [18]:
import re
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
import nltk
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def preprocess(text):
  text = text.lower() #lowercase
  text = re.sub(r'[^\w\s]', '', text) #remove punctuations
  text = re.sub(r'\d+', '', text) #remove numbers
  text = " ".join(text.split()) #stripWhitespace
  text = text.split()
  text = [x for x in text if x not in stop_words] #remove stopwords
  text = [x for x in text if x not in ["product", "skin","products"]] #remove task specific stopwords
  text = " ".join(text)
  #stemmer_ps = PorterStemmer()
  #text = [stemmer_ps.stem(word) for word in text.split()] #stemming
  #text = " ".join(text)
  lemmatizer = WordNetLemmatizer()
  text = [lemmatizer.lemmatize(word) for word in text.split()]  #lemmatization
  text = " ".join(text)
  return(text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [19]:
import pandas as pd
data = pd.read_csv('Ulta Skincare Reviews.csv', encoding='utf-8')

In [20]:
data.head()


Unnamed: 0,Review_Title,Review_Text,Verified_Buyer,Review_Date,Review_Location,Review_Upvotes,Review_Downvotes,Product,Brand,Scrape_Date
0,Perfect,Love using this on my face while in the shower...,No,15 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
1,You need this,Even better than the daily microfoliant. I'm o...,No,27 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
2,Clean skin,Enjoy this product so much ! I look forward to...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
3,Love This Stuff!,I've never tried anything like this before and...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23
4,This exfoliates very nicely and,This exfoliates very nicely and gives a very s...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23


In [21]:
data = data.dropna(subset=['Review_Text'])
data['review_processed']=data['Review_Text'].apply(lambda x:preprocess(x))
data['review_processed']=data['review_processed'].apply(lambda x:x.split())


In [22]:
data.head()

Unnamed: 0,Review_Title,Review_Text,Verified_Buyer,Review_Date,Review_Location,Review_Upvotes,Review_Downvotes,Product,Brand,Scrape_Date,review_processed
0,Perfect,Love using this on my face while in the shower...,No,15 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23,"[love, using, face, shower, heat, give, light,..."
1,You need this,Even better than the daily microfoliant. I'm o...,No,27 days ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23,"[better, daily, microfoliant, im, obsessed, sm..."
2,Clean skin,Enjoy this product so much ! I look forward to...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23,"[enjoy, look, forward, using, really, feel, gr..."
3,Love This Stuff!,I've never tried anything like this before and...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23,"[ive, tried, like, love, apply, face, little, ..."
4,This exfoliates very nicely and,This exfoliates very nicely and gives a very s...,No,2 months ago,Undisclosed,0,0,Multi-Vitamin Thermafoliant,Dermalogica,3/27/23,"[exfoliates, nicely, give, smooth, irritation,..."


In [23]:
model = Word2Vec(sentences=data['review_processed'].tolist(), vector_size=100, sg=1,min_count=5,window=5,workers=50,seed=10,epochs=50)
#vector_size (int, optional) – Dimensionality of the word vectors.
#min_count (int, optional) – Ignores all words with total frequency lower than this.
#window (int, optional) – Maximum distance between the current and predicted word within a sentence.
#workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
#sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
#epochs (int, optional) – Number of iterations (epochs) over the corpus. (Formerly: iter)

#The meaning of most of the parameters are beyond the scope of this class. If interested, please check the official documentations: https://radimrehurek.com/gensim/models/word2vec.html

In [24]:
model.save('w2v_dr.w2v')

model=Word2Vec.load('w2v_dr.w2v')
vocab = model.wv.index_to_key
len(vocab)

1375

In [39]:
outdata=pd.DataFrame(model.wv.vectors)
outdata.to_csv('word2vec_ratemds.tsv',sep='\t',index=False,header=False)
pd.DataFrame(model.wv.index_to_key).to_csv('word2vec_ratemds_words.tsv',sep='\t',index=False,header=False)

In [25]:
model.wv.most_similar('smooth', topn=10)

[('soft', 0.6513921022415161),
 ('feeling', 0.5631524324417114),
 ('leaf', 0.5416674613952637),
 ('rejuvenated', 0.5048118829727173),
 ('clean', 0.4919344186782837),
 ('looking', 0.48878416419029236),
 ('great', 0.4862741529941559),
 ('renewed', 0.4746168553829193),
 ('glowy', 0.4729959964752197),
 ('lathered', 0.4642159044742584)]

In [27]:
model.wv.most_similar('cheap', topn=10)

[('doubt', 0.4936646819114685),
 ('pricey', 0.46230578422546387),
 ('hassle', 0.45153021812438965),
 ('recommendation', 0.4510311484336853),
 ('replacement', 0.45067599415779114),
 ('pay', 0.4475680887699127),
 ('sell', 0.44373080134391785),
 ('overpriced', 0.4414990246295929),
 ('drug', 0.4393978416919708),
 ('tag', 0.43888795375823975)]

In [28]:
model.wv.most_similar('herbal', topn=10)

[('divine', 0.7995284199714661),
 ('citrus', 0.6289437413215637),
 ('lush', 0.568353533744812),
 ('smelling', 0.5432272553443909),
 ('food', 0.5343807935714722),
 ('tea', 0.53052818775177),
 ('allergic', 0.5271401405334473),
 ('grade', 0.5247587561607361),
 ('hair', 0.5072873830795288),
 ('tree', 0.494004487991333)]

In [29]:
model.wv.most_similar('smell', topn=10)

[('reminds', 0.5236154198646545),
 ('sphere', 0.5113282203674316),
 ('pleasant', 0.48257118463516235),
 ('overpowering', 0.47820329666137695),
 ('smelling', 0.4765167236328125),
 ('kinda', 0.4749721884727478),
 ('odd', 0.455933153629303),
 ('sticking', 0.4348635971546173),
 ('citrus', 0.4205692410469055),
 ('giving', 0.4114507734775543)]

In [30]:
model.wv.most_similar('acne', topn=10)

[('hormonal', 0.6075305342674255),
 ('struggling', 0.564468264579773),
 ('adult', 0.5638186931610107),
 ('prone', 0.5598602890968323),
 ('scar', 0.5506152510643005),
 ('suffered', 0.5022419095039368),
 ('dealing', 0.49983787536621094),
 ('uneven', 0.49755778908729553),
 ('lighten', 0.4956566095352173),
 ('cystic', 0.4913373291492462)]

In [31]:
model.wv.most_similar('dry', topn=10)

[('zone', 0.5335835218429565),
 ('summer', 0.5229618549346924),
 ('tends', 0.5204784870147705),
 ('flakey', 0.5178025960922241),
 ('cold', 0.48047029972076416),
 ('cleanser', 0.47505369782447815),
 ('patch', 0.4653399884700775),
 ('oily', 0.4635457694530487),
 ('fall', 0.46009233593940735),
 ('t', 0.45917120575904846)]

In [32]:
v_time = model.wv['time']

In [33]:
v_time

array([-2.01928258e-01,  2.54762381e-01,  4.22765464e-02,  3.78485650e-01,
        3.25205624e-01, -1.85549408e-01, -8.09025392e-02, -5.41005284e-03,
       -3.91937554e-01, -2.23091687e-03,  2.37277105e-01,  5.93509339e-02,
        2.94198468e-02,  4.12206531e-01,  3.64330381e-01, -8.29424486e-02,
        1.41016856e-01,  6.32244647e-01, -5.88633493e-02,  1.54567540e-01,
       -1.04692444e-01,  4.08070087e-02, -2.26057783e-01,  1.91051548e-03,
       -8.32311749e-01, -7.66872019e-02, -3.71907162e-03,  6.04426824e-02,
       -3.26727420e-01,  2.84893900e-01, -1.74100529e-02, -2.11853132e-01,
        1.07992329e-01, -1.18060283e-01, -1.25883028e-01,  4.45427597e-01,
       -1.57202065e-01,  1.78880155e-01, -3.02102417e-01,  4.47809964e-01,
       -2.58425742e-01,  7.01247305e-02, -6.16229028e-02, -1.95707247e-01,
       -3.34783253e-04,  2.71953672e-01,  4.74596947e-01, -4.61636424e-01,
        3.50323953e-02, -1.40656024e-01,  2.41618946e-01,  3.15565407e-01,
       -3.10537089e-02, -

In [60]:
model.wv.similarity('acne', 'smooth')

0.18292816

In [58]:
v_acne = model.wv['acne']
v_smooth= model.wv['smooth']
import numpy
numpy.dot(v_acne, v_smooth)/(numpy.linalg.norm(v_acne)* numpy.linalg.norm(v_smooth))


0.18292814

In [59]:
v_acne = model.wv['acne']
v_scar= model.wv['scar']
import numpy
numpy.dot(v_acne, v_scar)/(numpy.linalg.norm(v_acne)* numpy.linalg.norm(v_scar))

0.55061525

In [70]:
v_acne = model.wv['acne']
v_healthy= model.wv['healthy']
numpy.dot(v_acne, v_healthy)/(numpy.linalg.norm(v_acne)* numpy.linalg.norm(v_healthy))

0.021453235

In [57]:
v_acne = model.wv['acne']
v_smooth = model.wv['smooth']
v_salicylic = model.wv['salicylic']
v_scar = model.wv['scar']
created_smooth = v_acne + v_salicylic
created_scar = v_acne + v_salicylic
numpy.dot(created_smooth, v_smooth)/(numpy.linalg.norm(created_smooth)* numpy.linalg.norm( v_smooth))


0.21828108

In [56]:
numpy.dot(created_scar, v_scar)/(numpy.linalg.norm(created_scar)* numpy.linalg.norm( v_scar))

0.53010076

In [72]:
created_healthy = v_acne + v_salicylic
numpy.dot(created_healthy, v_healthy)/(numpy.linalg.norm(created_healthy)* numpy.linalg.norm( v_healthy))

0.106446415

In [61]:
v_acne = model.wv['acne']
v_smooth = model.wv['smooth']
v_vitamin = model.wv['vitamin']
v_scar = model.wv['scar']
created_smooth = v_acne + v_vitamin

numpy.dot(created_smooth, v_smooth)/(numpy.linalg.norm(created_smooth)* numpy.linalg.norm( v_smooth))


0.2952657

In [73]:
created_healthy = v_acne + v_vitamin
numpy.dot(created_healthy, v_healthy)/(numpy.linalg.norm(created_healthy)* numpy.linalg.norm( v_healthy))

0.06563923

In [62]:
created_scar = v_acne + v_vitamin
numpy.dot(created_scar, v_scar)/(numpy.linalg.norm(created_scar)* numpy.linalg.norm( v_scar))

0.47145757

In [63]:
v_acne = model.wv['acne']
v_smooth = model.wv['smooth']
v_lactic = model.wv['lactic']
v_scar = model.wv['scar']
v_healthy = model.wv['healthy']
created_smooth = v_acne + v_lactic

numpy.dot(created_smooth, v_smooth)/(numpy.linalg.norm(created_smooth)* numpy.linalg.norm( v_smooth))

0.21564613

In [71]:
created_healthy = v_acne + v_lactic
numpy.dot(created_healthy, v_healthy)/(numpy.linalg.norm(created_healthy)* numpy.linalg.norm( v_healthy))

0.05534291

In [77]:
created_scar = v_acne + v_lactic
numpy.dot(created_scar, v_scar)/(numpy.linalg.norm(created_scar)* numpy.linalg.norm( v_scar))

0.4666556

In [67]:
v_acne = model.wv['acne']
v_smooth = model.wv['smooth']
v_herbal = model.wv['herbal']
v_scar = model.wv['scar']
v_healthy = model.wv['healthy']
created_smooth = v_acne + v_herbal

numpy.dot(created_smooth, v_smooth)/(numpy.linalg.norm(created_smooth)* numpy.linalg.norm( v_smooth))


0.23659256

In [68]:

created_scar = v_acne + v_herbal
numpy.dot(created_scar, v_scar)/(numpy.linalg.norm(created_scar)* numpy.linalg.norm( v_scar))

0.37556463

In [69]:
created_healthy = v_acne + v_herbal
numpy.dot(created_healthy, v_healthy)/(numpy.linalg.norm(created_healthy)* numpy.linalg.norm( v_healthy))


0.18950157