<a href="https://colab.research.google.com/github/Neetika23/Machine-Learning/blob/master/Amazon_Food_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Amazon Fine Food review Analysis

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Objective:** Given a review, tell whether a review is positive or not? This can be done with the help of score/rating. But we will not keep this score column and will predict the review without using the score column, because if we use score, it can be done easily with if-else statement. ***Text*** is the most important piece of information here

In [2]:
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve,auc
from nltk.stem.porter import PorterStemmer

  import pandas.util.testing as tm


In [3]:
# Opens connection to database
con = sqlite3.connect('/content/drive/My Drive/amazon_fine_food/database.sqlite')

In [4]:
# filtering only positive and negative review i.e not taking into consideration the review
# which are neutral, score=3.
filter_data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3
""", con)
# Run this sql command using con (connection)

In [5]:
# Give score > 3 as positive and score < 3 as negative
# Replacing the values of score col as positive or negative.
def partition(x):
  if x < 3:
    return 'negative'
  return 'positive'

# Changing reviews with score to be positive and negative.
actualScore = filter_data['Score']
positiveNegative = actualScore.map(partition) # Using function to replace data of score column
filter_data['Score'] = positiveNegative

In [6]:
filter_data.shape
filter_data.head()

# Time is stored as a unix time stamp.

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


##Data Cleaning: Deduplication

The reviews data has many duplicate entries. Hence it was necessary to remove them in order to get unbiased results for the analysis of the data.

In [7]:
# In the below query we sort the reviews given by the user of that particular user id, sorting is done based on productid.
display = pd.read_sql_query("""
SELECT * 
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductId
""",con)
display

# It can be seen that except product id all the data is same. How can a user reviews more than one product at a particular time stamp.
# To look for a product in amazon with a particular product id, use amazon.com/dp/<product_id> (ASIN- Amazon Standard Identification No.)
# The things was that it was a product with different flavours, so the user gave a common review and the it got stored with different
# product id.
# Product review got shared for other as the other was very similar to the particular product.
# These data does not add value to dataset.

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [8]:
# Lets remove them.
# Sorting the data according to the product_id in ascending order.
sort_data = filter_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [9]:
# Duplicates can be removed if something is common, so in this case, a data will be considered as the duplicate,
# when userid,profilename,time and text will be same. We will keep the first data point (productid) and delete the rest of the duplicate data points.
# inplace = false means it will return a (copy) value for you, instead to drop duplicates in place.
final = sort_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},keep='first', inplace=False)
final.shape

(364173, 10)

In [10]:
# Checking how much % of data still remains after removing duplicates
(final['Id'].size*1.0/filter_data['Id'].size*1.0)*100

#69% of data remaining after this cleaning.

69.25890143662969

In [11]:
# Now there is another problem with data.
# Helpfulness Numerator which is how many people said Yes to the review should be less than Helpfullness Denominator, which is
# how many people said Yes + how many people said No.
# But we find that in some data points HN > HD, which is an error, so this needs to be removed.
display = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id = 44737 OR Id = 64422
ORDER BY ProductId
""",con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [12]:
# So keep those data points where HN <= HD
final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [13]:
print(final.shape)

#How many positive and negative reviews are there in pur dataset?
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

##Bag-of-Words

In [14]:
#BoW
count_vec = CountVectorizer()
# We have count Vectorizer in sckit learn

# To compute BoW we just have to do as follows
final_counts = count_vec.fit_transform(final['Text'].values)
# Get the text col from final df and convert them into values. 

In [15]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [16]:
final_counts.get_shape()
# We get matrix with the printed rows and columns.
# Each row corresponds to the review and each col (dimension) corresponds to the unique words.
# Dimensionality of this vector is 115281.

(364171, 115281)

##Text-preprocessing
1. Remove html tags.
2. Remove punctuation marks.
3. Check for alpha-numeric.
4. Check if len > 2.
5. Convert to lowercase.
6. Remove stopwords.
7. Stemming


In [17]:
# find sentences containing html tags
import re # regular expression
i=0;
for sent in final['Text'].values:
  if(len(re.findall('<.*?>',sent))):
    print(i)
    print(sent)
    break;
  i += 1;


6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
# Link: https://pymotw.com/2/re/
# nltk : natural lang processing toolkit
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) # set of all stopwords stored in stop
# Initializing stemmer, also specifying the lang as english because it can also be done for non-english lang.
sno = nltk.stem.SnowballStemmer('english')

# Given any sentence, it will find all the text in < > and replace this it one space.
def cleanhtml(sentence):
  cleanr = re.compile('<.*?>')  # find any text included in < >, here .*? means any text.(check out documentation)
  cleantext = re.sub(cleanr, ' ', sentence) # sub: substitute , here means that substitute with space
  return cleantext

def cleanpunc(sentence):
  cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence) # | means logical OR, we replace ?!\'"3 with space
  cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
  return cleaned

print(stop) # see list of stopwords
print('***********************************************************')
print(sno.stem('tasty')) # find the stem word of tasty. Its "tasti".

{'ain', 'her', "wouldn't", 'were', "you're", 's', 'doesn', 'myself', 'too', 'was', 'themselves', 'our', "wasn't", 'yourselves', 'not', 'being', "should've", 'out', "doesn't", 'isn', "you'll", 'during', "shan't", 'nor', 'with', 'both', 'ourselves', 've', 'wouldn', 'hers', 'such', 'an', 'they', 'then', 'in', 'just', 'while', 'into', 'why', "you'd", "aren't", 'y', 'ma', 'yourself', 'through', 'other', 'mustn', 'you', 'me', 'what', "she's", 'above', 'will', 'don', "haven't", 'down', "you've", "won't", 'are', 'after', 'wasn', 'as', 'about', 'ours', 'who', 'very', "weren't", 'these', "that'll", 'having', 'or', 'did', 'between', 'haven', 'when', 'can', 'this', 'd', "couldn't", 'and', 'all', 'few', 'the', 'against', "hadn't", 'my', 'any', 'own', 'your', 'which', 'has', 'most', 'if', 'now', 'at', 'his', "don't", 'do', 'him', 'mightn', 'those', 'she', 'herself', 'under', 'had', 'itself', 'whom', 'should', 'until', 'he', 'some', 'theirs', "needn't", 'aren', 'so', "mightn't", 'over', 'for', "mustn

In [20]:
i = 0
str1 = ' '
# Creating a list of final strings after all of the processings
final_string = []
all_positive_words = [] # store words from +ve reviews here, so as process them later.
all_negative_words = [] # # store words from -ve reviews here, so as process them later.
s = ''

for sent in final['Text'].values: # for each sentence in final text value.
  filtered_sentence=[]
  sent = cleanhtml(sent) # clear all html tags.
  for w in sent.split(): # splitting the sentence into words.
    for cleaned_words in cleanpunc(w).split(): # clearing the punctuations from each words.
      if((cleaned_words.isalpha()) & (len(cleaned_words)>2)): # checking if it alpha-numeric and length > 2.
        if(cleaned_words.lower() not in stop): # converting it to lower case.
           s = (sno.stem(cleaned_words.lower())).encode('utf8') # doing the stemming process.
           filtered_sentence.append(s)  # storing the final words after passing it through all the conditions.
           if(final['Score'].values)[i] == 'positive':
             all_positive_words.append(s) # list of all positive words.
           if(final['Score'].values)[i] == 'negative':
             all_negative_words.append(s) # list of all negative words.
        else:
          continue
      else:
        continue
  # Join the filtered sentences to construct the final sentence.
  str1 = b" ".join(filtered_sentence)  # final string of cleaned words.

  final_string.append(str1)
  i+=1

In [24]:
# Adding another new column of CleanedText after doing all the processing.
final['CleanedText'] = final_string


In [25]:
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...


In [26]:
# Storing final table into a sqlite table for future, so that I dont need to keep repeating this preprocessing
conn = sqlite3.connect('/content/drive/My Drive/amazon_fine_food/final.sqlite')
c = conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, schema = None, if_exists = 'replace')

##Bi-Grams and n-Grams

In [27]:
# Let's see which word occur more often in my positive review and negative reviews.
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
print("Most Common Positive Words :", freq_dist_positive.most_common(20))
print("Most Common Negative Words :", freq_dist_negative.most_common(20))

# Output is printed as words with their frequency.

Most Common Positive Words : [(b'like', 139429), (b'tast', 129047), (b'good', 112766), (b'flavor', 109624), (b'love', 107357), (b'use', 103888), (b'great', 103870), (b'one', 96726), (b'product', 91033), (b'tri', 86791), (b'tea', 83888), (b'coffe', 78814), (b'make', 75107), (b'get', 72125), (b'food', 64802), (b'would', 55568), (b'time', 55264), (b'buy', 54198), (b'realli', 52715), (b'eat', 52004)]
Most Common Negative Words : [(b'tast', 34585), (b'like', 32330), (b'product', 28218), (b'one', 20569), (b'flavor', 19575), (b'would', 17972), (b'tri', 17753), (b'use', 15302), (b'good', 15041), (b'coffe', 14716), (b'get', 13786), (b'buy', 13752), (b'order', 12871), (b'food', 12754), (b'dont', 11877), (b'tea', 11665), (b'even', 11085), (b'box', 10844), (b'amazon', 10073), (b'make', 9840)]


From above we see that mose common positive and negative words overlap. For eg: 'like' which could be used as 'not like' etc.
So, its good to consider pairs of consequent words (bi-grams) or n grams.

In [28]:
count_vect = CountVectorizer(ngram_range=(1,2))   # 1 --> unigram and 2 --> bigrams.

# When I used Unigrams I had 115k dimensions and when I used bigrams, the number of dim increases to 2910K (as I have 1gram and 2gram both).
final_bigram_counts = count_vect.fit_transform(final['Text'].values)
final_bigram_counts.get_shape()

(364171, 2910192)

##TF-IDF

In [29]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2)) # It also have ngram param.
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)  # I just gave raw data here.

In [30]:
final_tf_idf.get_shape()  # Get exactly same as above 2910K.

(364171, 2910192)

In [31]:
# If I want to get each of the feature names (here, the list of words, as each word is dim here).
features = tf_idf_vect.get_feature_names()
len(features)

2910192

In [32]:
# Lets try to print some 10 features here.
features[100000:100010]

# These are bigrams, unigrams...

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and']

In [33]:
# Get the vector of review 3.
# Convert a row in sparsematrix to a numpy array
import numpy as np
print(np.array(final_tf_idf[3,:]))

  (0, 2857710)	0.12516284773073685
  (0, 1468762)	0.12516284773073685
  (0, 2579212)	0.07104099313724015
  (0, 1034587)	0.052718834273936616
  (0, 2436412)	0.08226308798444909
  (0, 1034365)	0.12516284773073685
  (0, 458832)	0.14613739881457247
  (0, 1334019)	0.11412869651927751
  (0, 2818673)	0.06679413440460479
  (0, 1394881)	0.07292728198961108
  (0, 869883)	0.07645300926395387
  (0, 2859157)	0.07942414973798338
  (0, 2565801)	0.07272882367484192
  (0, 159362)	0.03756535633266459
  (0, 2598521)	0.0509941884610428
  (0, 1478400)	0.05139544868418497
  (0, 997429)	0.058490230406077925
  (0, 1183590)	0.11467604291955785
  (0, 2542781)	0.09565729858861444
  (0, 1276133)	0.055596165637067714
  (0, 2142988)	0.10979995636542227
  (0, 2650620)	0.12516284773073685
  (0, 2619109)	0.07802654664202313
  (0, 1239571)	0.058975199933590856
  (0, 1795444)	0.08242442423807618
  :	:
  (0, 991870)	0.015558528071919465
  (0, 2743883)	0.04926122937961985
  (0, 2502558)	0.018127513404642612
  (0, 2212820)

In [34]:
def top_tfidf_feats(row, features, top_n=25):  # getting top 25 tfidf
  # Get top tfidf in a row and return them with corresponing ranks\
  # argsort is to sort.
  topn_ids = np.argsort(row)[::-1][:top_n]
  top_feats = [(features[i], row[i]) for i in topn_ids]
  df = pd.DataFrame(top_feats)
  df.columns = ['features', 'tfidf']
  return df

# for vector 1 corresponding to review 1, converting it to numpy array, giving all feature values, and getting top 25.
top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)

In [35]:
top_tfidf
# I am getting the top 25 terms in given review.
# tfidf vector is also a sparse matrix.

Unnamed: 0,features,tfidf
0,sendak books,0.173437
1,rosie movie,0.173437
2,paperbacks seem,0.173437
3,cover version,0.173437
4,these sendak,0.173437
5,the paperbacks,0.173437
6,pages open,0.173437
7,really rosie,0.168074
8,incorporates them,0.168074
9,paperbacks,0.168074


##Word2Vec

In [36]:
# Using Google News W2V.
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [None]:
# in this file, for every word there a vector, its been collected using google news.

In [None]:
model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz')
# This file contain 300D vectors.

In [None]:
model.wv['computer']
# To get the vector for a particular word.

In [None]:
model.wv.similarity('woman','man')
# Finding numeric similarity between words. It ranges for 0 to 1.
# 1 means exactly same.
# 0 means very very diff. It is the min value.

In [None]:
model.vw.most_similar('woman')
# Finding most similar words to woman. It gives output in decreasing order of similarities. Its case-sensitive. 
# Ex: 'woman' and 'Woman' may differ.

In [None]:
# model.wv.most_similar('tasti')
# The above line will generate error as there is no word tasti. So we may get error in some cases of stem words.
model.wv.most_similar('tasty')
# This will give words similar to tasty.
# Similarity values are large if points are closer.

In [None]:
model.wv.similarity('tasty','tast')
# Finding similarity between two words.

In [37]:
# Here we are creating the list of sentences consisting of words.
import gensim
i=0
list_of_sent = []
for sent in final['Text'].values:
  filtered_sentence=[]
  sent=cleanhtml(sent)
  for w in sent.split():
    for cleaned_words in cleanpunc(w).split():
      if(cleaned_words.isalpha()):
        filtered_sentence.append(cleaned_words.lower())
      else:
        continue
  list_of_sent.append(filtered_sentence)

In [38]:
print(final['Text'].values[0])
print("***********************************")
print(list_of_sent[0])

# We that the particular printed sentence is converted into the list.

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
***********************************
['this', 'witty', 'little', 'book', 'makes', 'my', 'son', 'laugh', 'at', 'loud', 'i', 'recite', 'it', 'in', 'the', 'car', 'as', 'were', 'driving', 'along', 'and', 'he', 'always', 'can', 'sing', 'the', 'refrain', 'hes', 'learned', 'about', 'whales', 'india', 'drooping', 'i', 'love', 'all', 'the', 'new', 'words', 'this', 'book', 'introduces', 'and', 'the', 'silliness', 'of', 'it', 'all', 'this', 'is', 'a', 'classic', 'book', 'i', 'am', 'willing', 'to', 'bet', 'my', 'son', 'will', 'still', 'be', 'able', 'to', 'recite', 'from', 'memory', 'when', 'he', 'is', 'in', 'college']


In [39]:
w2v_model = gensim.models.Word2Vec(list_of_sent,min_count=5,size=50,workers=4)
# min_count means if the words does not occur atleast 5 times then dont construct w2v for it.
# size means what dimensional vector do you want of the word.

In [40]:
words = list(w2v_model.wv.vocab)
print(len(words))
# I can get dict of all the words I have.

33783


In [41]:
w2v_model.wv.most_similar('tasty')
# Finding similar words for tasty in our corpus (review dataset).

  if np.issubdtype(vec.dtype, np.int):


[('tastey', 0.9078287482261658),
 ('satisfying', 0.8583592176437378),
 ('yummy', 0.8533511161804199),
 ('delicious', 0.8193768262863159),
 ('filling', 0.8150377869606018),
 ('flavorful', 0.7987992763519287),
 ('nutritious', 0.7558767199516296),
 ('addicting', 0.7549490332603455),
 ('delish', 0.7477218508720398),
 ('delectable', 0.7432020902633667)]

In [42]:
w2v_model.wv.most_similar('like')

  if np.issubdtype(vec.dtype, np.int):


[('resemble', 0.7128250002861023),
 ('dislike', 0.6608422994613647),
 ('mean', 0.6542459726333618),
 ('prefer', 0.6403476595878601),
 ('overpower', 0.6094322204589844),
 ('think', 0.6085252165794373),
 ('enjoy', 0.5944650173187256),
 ('miss', 0.5902699828147888),
 ('overwhelm', 0.5893973708152771),
 ('expect', 0.5830036401748657)]

In [43]:
count_vect_feat = count_vect.get_feature_names()  # list of words.
print(count_vect_feat.index('like'))
print(count_vect_feat[1442686])

1442686
like


##Avg W2V and TFIDF W2V

In [44]:
# computing avf w2v for each review.
sent_vectors = [];  # avg-w2v for each sentence is stored in this
for sent in list_of_sent: # for each review
  sent_vec = np.zeros(50) # as word vectors are of zero length
  cnt_words = 0; # num of words with the valid vec in the sentence
  for words in sent:  # for each word in review
    try:
      vec = w2v_model.wv[word]   # computing w2v
      sent_vec += vec  # adding each to sentence vec
      con_words += 1
    except:
      pass
  sent_vec /= cnt_words  # divide it by no. of words
  sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

  del sys.path[0]


364171
50


In [None]:
tfidf_feat = tf_idf_vect.get_feature_names()  # tfidf words/col names
# final_tf_idf is the sparse matrix with row=sentence, col=word and cell_value.

tfidf_sent_vectors = [];  # the tfidf-w2v for each review is stored here.
row = 0;
for sent in list_of_sent:  # for each review
  sent_vec = np.zeros(50)  # as word vec are of zero length
  weight_sum = 0;  # num of words with a valid vec in the review
  for word in sent:
    try:
      vec = w2v_model.wv[word]
      # obtain the tfidf of a word in a sentence
      tfidf = final_tf_idf[row,tfidf_feat.index(word)]
      sent_vec += (vec * tf_idf)
      weight_sum += tf_idf
    except:
      pass
  sent_vec /= weight_sum
  tfidf_sent_vectors.append(sent_vec)
  row += 1

