# Objective :To Predict Rating given product Reviews on  Amazon

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

import sqlite3

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec


  import pandas.util.testing as tm


In [6]:
#Using the SQLITE Table to read the data

con=sqlite3.connect(r"amazon-fine-food-reviews\database.sqlite")

# Filtering only positive or negatove reviews
#i.e Taking reviews with rating 1 or 2(Negative) and 4 or 5(Positive)

filtered_data=pd.read_sql_query("""
SELECT *
FROM REVIEWS
WHERE SCORE !=3
""",con)

In [7]:
#Give reviews with rating<3 a negative rating and greater than 3 a positive rating

def partition(x):
    if x>3:
        return 'positive'
    return 'negative'

#Changing Score with less than 3 to be negative and vice versa

actualscore=filtered_data['Score']
#print(actualscore)
PositiveNegative=actualscore.map(partition)
#print(PositiveNegative)

filtered_data['Score']=PositiveNegative


In [8]:
#Looking at the shape and attributes of the data
print(filtered_data.shape)
filtered_data.head()

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Exploratory Data Analysis


# Data Cleaning : Deduplication 

It is observed (as shown in the table below) that the reviews data had many duplicate entries. Hence it was necessary to remove duplicates in order to get unbiased results for the analysis of the data. Following is an example:

In [9]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


As can be seen above the same user has multiple reviews of the with the same values for HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary and Text  and on doing analysis it was found that <br>
<br> 
ProductId=B000HDOPZG was Loacker Quadratini Vanilla Wafer Cookies, 8.82-Ounce Packages (Pack of 8)<br>
<br> 
ProductId=B000HDL1RQ was Loacker Quadratini Lemon Wafer Cookies, 8.82-Ounce Packages (Pack of 8) and so on<br>

It was inferred after analysis that reviews with same parameters other than ProductId belonged to the same product just having different flavour or quantity. Hence in order to reduce redundancy it was decided to eliminate the rows having same parameters.<br>

The method used for the same was that we first sort the data according to ProductId and then just keep the first similar product review and delelte the others. for eg. in the above just the review for ProductId=B000HDL1RQ remains. This method ensures that there is only one representative for each product and deduplication without sorting would lead to possibility of different representatives still existing for the same product.

In [10]:
#Sorting Data according to ProductID in ascending order

sorted_data=filtered_data.sort_values('ProductId',axis=0,ascending=True, inplace=False, kind='quicksort', na_position='last')
print(sorted_data.shape)

(525814, 10)


In [11]:
#Deduplication of Entries

final=sorted_data.drop_duplicates(subset={'UserId','ProfileName','Time','Text','Summary','Score'},keep='first',inplace=False)

print(final.shape)

(365334, 10)


In [12]:
# Checking how much of the data remains

(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)  *100


69.47970194783707

Observation:- It was also seen that in two rows given below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calcualtions

In [13]:
display=pd.read_sql_query("""
SELECT *
FROM REVIEWS
WHERE SCORE!=3 AND Id=44737 OR Id=64422
ORDER BY ProductId
""",con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [14]:
final=final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]

In [15]:
#Lets see the number of entries now
print(final.shape)

#To count the no. of positive and negative reviews.
final['Score'].value_counts()

(365332, 10)


positive    307967
negative     57365
Name: Score, dtype: int64

# Bag of Words(BoW)

In [16]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc..."
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...


In [17]:
count_vect=CountVectorizer() #in scikit learn
final_counts=count_vect.fit_transform(final['Text'].values)


In [18]:
type(final_counts)

#From the output you can see that the COuntVectorizer produces a sparse representation of counts

scipy.sparse.csr.csr_matrix

In [19]:
final_counts.shape

(365332, 115281)

# Text Pre-processing

In [20]:
import re

import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop=set(stopwords.words('english')) #set of stopwords
#print(stop)
sno=nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer


In [21]:
#function to clean the word of any HTMl tag

def cleanHTML(sentence):
    cleanr=re.compile('<.*?>')
    cleantext=re.sub(cleanr,' ',sentence)
    return cleantext

#function to clean the word of any punctuation:

def cleanpunc(sentence):
    cleaned=re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned=re.sub(r'[.|,|)|(|/|\]',r' ',cleaned)
    return cleaned

print(stop)
print("*"*30)
print(sno.stem('tasty'))

{'below', 'an', 'through', 'and', 'being', 'again', "haven't", 'any', 'so', 'itself', 've', 'why', 'what', 'shan', 'after', "won't", 'hadn', 'am', 'does', 'doing', "aren't", 'most', 'he', 'into', 'had', 'a', 'its', "that'll", 'from', 'under', 'having', 'once', 'some', 'our', "you've", 'will', 'about', 'until', "should've", 'shouldn', 'couldn', 'ours', 'no', 'where', 'it', 'of', 'she', 'y', 'm', "you'll", 'wouldn', 't', 'against', 'mustn', 'myself', 'yourself', 'her', 'them', 'at', 'up', "wasn't", "wouldn't", 'we', 'can', 'all', 'yourselves', 'whom', 'isn', 'more', 'i', "hasn't", 'your', 're', 'such', 'o', 'been', "weren't", 'doesn', 'how', "shan't", "needn't", 'which', 'between', 'by', 'ma', 'have', 'are', "couldn't", 'during', 'while', 'just', 'needn', 'don', 'mightn', "didn't", 'himself', 'ain', 'weren', "it's", 'haven', "she's", 'each', 'then', 'other', 'didn', 'few', 'won', 'do', 'both', 's', 'these', 'there', 'nor', 'hasn', "doesn't", 'those', 'own', 'were', "mightn't", 'who', 'hi

In [22]:
# printing some random reviews
sent_0 = final['Text'].values[0]
print(sent_0)
print("="*50)

sent_1000 = final['Text'].values[1000]
print(sent_1000)
print("="*50)

sent_1500 = final['Text'].values[1500]
print(sent_1500)
print("="*50)

sent_4900 = final['Text'].values[4900]
print(sent_4900)
print("="*50)

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
For years I have been trying to simulate a truly Italian espresso or cappuccino without success.  Several packages arrived [...] just before Christmas.  My son had sent a Briel Cadiz Espresso machine, two lovely Russian china cups and a case of Espressione Classic Espresso Pods.  It was the kindest, most wonderful Christmas present I can remember.  The espresso machine excels in quality and ease of use.  The espresso is exquisite.  Each time I prepare an espresso or cappuccino I experience my son's loving kindness and a small part of the many gifts of Italy.  Heaven on Earth.
I have had both of our indoor cats on thi

In [23]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [24]:
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# <br /><br /> ==> after the above steps, we are getting "br br"
# we are including them into stop words list
# instead of <br /> if we have <br/> these tags would have revmoved in the 1st step

stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [25]:
sent_1500 = re.sub('[^A-Za-z0-9]+', ' ', sent_1500) # this means substituting everything except A-Za-z0-9.(^means not)
print(sent_1500)

I have had both of our indoor cats on this food for the past two years It was recommended by one of the staff at Petsmart It is great for keeping their coats shiny and soft plus Ginger and Oliver love it One of our cats is prone to really dry skin and this food has really helped I m thrilled to find I can order this on line because it s sometimes difficult to find or sold out of in the petstores 


In [26]:
from tqdm import tqdm
from bs4 import BeautifulSoup

preprocessed_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(final['Text'].values):
    sentance = re.sub(r"http\S+", "", sentance)           # remove urls from text python
    sentance = BeautifulSoup(sentance, 'lxml').get_text() # python-beautifulsoup-how-to-remove-all-tags-from-an-element
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()   #remove words with numbers python
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)        #remove spacial character
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sentance.strip())

100%|████████████████████████████████████████████████████████████████████████| 365332/365332 [02:09<00:00, 2811.58it/s]


# Bag of Words

In [27]:

count_vect=CountVectorizer() #in scikit learn
final_counts=count_vect.fit_transform(final['Text'].values)

print(type(final_counts))

#From the output you can see that the COuntVectorizer produces a sparse representation of counts


final_counts.shape

<class 'scipy.sparse.csr.csr_matrix'>


(365332, 115281)

# Ni-grams and n-grams


In [28]:
#bi-gram, tri-gram and n-gram

#removing stop words like "not" should be avoided before building n-grams
# count_vect = CountVectorizer(ngram_range=(1,2))
# please do read the CountVectorizer documentation http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# you can choose these numebrs min_df=10, max_features=5000, of your choice
count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram_counts = count_vect.fit_transform(preprocessed_reviews)
print("the type of count vectorizer ",type(final_bigram_counts))
print("the shape of out text BOW vectorizer ",final_bigram_counts.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_bigram_counts.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (365332, 3923364)
the number of unique words including both unigrams and bigrams  3923364


# TF-IDF

In [30]:
tf_idf_vect=TfidfVectorizer(ngram_range=(1,2))
final_tf_idf=tf_idf_vect.fit_transform(preprocessed_reviews)

In [31]:
final_tf_idf.shape

(365332, 3923364)

# Word2Vec

In [32]:
# Train your own Word2Vec model using your own text corpus
i=0
list_of_sentance=[]
for sentance in preprocessed_reviews:
    list_of_sentance.append(sentance.split())

In [33]:
# Using Google News Word2Vectors

# in this project we are using a pretrained model by google
# its 3.3G file, once you load this into your memory 
# it occupies ~9Gb, so please do this step only if you have >12G of ram
# we will provide a pickle file wich contains a dict , 
# and it contains all our courpus words as keys and  model[word] as values
# To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 
# from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# it's 1.9GB in size.


# http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.W17SRFAzZPY
# you can comment this whole cell
# or change these varible according to your need

''''is_your_ram_gt_16g=False
want_to_use_google_w2v = False
want_to_train_w2v = True

if want_to_train_w2v:
    # min_count = 5 considers only words that occured atleast 5 times
    w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)
    print(w2v_model.wv.most_similar('great'))
    print('='*50)
    print(w2v_model.wv.most_similar('worst'))
    
elif want_to_use_google_w2v and is_your_ram_gt_16g:
    if os.path.isfile('GoogleNews-vectors-negative300.bin'):
        w2v_model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        print(w2v_model.wv.most_similar('great'))
        print(w2v_model.wv.most_similar('worst'))
    else:
        print("you don't have gogole's word2vec file, keep want_to_train_w2v = True, to train your own w2v ") '''

'\'is_your_ram_gt_16g=False\nwant_to_use_google_w2v = False\nwant_to_train_w2v = True\n\nif want_to_train_w2v:\n    # min_count = 5 considers only words that occured atleast 5 times\n    w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)\n    print(w2v_model.wv.most_similar(\'great\'))\n    print(\'=\'*50)\n    print(w2v_model.wv.most_similar(\'worst\'))\n    \nelif want_to_use_google_w2v and is_your_ram_gt_16g:\n    if os.path.isfile(\'GoogleNews-vectors-negative300.bin\'):\n        w2v_model=KeyedVectors.load_word2vec_format(\'GoogleNews-vectors-negative300.bin\', binary=True)\n        print(w2v_model.wv.most_similar(\'great\'))\n        print(w2v_model.wv.most_similar(\'worst\'))\n    else:\n        print("you don\'t have gogole\'s word2vec file, keep want_to_train_w2v = True, to train your own w2v ") '

In [34]:
# min_count = 5 considers only words that occured atleast 5 times
import time
start=time.time()
w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50,workers=4)
end=time.time()
print(end-start)
print(w2v_model.wv.most_similar('great'))
print('='*50)
print(w2v_model.wv.most_similar('worst'))

97.46036911010742
[('terrific', 0.886653482913971), ('fantastic', 0.8813036680221558), ('good', 0.8579261302947998), ('awesome', 0.8562257289886475), ('excellent', 0.8557323217391968), ('wonderful', 0.8217575550079346), ('perfect', 0.7775750160217285), ('amazing', 0.7562124729156494), ('nice', 0.7557616233825684), ('fabulous', 0.7429346442222595)]
[('nastiest', 0.8756100535392761), ('greatest', 0.7705017328262329), ('disgusting', 0.7463821172714233), ('terrible', 0.7238553762435913), ('vile', 0.7149108648300171), ('horrible', 0.7114402651786804), ('best', 0.7105622291564941), ('awful', 0.6984840631484985), ('tastiest', 0.6724386215209961), ('horrid', 0.670474648475647)]


In [35]:
w2v_words=list(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])



number of words that occured minimum 5 times  33679
sample words  ['witty', 'little', 'book', 'makes', 'son', 'laugh', 'loud', 'recite', 'car', 'driving', 'along', 'always', 'sing', 'refrain', 'learned', 'whales', 'india', 'drooping', 'roses', 'love', 'new', 'words', 'introduces', 'silliness', 'classic', 'willing', 'bet', 'still', 'able', 'memory', 'college', 'grew', 'reading', 'sendak', 'books', 'watching', 'really', 'rosie', 'movie', 'incorporates', 'loves', 'however', 'miss', 'hard', 'cover', 'version', 'seem', 'kind', 'flimsy', 'takes']


# Converting text into vectors using wAvg W2V, TFIDF-W2V 

# Average Word2Vec

In [36]:
# average Word2Vec
# compute average word2vec for each review.

sent_vectors=[]  # the avg-w2v for each sentence/review is stored in this list

for sent in tqdm(list_of_sentance): # for each review/sentence
    sent_vec=np.zeros(50)           # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words=0                     # num of words with a valid vector in the sentence/review
    for word in sent:               # for each word in a review/sentence
        if word in w2v_words:
            vec=w2v_model.wv[word]
            sent_vec+=vec
            cnt_words+=1
    
    if cnt_words!=0:
        sent_vec/=cnt_words
    sent_vectors.append(sent_vec)
    
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|█████████████████████████████████████████████████████████████████████████| 365332/365332 [16:59<00:00, 358.31it/s]


365332
50


# TF-IDF weighted W2V

In [37]:
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
model = TfidfVectorizer()
model.fit(preprocessed_reviews)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

In [38]:
# TF-IDF weighted Word2Vec
tfidf_feat = model.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in tqdm(list_of_sentance): # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words and word in tfidf_feat:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

 69%|██████████████████████████████████████████████▉                     | 252002/365332 [27:22:56<64:22:49,  2.05s/it]

KeyboardInterrupt: 