In [1]:
import pandas as pd
import zipfile
import glob
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import FastText
import re
from tqdm import tqdm
from gensim.models import Word2Vec
tqdm.pandas()
import time

In [2]:
df = pd.read_csv("complete_dataset.csv")

In [3]:
reviews = df["reviews"]

In [4]:
stop_words = stopwords.words("english")

In [5]:
lemma = WordNetLemmatizer()
data = []
# seperate reviews into sentences
def textProcessing(inp):
    print("hello")
    sentences = sent_tokenize(str(inp).lower())
    
    for sentence in sentences:
#         print("sentence input: ",sentence)
        sent = re.sub(r"[^a-zA-z ]", '', sentence)
        sent = re.sub(r" +", ' ', sent)
        inp_split = sent.split()
#         print("splited input: ",inp_split)
        ns = [lemma.lemmatize(word) for word in inp_split if word not in stop_words]
        print(ns)
        data.append(ns)

In [52]:
start = time.time()
for review in reviews:
    textProcessing(review)
print(time.time() - start)

In [10]:
# data

In [11]:
model = Word2Vec(data, size=250, sg = 1)

In [12]:
def compute_similar_words(word, cutoff=0.70):
    similar_words = []
    try:
        similar_words = [item for item in model.wv.most_similar(word, topn=len(model.wv.vocab)) if item[1] >= cutoff]
    except KeyError:
        pass
    return similar_words

In [46]:
price = compute_similar_words("price")
size = compute_similar_words("size", cutoff=0.6)
quality = compute_similar_words("quality", cutoff=0.6)
battery = compute_similar_words("battery", cutoff=0.6)
design = compute_similar_words("design", cutoff=0.6)
beam = compute_similar_words("beam", cutoff=0.6)

In [44]:
beam

[('flood', 0.7934977412223816),
 ('focused', 0.7723909616470337),
 ('pattern', 0.7713639140129089),
 ('throw', 0.760788083076477),
 ('spot', 0.7566829323768616),
 ('spill', 0.7561757564544678),
 ('focus', 0.7559686303138733),
 ('concentrated', 0.7347534894943237),
 ('floody', 0.7339653372764587),
 ('wide', 0.7266631722450256),
 ('hotspot', 0.7265369296073914),
 ('width', 0.726333498954773),
 ('distance', 0.7200384736061096),
 ('central', 0.7195073962211609),
 ('cone', 0.7182235717773438),
 ('narrow', 0.7123903036117554),
 ('uniform', 0.7099975943565369),
 ('intense', 0.7093213796615601),
 ('angle', 0.7078049778938293),
 ('illumination', 0.7066079378128052),
 ('cast', 0.6992918848991394),
 ('focusing', 0.6974489092826843),
 ('view', 0.6897667646408081),
 ('circle', 0.6896113753318787),
 ('center', 0.6871466636657715),
 ('diffused', 0.6868414282798767),
 ('homogeneous', 0.6834366321563721),
 ('broad', 0.6823222041130066),
 ('emitted', 0.6823209524154663),
 ('balance', 0.6821483969688416)

In [47]:
len(price), len(size), len(quality), len(battery), len(design), len(beam)

(29, 423, 128, 80, 1124, 119)

In [48]:
keywords = {'size':size, 'quality':quality, 'battery':battery, 'design':design, 'beam':beam, 'price': price}

In [24]:
def checkPresence(sentence, words):
    for keyword in words:
        if keyword[0] in word_tokenize(sentence):
            return True
    return False

def get_aspects(reviews_in_sentences, keywords):
    ret = []
    for sentences in reviews_in_sentences:
        filtered = ''
        for sentence in sentences:
            if checkPresence(sentence, keywords):
                filtered += sentence 
        ret.append(filtered)
    return ret

def get_filtered_aspects(review):
    reviews_in_sentences = sent_tokenize(str(review).lower())
    ans = {}
    for sentence in reviews_in_sentences:
        for name, keyword_list in keywords.items():
            flag = checkPresence(sentence, keyword_list)
            if flag:
                try:
                    ans[sentence].append(name)
                except KeyError:
                    ans[sentence] = [name]
    return ans

In [20]:
reviews_in_sentences = [sent_tokenize(str(review).lower()) for review in reviews]

In [21]:
df = df[['reviews']]

In [49]:
start = time.time()

df['size'] = get_aspects(reviews_in_sentences, size)
df['quality'] = get_aspects(reviews_in_sentences, quality)
df['battery'] = get_aspects(reviews_in_sentences, battery)
df['design'] = get_aspects(reviews_in_sentences, design)
df['beam'] = get_aspects(reviews_in_sentences, beam)
df['price'] = get_aspects(reviews_in_sentences, price)
end = time.time()
print(f"Took {end - start} seconds to match sentences into aspects.")

Took 8867.744539737701 seconds to match sentences into aspects.


In [50]:
df.to_csv("word2vec_sentence_output_0.7.csv")

## Fasttext

In [51]:
from gensim.models import FastText

In [53]:
model = FastText(data, size=100, window=5, min_count=5, workers=4,sg=1)

In [54]:
price = compute_similar_words("price")
size = compute_similar_words("size")
quality = compute_similar_words("quality")
battery = compute_similar_words("battery")
design = compute_similar_words("design")
beam = compute_similar_words("beam")

In [55]:
len(price), len(size), len(quality), len(battery), len(design), len(beam)

(81, 157, 107, 40, 537, 40)

In [61]:
price

[('pricey', 0.9515231251716614),
 ('pricy', 0.9504089951515198),
 ('priced', 0.8976841568946838),
 ('priceperformance', 0.8945684432983398),
 ('performance', 0.826082170009613),
 ('value', 0.8240972757339478),
 ('overpriced', 0.8200380802154541),
 ('expense', 0.80781489610672),
 ('highperformance', 0.8076149225234985),
 ('cost', 0.7950354814529419),
 ('money', 0.7908743023872375),
 ('expensive', 0.7870935201644897),
 ('pay', 0.7866541147232056),
 ('expert', 0.7792508602142334),
 ('prime', 0.7784606218338013),
 ('advice', 0.7713238596916199),
 ('premium', 0.7664700150489807),
 ('decisive', 0.7647730708122253),
 ('ratio', 0.7636321187019348),
 ('reality', 0.7632865905761719),
 ('convince', 0.7604442834854126),
 ('pricing', 0.7601076364517212),
 ('justify', 0.760007381439209),
 ('experience', 0.7578933835029602),
 ('euro', 0.7547465562820435),
 ('opinion', 0.7535992860794067),
 ('conscience', 0.7527101039886475),
 ('inexpensive', 0.7521642446517944),
 ('priority', 0.7515246868133545),
 ('

In [58]:
df = df[['reviews']]

In [59]:
start = time.time()

df['size'] = get_aspects(reviews_in_sentences, size)
df['quality'] = get_aspects(reviews_in_sentences, quality)
df['battery'] = get_aspects(reviews_in_sentences, battery)
df['design'] = get_aspects(reviews_in_sentences, design)
df['beam'] = get_aspects(reviews_in_sentences, beam)
df['price'] = get_aspects(reviews_in_sentences, price)
end = time.time()
print(f"Took {end - start} seconds to match sentences into aspects.")

Took 5060.366037130356 seconds to match sentences into aspects.


In [60]:
df.to_csv("ft_sentence_output.csv")