## FastText

In [1]:
import pandas as pd
import zipfile
import glob
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import contractions  # to expand contracted words
from gensim.models import FastText
import re

# Single shared WordNet lemmatizer instance, reused for every token in preprocess_text.
lemmatizer = WordNetLemmatizer()  # lemmatization

In [2]:
# Unpack the dataset archive into the current working directory.
with zipfile.ZipFile("dataset.zip", mode="r") as archive:
    archive.extractall(path="./")

In [2]:
# Collect every CSV under main_product/. glob returns paths in arbitrary,
# OS-dependent order, so sort them to keep the row order of the combined
# dataset (and anything trained on it) reproducible across machines.
files = sorted(glob.glob("main_product/*.csv"))

In [3]:
# Read the "reviews" column of every CSV and stack them into one DataFrame.
if not files:
    # Fail early with a clear message instead of an opaque NameError further down.
    raise FileNotFoundError("no CSV files found under main_product/ - was dataset.zip extracted?")

# Load all frames first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated rows on every iteration (quadratic cost).
frames = [pd.read_csv(file, index_col=False, usecols=["reviews"]) for file in files]
df = pd.concat(frames)  # per-file row labels are kept, exactly as before
del frames  # drop the per-file frames to free memory
print(f"size of dataset: {df.shape}")

size of dataset: (11929, 1)


## Processing
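1. Extract all the reviews.
2. Preprocess them by<br>
   a. removing punctuation <br>
   b. removing special characters and digits <br>
   c. converting all reviews to lower case <br>
   d. expanding contractions, dropping custom stop words, and lemmatizing the remaining tokens.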

In [4]:
# Flatten the "reviews" column into a plain Python list of raw review values.
reviews = df["reviews"].tolist()

In [5]:
# custom stop words, grouped by kind
my_stopwords = [
    # first- and second-person pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves',
    # third-person and relative pronouns
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'them', 'their', 'theirs', 'themselves',
    'who', 'whom',
    # forms of "be", "have" and "do"
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles and conjunction
    'a', 'an', 'the', 'and',
    # short fragments
    's', 't', 'd', 'll', 'm', 'o', 're', 've', 'y',
    # miscellaneous
    'in', 'ma', 'it',
]

In [6]:
# from nltk.stem import PorterStemmer 

# ps = PorterStemmer() 

# words = set(nltk.corpus.words.words())

def preprocess_text(text):
    """Clean one raw review and return its lemmatized tokens.

    Steps, in order:
      1. Replace every character that is not an ASCII letter or an apostrophe
         (digits, punctuation, symbols, newlines, ...) with a space.
      2. Collapse runs of spaces, then re-attach apostrophes left isolated
         between two spaces (``"don ' t"`` -> ``"don't"``).
      3. Expand contractions with ``contractions.fix``.
      4. Lower-case, split on whitespace, drop tokens listed in
         ``my_stopwords`` and lemmatize the rest with ``lemmatizer``.

    Parameters
    ----------
    text : object
        The raw review. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as "no review".

    Returns
    -------
    list of str
        The cleaned tokens; an empty list for a missing review.
    """
    # A missing review must not reach str(): it would become the bogus
    # token "nan" (or "none") and pollute the training corpus.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return []

    text = re.sub("[^a-zA-Z']", " ", str(text))  # keep only letters and apostrophes
    text = re.sub(' +', ' ', text)  # collapse multiple spaces
    text = text.replace(" ' ", "'")  # re-attach apostrophes separated by spaces
    text = contractions.fix(text)  # expand the contracted words
    text_tokens = text.lower().split()
    # Stop words are filtered before lemmatization, so the match is on the surface form.
    return [lemmatizer.lemmatize(word) for word in text_tokens if word not in my_stopwords]

In [95]:
# all_stopwords = stopwords.words('english')
# all_stopwords

In [7]:
# Clean and tokenize every review: one token list per review, in input order.
processed_text = [preprocess_text(review) for review in reviews]

In [8]:
# processed_text

## Train the model

In [9]:
# Train skip-gram (sg=1) FastText embeddings on the tokenized reviews.
# NOTE: `size` is the gensim 3.x keyword; gensim 4.0 renamed it to `vector_size`.
model_ted = FastText(
    processed_text,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [10]:
# Query through the keyed vectors (.wv): calling most_similar on the model
# itself is deprecated in gensim 3.x and removed in gensim 4.0.
model_ted.wv.most_similar("price")

  """Entry point for launching an IPython kernel.


[('pricy', 0.9623515605926514),
 ('pricey', 0.9523756504058838),
 ('priced', 0.9205743074417114),
 ('value', 0.8732026815414429),
 ('overpriced', 0.8606802821159363),
 ('money', 0.8554052710533142),
 ('paid', 0.8433690071105957),
 ('advice', 0.8413906097412109),
 ('pricing', 0.8355211615562439),
 ('device', 0.8193491697311401)]

In [25]:
# "rechargable" is a deliberate misspelling: FastText builds its vector from
# character n-grams. Query via .wv; model.most_similar is deprecated in
# gensim 3.x and removed in gensim 4.0.
model_ted.wv.most_similar("rechargable", topn=50)

  """Entry point for launching an IPython kernel.


[('rechargables', 0.9797037839889526),
 ('rechargeable', 0.9658107757568359),
 ('rechargeables', 0.9628560543060303),
 ('chargeable', 0.9504367113113403),
 ('rechargeability', 0.9483517408370972),
 ('recharges', 0.9434765577316284),
 ('recharger', 0.9247744083404541),
 ('recharge', 0.9190071821212769),
 ('recharged', 0.911583662033081),
 ('remarkable', 0.8807905912399292),
 ('capable', 0.8802489042282104),
 ('recharging', 0.877159833908081),
 ('interchangeable', 0.873866856098175),
 ('useable', 0.8725010752677917),
 ('disposable', 0.8692629933357239),
 ('unable', 0.8608207702636719),
 ('dependable', 0.8604360818862915),
 ('valuable', 0.8569726943969727),
 ('common', 0.8566673994064331),
 ('understandable', 0.8533669710159302),
 ('detachable', 0.8516848087310791),
 ('disable', 0.8502037525177002),
 ('reputable', 0.849909782409668),
 ('convenience', 0.849128007888794),
 ('invaluable', 0.8467190265655518),
 ('available', 0.8457227945327759),
 ('acceptable', 0.844489574432373),
 ('charm', 

In [20]:
# Cosine similarity via the keyed vectors; model.similarity is deprecated
# in gensim 3.x and removed in gensim 4.0.
model_ted.wv.similarity("price", "sale")

  """Entry point for launching an IPython kernel.


0.6238024

In [33]:
# Cosine similarity via the keyed vectors; model.similarity is deprecated
# in gensim 3.x and removed in gensim 4.0.
model_ted.wv.similarity("price", "sturdy")

  """Entry point for launching an IPython kernel.


0.6245084

In [35]:
# Cosine similarity via the keyed vectors; model.similarity is deprecated
# in gensim 3.x and removed in gensim 4.0.
model_ted.wv.similarity("rechargable", "battery")

  """Entry point for launching an IPython kernel.


0.7044879

In [24]:
# Show every review that mentions "sale", one review per line. The list is
# unpacked with * because sep only separates multiple positional arguments;
# passing the list as a single argument printed its repr on one line.
sale_reviews = [sent for sent in reviews if "sale" in str(sent)]
print(*sale_reviews, sep="\n")

['Goodbye headlamps! Every flashlight I own fits this. ( variety of name brands not just Fenix). Whenever I was in any store that had headlamps for sale, I always ventured towards them. But I could never pass that line of buying one. Thank goodness! This product works amazing.', 'These are excellent lights. We have four of them now and we each carry one on our nightly dog walk around our place here in South Texas. The three power level capability is a definite plus and the highest power can really reach out there and illuminate the area. Nice thing, too, is you can use either the C123 or the rechargeable 18560 battery. I will probably acquire a couple more when I find them on sale. Also, nice carry pouch came with mine, so check to see if that is included when you order.', "High-output flashlights have been decreasing in price, and increasing in performance the past few years. My buddy had a Surefire and I was so envious, but could not justify the three-digit price. Once the Fenix PD32