## FastText

In [1]:
import pandas as pd
import zipfile
import glob
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import contractions  # to expand contracted words
from gensim.models import FastText
import re

lemmatizer = WordNetLemmatizer()  # lemmatization

In [2]:
# to extract the zip file
with zipfile.ZipFile("dataset.zip","r") as zp:
    zp.extractall("./")

In [2]:
files = glob.glob("main_product/*.csv")  # get all csvs in a list

In [3]:
# read all the CSVs
df = None
for file in files:
    frame = pd.read_csv(file, index_col=False, usecols=["reviews"])
    df = frame if df is None else pd.concat([df,frame])
del frame  # del frame object to free memory
print(f"size of dataset: {df.shape}")

size of dataset: (11790, 1)


## Processing
1. extract all the reviews
2. preprocess them by<br>
   a. removing punctuations <br>
   b. removing special characters <br>
   c. converting all reviews to lower case.

In [4]:
reviews = df["reviews"].to_list()

In [5]:
all_stopwords = stopwords.words('english')  # nltk stopwords list
# custom stop words
# my_stopwords = ['i','me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", 
#                 "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
#                 'she', "she's", 'her', 'hers', 'herself', 'them', 'their', 'theirs', 'themselves', 'who', 'whom',
#                 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
#                 'did', 'doing', 'a', 'an', 'the', 'and', 's', 't', 'd', 'll', 'm', 'o', 're', 've', 'y', 'in', 'ma', 'it'
#                ]

In [16]:
# process text
def preprocess_text(text):
    text = re.sub("[^a-zA-Z']"," ",str(text))  # remove digits by replacing with a space
    text = re.sub(' +', ' ',text)  # remove multiple spaces
    text = text.replace(" ' ","'")  # make sure
    text = contractions.fix(text)  # expand the contracted words
    text_tokens = text.lower().split()
    words = [lemmatizer.lemmatize(word) for word in text_tokens if (not word in all_stopwords) and (len(word) > 2)]  # apply lemmatization
    return words

In [68]:
list(map(preprocess_text, ["This is how it works. #and that it behaves."]))

[['work', 'behaves']]

In [17]:
processed_text = list(map(preprocess_text, reviews))

In [18]:
processed_text

[['bought',
  'light',
  'wife',
  'visually',
  'impaired',
  'peripheral',
  'vision',
  'light',
  'gathering',
  'ability',
  'attach',
  'light',
  'white',
  'cane',
  'help',
  'see',
  'seen',
  'walking',
  'night',
  'light',
  'bright',
  'enough',
  'beam',
  'perfect',
  'need',
  'however',
  'started',
  'problem',
  'light',
  'switch',
  'right',
  'box',
  'since',
  'light',
  'important',
  'wife',
  'safety',
  'buying',
  'steamlight',
  'size',
  'update',
  'ordered',
  'light',
  'got',
  'switch',
  'figured',
  'quit',
  'working',
  'last',
  'week',
  'le',
  'year',
  'use',
  'guess',
  'would',
  'upgrade',
  'three',
  'star',
  'worked',
  'well',
  'wife',
  'almost',
  'year',
  'bright',
  'larger',
  'beam',
  'radius',
  'perfect',
  'need'],
 ['got',
  'keep',
  'glovebox',
  'built',
  'well',
  'take',
  'beating',
  'bright',
  'enough',
  'need',
  'well',
  'rechargeable',
  'battery',
  'say',
  'worth'],
 ['nice',
  'quality',
  'light',
 

## Train the model

#### Parameters:

1. size: Dimension of the word vectors. <br>
2. window: number of words considered before and after the target word.<br>
3. min_count: It discards the words with frequency less than the min_count value. Hence, it prevents the rare words taken into    consideration and also helps to reduce the memory.

### 1. Window = 5

In [41]:
model_win5 = FastText(processed_text, size=100, window=5, min_count=5, workers=4,sg=1)

In [47]:
model_win5.most_similar("price")

  """Entry point for launching an IPython kernel.


[('pricey', 0.941415548324585),
 ('pricy', 0.9244195818901062),
 ('priced', 0.9059938788414001),
 ('overpriced', 0.8302311897277832),
 ('cost', 0.7995840311050415),
 ('expensive', 0.7729748487472534),
 ('money', 0.768273651599884),
 ('inexpensive', 0.7598073482513428),
 ('pricing', 0.7509361505508423),
 ('value', 0.7498012185096741)]

In [43]:
model_win5.most_similar("quality")

  """Entry point for launching an IPython kernel.


[('reality', 0.8850806951522827),
 ('equal', 0.8124454021453857),
 ('solidity', 0.8110880851745605),
 ('reliability', 0.7787283062934875),
 ('value', 0.7751495242118835),
 ('valid', 0.7699955701828003),
 ('justify', 0.7699731588363647),
 ('functionality', 0.7599197626113892),
 ('performance', 0.7563588619232178),
 ('conscience', 0.7348688840866089)]

In [44]:
model_win5.similarity("price","quality")

  """Entry point for launching an IPython kernel.


0.58979446

In [45]:
model_win5.similarity("rechargable","battery")

  """Entry point for launching an IPython kernel.


0.6313592

### 2. Window = 2

In [48]:
model_win2 = FastText(processed_text, size=100, window=2, min_count=5, workers=4,sg=1)

In [49]:
model_win2.most_similar("price")

  """Entry point for launching an IPython kernel.


[('pricey', 0.9563625454902649),
 ('pricy', 0.9449734091758728),
 ('priced', 0.908804714679718),
 ('money', 0.8428410887718201),
 ('overpriced', 0.8400766849517822),
 ('advice', 0.828889787197113),
 ('cost', 0.8269668817520142),
 ('experience', 0.8245160579681396),
 ('ice', 0.8216390609741211),
 ('priority', 0.8215767741203308)]

In [50]:
model_win2.most_similar("quality")

  """Entry point for launching an IPython kernel.


[('reality', 0.9296325445175171),
 ('solidity', 0.8867693543434143),
 ('reliability', 0.8650084733963013),
 ('equal', 0.8587222099304199),
 ('workmanship', 0.8528103828430176),
 ('performance', 0.8450403213500977),
 ('versatility', 0.8405190706253052),
 ('utility', 0.8347275257110596),
 ('facility', 0.8331247568130493),
 ('valid', 0.8310908675193787)]

In [51]:
model_win2.similarity("price","quality")

  """Entry point for launching an IPython kernel.


0.7556686

In [52]:
model_win2.similarity("battery","rechargable")

  """Entry point for launching an IPython kernel.


0.67532223

In [25]:
print([sent for sent in reviews if "sale" in str(sent)], sep="\n")

["The product is valid (5 stars for him), well done I recommend it. Excellent delivery times. The description mentions 2 Duracell batteries included in the sales package. They weren't in mine. I would have bought it anyway but to be fair, if you don't attach them anymore, remove them from the list. Regards", 'These are excellent lights. We have four of them now and we each carry one on our nightly dog walk around our place here in South Texas. The three power level capability is a definite plus and the highest power can really reach out there and illuminate the area. Nice thing, too, is you can use either the C123 or the rechargeable 18560 battery. I will probably acquire a couple more when I find them on sale. Also, nice carry pouch came with mine, so check to see if that is included when you order.', "High-output flashlights have been decreasing in price, and increasing in performance the past few years. My buddy had a Surefire and I was so envious, but could not justify the three-di

In [27]:
print([sent for sent in reviews if "rechargable" in str(sent)], sep="\n")

["Three years and still going strong. I've carried this flashlight with me daily on my job for 3 years now and it still works just as well as the day i got it. Super bright, different modes. Its been dropped a handfull of times and srill works like a charm. i use rechargable batteries by nighthawk i think and they get the job done. Buy this flashlight!! Its a lot of moneh but you wont have to replace it and it is a great flashlight.", 'I use the light almost daily at work, school and at home. The three different modes of lighting are easy to manipulate and are very useful in their own application. For the money, this is very hard to match!\n\nPros: SMALL, relatively light, friendly controls, CLIP, brightness levels, clean beam pattern\n\nCons: Uses CR batteries, nonrechargable', "The operation of this flashlight is still pretty confusing to me...untighten and toggle through modes by off/on while tightening gives the brightest light. It goes through my rechargables like nobody's busines

### 3. Mean count = 2

In [53]:
model_count2 = FastText(processed_text, size=100, window=2, min_count=2, workers=4,sg=1)

In [55]:
model_count2.most_similar("price")

  """Entry point for launching an IPython kernel.


[('pricey', 0.9635614156723022),
 ('pricy', 0.9544072151184082),
 ('priced', 0.9134051203727722),
 ('priceless', 0.9002936482429504),
 ('ice', 0.8568904399871826),
 ('advice', 0.8550175428390503),
 ('prize', 0.8528141379356384),
 ('overpriced', 0.8510251045227051),
 ('money', 0.8455267548561096),
 ('choice', 0.844268798828125)]

In [57]:
model_count2.most_similar("rechargable")

  """Entry point for launching an IPython kernel.


[('rechargables', 0.9927813410758972),
 ('rechargeable', 0.9875750541687012),
 ('rechargeables', 0.9855902194976807),
 ('chargable', 0.9805705547332764),
 ('chargeable', 0.9755738377571106),
 ('rechargeability', 0.9659920930862427),
 ('recharges', 0.9624892473220825),
 ('recharger', 0.9468436241149902),
 ('recharge', 0.9324887990951538),
 ('recharged', 0.9240112900733948)]

In [58]:
model_count2.similarity("rechargable","battery")

  """Entry point for launching an IPython kernel.


0.7273798

In [59]:
model_count2.similarity("price","quality")

  """Entry point for launching an IPython kernel.


0.7795054

### 4. Mean count = 7

In [54]:
model_count7 = FastText(processed_text, size=100, window=2, min_count=7, workers=4,sg=1)

In [60]:
model_count7.most_similar("price")

  """Entry point for launching an IPython kernel.


[('pricey', 0.9601843357086182),
 ('pricy', 0.9456526637077332),
 ('priced', 0.9029900431632996),
 ('value', 0.8416017889976501),
 ('money', 0.834578812122345),
 ('overpriced', 0.8335549831390381),
 ('advice', 0.8191428184509277),
 ('performance', 0.8085669279098511),
 ('opinion', 0.8078505396842957),
 ('ice', 0.8042752742767334)]

In [61]:
model_count7.most_similar("quality")

  """Entry point for launching an IPython kernel.


[('reality', 0.9223815202713013),
 ('reliability', 0.8505829572677612),
 ('equal', 0.8415118455886841),
 ('performance', 0.8379296064376831),
 ('facility', 0.8334949016571045),
 ('workmanship', 0.8321267366409302),
 ('valid', 0.8308918476104736),
 ('utility', 0.8304545879364014),
 ('versatility', 0.8291388750076294),
 ('priority', 0.8214733600616455)]

In [62]:
model_count7.similarity("price","quality")

  """Entry point for launching an IPython kernel.


0.71483266

In [63]:
model_count7.similarity("rechargable","battery")

  """Entry point for launching an IPython kernel.


0.6524199

In [66]:
model_count7.similarity("battery","durability")

  """Entry point for launching an IPython kernel.


0.6397928