In [1]:
! pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.2-py3-none-any.whl.metadata (1.4 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading reportlab-4.2.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet, reportlab
Successfully installed chardet-5.2.0 reportlab-4.2.2


# **Pre-Trained Model Installation**

In [2]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
! gunzip "cc.en.300.bin.gz"

--2024-07-22 14:31:06--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 108.158.20.120, 108.158.20.111, 108.158.20.21, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|108.158.20.120|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: 'cc.en.300.bin.gz'


2024-07-22 14:42:08 (6.49 MB/s) - 'cc.en.300.bin.gz' saved [4503593528/4503593528]



In [3]:
import os
import re
import spacy
import random
import numpy as np 
import pandas as pd
from nltk.corpus import stopwords
# Load the spaCy "en_core_web_sm" pipeline once; clean_text() calls it on every document to get lemmas.
nlp = spacy.load("en_core_web_sm")
from gensim.models.fasttext import FastText
from gensim.models.fasttext import load_facebook_model
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from tabulate import tabulate
from gensim.models import Word2Vec
import fasttext.util
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# **Access tips file only**

In [4]:
# Read the Yelp tips file (one JSON record per line) and keep only its 'text' column.
file_path = '/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json'
tips_df = pd.read_json(file_path, lines=True)
text_tips_df = tips_df['text']

# Preview the first 20 tips, numbered from 1.
for tip_number, tip in enumerate(text_tips_df.head(20), start=1):
    print(f"Text {tip_number}: {tip}")

Text 1: Avengers time with the ladies.
Text 2: They have lots of good deserts and tasty cuban sandwiches
Text 3: It's open even when you think it isn't
Text 4: Very decent fried chicken
Text 5: Appetizers.. platter special for lunch
Text 6: Chili Cup + Single Cheeseburger with onion, pickle, and relish + Vanilla Coca-Cola...so far.
Text 7: Saturday, Dec 7th 2013, ride Patco's Silver Sleigh w/ Santa & his elves on a decorated train into Center City. Trains leave from Lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. Great for kids!
Text 8: This is probably the best place in the cool Springs area to watch a game and eat
Text 9: Tacos
Text 10: Starbucks substitute in boring downtown Tampa. Ugh. Never again!
Text 11: Order the Tortilla Soup
Text 12: Very good will definitely be coming back!!
Text 13: If the Hotlight is on you must stop in.
Text 14: Let's go Yankees!
Text 15: Basically same food as rally's for $5 more
Text 16: Don't go for dinner. They close at 6. Really Yvonne L?

In [5]:
# Shape of the tips text Series: a 1-tuple holding the number of tips loaded.
text_tips_df.shape

(908915,)

In [6]:
# Work on the first 10,000 tips only.
corpus = text_tips_df.head(10000)
# Length of the longest tip in that sample, counted in whitespace-separated words.
max_length = max(map(lambda tip: len(tip.split()), corpus))
max_length

98

# **Preprocessing**

In [7]:
# Filled on first use so the NLTK stop-word list is loaded once instead of once per document.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on the first call only."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def clean_text(document):
    """Turn one raw text into a list of cleaned, lemmatized tokens.

    Processing order:
      1. Lemmatize every token with the spaCy pipeline ``nlp`` and join with spaces.
      2. Lower-case the result.
      3. Remove links (``http...``, ``www...``) and e-mail addresses.
      4. Remove every character that is not a word character or whitespace.
      5. Remove digits.
      6. Remove non-ASCII characters (this also strips emojis).
      7. Drop English stop words and any word of three characters or fewer.

    Args:
        document: The raw text to clean.

    Returns:
        A list of the remaining words, in their original order. The list is
        empty when nothing survives the filters.
    """
    # Lemmatize first, on the untouched text, then normalise case.
    cleaned_doc = ' '.join(token.lemma_ for token in nlp(document)).lower()
    # Remove links and emails
    cleaned_doc = re.sub(r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', '', cleaned_doc)
    # Remove anything except word characters and whitespace
    cleaned_doc = re.sub(r'[^\w\s]', '', cleaned_doc)
    # Remove numbers
    cleaned_doc = re.sub(r'[0-9]', '', cleaned_doc)
    # Remove emojis and non-ASCII characters
    cleaned_doc = re.sub(r'[^\x00-\x7F]+', '', cleaned_doc)

    stop_words = _english_stop_words()
    # One pass applies both filters: not a stop word, and longer than three characters.
    return [
        word
        for word in cleaned_doc.split()
        if word not in stop_words and len(word) > 3
    ]

In [8]:
# NOTE(review): this rebinds the name `clean_text` from the cleaning function to the list of
# token lists, so this cell cannot be run a second time without first re-running the cell that
# defines the function. Later cells read the token lists under this same name.
clean_text = [clean_text(sentence) for sentence in corpus]

In [9]:
# Show the cleaned tokens of the first tip as a quick sanity check.
clean_text[0]

['avenger', 'time', 'lady']

# **Custom FastText Model**

In [10]:
def train_Fasttext(sentences,embedding_size,window_size,min_word,down_sampling,epochs,Save_model_filename):
    """Train a gensim FastText model on ``sentences`` and save it to disk.

    Args:
        sentences: Tokenized corpus handed directly to ``FastText``.
        embedding_size: Forwarded as ``vector_size``.
        window_size: Forwarded as ``window``.
        min_word: Forwarded as ``min_count``.
        down_sampling: Forwarded as ``sample``.
        epochs: Forwarded as ``epochs``.
        Save_model_filename: Path passed to the model's ``save`` method.

    Returns:
        None. The trained model is only written to ``Save_model_filename``.
    """
    # workers and sg are fixed here rather than exposed as parameters.
    hyperparameters = {
        "vector_size": embedding_size,
        "window": window_size,
        "min_count": min_word,
        "sample": down_sampling,
        "workers": 4,
        "sg": 1,
        "epochs": epochs,
    }
    trained_model = FastText(sentences, **hyperparameters)

    # Persist the gensim FastText model.
    trained_model.save(Save_model_filename)

In [11]:
# Hyper-parameters for the custom FastText model.
embedding_size = 300
window_size = 3
min_word = 1
down_sampling = 1e-2
epochs = 300

# Train on the cleaned token lists and save the model under "Custom_FastText".
train_Fasttext(
    sentences=clean_text,
    embedding_size=embedding_size,
    window_size=window_size,
    min_word=min_word,
    down_sampling=down_sampling,
    epochs=epochs,
    Save_model_filename="Custom_FastText",
)

In [12]:
# Load with the class that produced the file (FastText rather than Word2Vec), using the
# same relative path the model was saved under so the load does not depend on a
# hard-coded absolute directory.
custom_model = FastText.load("Custom_FastText")

# **Compare the 10 most similar words for a specific word (Custom vs. Pre-trained)**

In [13]:
# Ten nearest neighbours of "important" in the custom model, as (word, similarity) pairs.
print(custom_model.wv.most_similar(positive=["important"], topn=10))

[('importantly', 0.8232631683349609), ('import', 0.7516825199127197), ('elegant', 0.5730699300765991), ('porta', 0.571188747882843), ('infant', 0.5711729526519775), ('abundant', 0.5487837195396423), ('zioski', 0.5413228273391724), ('portabello', 0.5408158898353577), ('vonnegut', 0.5390135049819946), ('attendant', 0.5304132699966431)]


In [14]:
# Load the pre-trained fastText binary (cc.en.300.bin, decompressed from the downloaded .gz).
# NOTE(review): assumes the download cell ran with /kaggle/working as the working directory — confirm.
pretrained_facebook = load_facebook_model('/kaggle/working/cc.en.300.bin')

In [15]:
# Ten nearest neighbours of "important" in the pre-trained model, for comparison with the custom one.
print(pretrained_facebook.wv.most_similar(positive=["important"], topn=10))

[('vitally', 0.8139428496360779), ('crucial', 0.7995947599411011), ('imporant', 0.7556751370429993), ('vital', 0.7363570928573608), ('improtant', 0.6904195547103882), ('essential', 0.6811777353286743), ('imortant', 0.654327392578125), ('importnat', 0.652055561542511), ('impotant', 0.6465452909469604), ('important.What', 0.6370490193367004)]


In [16]:
from gensim.models import KeyedVectors
from prettytable import PrettyTable

In [17]:
def get_similar_words(model, word, topn=10):
    """Query a model's word vectors for the neighbours of ``word`` in both directions.

    Args:
        model: Object exposing ``wv.most_similar``.
        word: The query word.
        topn: Number of results requested from each query (default 10).

    Returns:
        A 2-tuple ``(similar, least_similar)``: the result of
        ``most_similar(positive=[word])`` followed by the result of
        ``most_similar(negative=[word])``.
    """
    vectors = model.wv
    nearest = vectors.most_similar(positive=[word], topn=topn)
    farthest = vectors.most_similar(negative=[word], topn=topn)
    return nearest, farthest


In [18]:
def print_similarity_table(word, similar_words, least_similar_words):
    """Print a two-column table pairing similar and least-similar words for ``word``.

    Each cell is rendered as ``"<word> (<score to 4 decimals>)"``. One row is
    produced per entry of ``similar_words``; the matching entry of
    ``least_similar_words`` is looked up by the same position.

    Args:
        word: The query word, printed above the table.
        similar_words: Sequence of ``(word, score)`` pairs.
        least_similar_words: Sequence of ``(word, score)`` pairs, indexed in
            step with ``similar_words``.

    Returns:
        None. Output goes to stdout.
    """
    report = PrettyTable()
    report.field_names = ["Similar Words", "Least Similar Words"]

    for position, hit in enumerate(similar_words):
        left_cell = f"{hit[0]} ({hit[1]:.4f})"
        miss = least_similar_words[position]
        right_cell = f"{miss[0]} ({miss[1]:.4f})"
        report.add_row([left_cell, right_cell])

    print(f"Word: {word}")
    print(report)

# Query both models for the same word.
word = "important"
similar_words_custom, least_similar_words_custom = get_similar_words(custom_model, word)
similar_words_facebook, least_similar_words_facebook = get_similar_words(pretrained_facebook, word)

# Print one table per model, each under its own heading.
for heading, similar, least_similar in (
    ("Custom Model", similar_words_custom, least_similar_words_custom),
    ("\nPre-trained Facebook Model", similar_words_facebook, least_similar_words_facebook),
):
    print(heading)
    print_similarity_table(word, similar, least_similar)

Custom Model
Word: important
+----------------------+---------------------+
|    Similar Words     | Least Similar Words |
+----------------------+---------------------+
| importantly (0.8233) |   choose (-0.0887)  |
|   import (0.7517)    |    type (-0.0932)   |
|   elegant (0.5731)   |   flavor (-0.0933)  |
|    porta (0.5712)    |    full (-0.0998)   |
|   infant (0.5712)    |   snack (-0.1144)   |
|  abundant (0.5488)   |   office (-0.1169)  |
|   zioski (0.5413)    |  official (-0.1179) |
| portabello (0.5408)  |   leave (-0.1188)   |
|  vonnegut (0.5390)   |  tonight (-0.1189)  |
|  attendant (0.5304)  |    bear (-0.1222)   |
+----------------------+---------------------+

Pre-trained Facebook Model
Word: important
+-------------------------+-----------------------------+
|      Similar Words      |     Least Similar Words     |
+-------------------------+-----------------------------+
|     vitally (0.8139)    |        Star4 (0.2459)       |
|     crucial (0.7996)    |        Du