In [1]:
! pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading reportlab-4.2.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet, reportlab
Successfully installed chardet-5.2.0 reportlab-4.2.0


In [2]:
import numpy as np 
import pandas as pd
import spacy
from nltk.corpus import stopwords
nlp = spacy.load("en_core_web_sm")
from gensim.models.fasttext import FastText
from gensim.models.fasttext import load_facebook_model
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from tabulate import tabulate
from gensim.models import Word2Vec
import random
import os
import re

### Download Pretrained Model

In [3]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
! gunzip "cc.en.300.bin.gz"

--2024-04-19 12:08:11--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.78.72, 18.164.78.121, 18.164.78.81, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.78.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: 'cc.en.300.bin.gz'


2024-04-19 12:08:27 (273 MB/s) - 'cc.en.300.bin.gz' saved [4503593528/4503593528]



### Access the 'text' column, which contains the tips in the Yelp dataset

In [4]:
# The tips file is read with lines=True (one JSON record per line); only the
# free-text 'text' column is needed for the embedding experiments.
tips_path = '/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json'
tips_df = pd.read_json(tips_path, lines=True)
tips_text = tips_df['text']

In [5]:
# Preview the first few tips to sanity-check the loaded text column.
print(tips_text.head())

0                       Avengers time with the ladies.
1    They have lots of good deserts and tasty cuban...
2               It's open even when you think it isn't
3                            Very decent fried chicken
4               Appetizers.. platter special for lunch
Name: text, dtype: object


In [6]:
# Size of the full tips column before sub-sampling.
tips_text.shape

(908915,)

In [7]:
# Keep only the first 10,000 tips as the working corpus.
corpus = tips_text.iloc[:10000]

In [8]:
# Longest tip in the corpus, measured in whitespace-separated tokens.
tokens_per_tip = [len(tip.split()) for tip in corpus]
max_length = max(tokens_per_tip)
max_length

98

In [9]:
# Lazily-built English stop-word set, shared by every clean_text() call.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def clean_text(document):
    """Normalise one raw document into a list of cleaned word tokens.

    Steps, in order: lemmatise with spaCy, lowercase, strip links and e-mail
    addresses, strip punctuation, strip digits, strip non-ASCII characters
    (emojis included), then drop English stop words and every word of three
    characters or fewer.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Lemmatise every token (reduce it to its dictionary form) and rebuild a
    # lowercase sentence so the regex passes below operate on plain text.
    doc = nlp(document)
    cleaned_doc = ' '.join(token.lemma_ for token in doc).lower()
    # Remove links and emails
    cleaned_doc = re.sub(r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', '', cleaned_doc)
    # Remove anything except word characters and whitespace
    cleaned_doc = re.sub(r'[^\w\s]', '', cleaned_doc)
    # Remove numbers
    cleaned_doc = re.sub(r'[0-9]', '', cleaned_doc)
    # Remove emojis and non ascii characters
    cleaned_doc = re.sub(r'[^\x00-\x7F]+', '', cleaned_doc)
    # One pass: drop stop words and short words (length <= 3). The stop-word
    # set is cached so it is not rebuilt for every document in the corpus.
    stop_words = _english_stop_words()
    return [
        word
        for word in cleaned_doc.split()
        if word not in stop_words and len(word) > 3
    ]

In [10]:
# Clean every tip in the corpus; the result is one token list per tip.
# NOTE(review): this rebinds the name `clean_text` from the function to the
# list of cleaned documents, so the cell cannot be re-run without first
# re-running the cell that defines the function.
clean_text = list(map(clean_text, corpus))

In [11]:
# Tokens kept from the first tip after cleaning.
clean_text[0]

['avenger', 'time', 'lady']

### Train FastText Model

In [12]:
def train_Fasttext(sentences,embedding_size,window_size,min_word,down_sampling,epochs,Save_model_filename):
    """Train a skip-gram FastText model on ``sentences`` and save it to disk.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenised training documents.
    embedding_size : int
        Dimensionality of the word vectors.
    window_size : int
        Context window passed to the model as ``window``.
    min_word : int
        Words with a total frequency below this value are ignored.
    down_sampling : float
        Threshold above which higher-frequency words are randomly down-sampled.
    epochs : int
        Number of passes over the corpus.
    Save_model_filename : str
        Path the trained gensim model is written to.

    Returns
    -------
    None
        The trained model is only persisted, not returned.
    """
    training_options = dict(
        vector_size=embedding_size,
        window=window_size,
        min_count=min_word,
        sample=down_sampling,
        workers=4,  # number of training threads
        sg=1,       # 1 selects skip-gram; any other value selects CBOW
        epochs=epochs,
    )
    model = FastText(sentences, **training_options)

    # Persist the model in gensim's native format.
    model.save(Save_model_filename)

In [13]:
# Hyper-parameters for the custom FastText model.
embedding_size = 300   # vector dimensionality
window_size = 3        # context window
min_word = 1           # minimum word frequency kept in the vocabulary
down_sampling = 1e-2   # down-sampling threshold for frequent words
epochs=300             # passes over the corpus

# Train on the cleaned corpus and write the model to "Custom_FastText".
train_Fasttext(
    sentences=clean_text,
    embedding_size=embedding_size,
    window_size=window_size,
    min_word=min_word,
    down_sampling=down_sampling,
    epochs=epochs,
    Save_model_filename="Custom_FastText",
)

In [14]:
# Reload the trained model with the class that produced it (FastText rather
# than its Word2Vec base class), using the same relative path it was saved to.
fast_text_model = FastText.load("Custom_FastText")

In [15]:
# Spot-check the custom model: ten nearest neighbours of the word "else".
print(fast_text_model.wv.most_similar(positive=["else"], topn=10))

[('chelse', 0.6487493515014648), ('kelsea', 0.4848681390285492), ('somewhere', 0.48241910338401794), ('soho', 0.48173123598098755), ('dazs', 0.47635185718536377), ('pulse', 0.4740001857280731), ('haagen', 0.47257331013679504), ('elephant', 0.47225841879844666), ('anywhere', 0.4679949879646301), ('false', 0.46713611483573914)]


In [16]:
# All words in the custom model's vocabulary, used later to draw random samples.
custom_vocabulary = fast_text_model.wv.key_to_index
words = list(custom_vocabulary.keys())

### Pretrained Model

In [17]:
# Load the pretrained Facebook fastText binary (cc.en.300) into gensim.
# NOTE(review): the path assumes the download cell ran with /kaggle/working as
# the current directory - confirm when running outside Kaggle.
pretrained_facebook = load_facebook_model('/kaggle/working/cc.en.300.bin')

### Results for the custom model and the pretrained model

In [43]:
def create_pdf(tables, filename="word_analysis.pdf"):
    """Render a sequence of ReportLab flowables into a letter-size PDF.

    Parameters
    ----------
    tables : list
        Flowables (tables, spacers, ...) to place in the document, in order.
    filename : str, optional
        Output path of the PDF; defaults to ``"word_analysis.pdf"``.
    """
    doc = SimpleDocTemplate(filename, pagesize=letter)
    # build() consumes the list it is given, so hand it a copy and leave the
    # caller's list intact for inspection or a second render.
    doc.build(list(tables))
    
# Number of vocabulary words to analyse in the report.
N_REPORT_WORDS = 10

# The styling is the same for every word table, so build it once.
report_table_style = TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('FONTSIZE', (0, 0), (-1, 0), 14),
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
    ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
    ('FONTSIZE', (0, 1), (-1, -1), 12),
    ('LEFTPADDING', (0, 1), (-1, -1), 12),
    ('RIGHTPADDING', (0, 1), (-1, -1), 12),
    ('BOTTOMPADDING', (0, 1), (-1, -1), 12),
])


def _neighbour_section(title, header, neighbours):
    """Return the rows of one report section: title, column header, entries, blank spacer row."""
    return [[title], header, *neighbours, ['']]


tables = []
# random.sample draws distinct words without removing anything from `words`,
# so the vocabulary list is left intact and this cell can be re-run safely.
sampled_words = random.sample(words, k=min(N_REPORT_WORDS, len(words)))
for word in sampled_words:
    print(f"Analyzing word: {word}\n")

    ######## custom model ########
    similar_words_custom = fast_text_model.wv.most_similar(positive=[word], topn=10)
    # negative=[word] ranks the vocabulary against the negated word vector.
    opposite_words_custom = fast_text_model.wv.most_similar(negative=[word], topn=10)

    ######## pretrained model ########
    similar_words_pretrained = pretrained_facebook.wv.most_similar(positive=[word], topn=10)
    opposite_words_pretrained = pretrained_facebook.wv.most_similar(negative=[word], topn=10)

    # Assemble the PDF table: a one-cell title row followed by four sections.
    data = [['Analyzing word: ' + word]]
    data += _neighbour_section('Top 10 similar words (custom model):',
                               ['Similar Word', 'Similarity'],
                               similar_words_custom)
    data += _neighbour_section('Top 10 dissimilar words (custom model):',
                               ['Dissimilar Word', 'Dissimilarity'],
                               opposite_words_custom)
    data += _neighbour_section('Top 10 similar words (pre-trained model):',
                               ['Similar Word', 'Similarity'],
                               similar_words_pretrained)
    data += _neighbour_section('Top 10 dissimilar words (pre-trained model):',
                               ['Dissimilar Word', 'Dissimilarity'],
                               opposite_words_pretrained)

    table = Table(data)
    table.setStyle(report_table_style)

    tables.append(table)
    tables.append(Spacer(1, 20))

    # Plain-text (GitHub-markdown) renderings of the same four lists; after the
    # loop these hold the tables for the last analysed word. They are not
    # printed or written to the PDF here.
    table_custom_similar = tabulate(similar_words_custom, headers=['Similar Word', 'Similarity'], tablefmt='github')
    table_custom_opposite = tabulate(opposite_words_custom, headers=['Opposite Word', 'Similarity'], tablefmt='github')
    table_pretrained_similar = tabulate(similar_words_pretrained, headers=['Similar Word', 'Similarity'], tablefmt='github')
    table_pretrained_opposite = tabulate(opposite_words_pretrained, headers=['Opposite Word', 'Similarity'], tablefmt='github')

create_pdf(tables)
print("PDF Generated Successfully!")

Analyzing word: cycling

Analyzing word: medallion

Analyzing word: peppercorn

Analyzing word: franklin

Analyzing word: fabulous

Analyzing word: probably

Analyzing word: accident

Analyzing word: ginger

Analyzing word: ethnic

Analyzing word: information

PDF Generated Successfully!
