### Downloading and Setting Up the  NLTK WordNet Data

In [None]:
import nltk
import subprocess
try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

### Imports and Downloads

In [None]:
import re
import spacy
import gensim
import random
import fasttext
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from huggingface_hub import hf_hub_download
from gensim.models.fasttext import FastText

nltk.download('punkt')
nltk.download('stopwords')

Lemmatizer = spacy.load("en_core_web_sm")

### Defining Preprocessing Functions

In [3]:
def Clean_Text(Text):
    Cleaned_Text = re.sub(r'[^\w\s]', '', Text)
    Cleaned_Text = re.sub(r'\d+', '', Cleaned_Text)
    Cleaned_Text = Cleaned_Text.lower()
    return Cleaned_Text

def Tokenize_Text(Text):
    tokens = word_tokenize(Text)
    return tokens

def Lemmatize_Text(tokens):
    Lemmatized_Tokens = []
    for token in tokens:
        if Lemmatizer(token)[0].lemma_ != '-PRON-':
            Lemmatized_Token = Lemmatizer(token)[0].lemma_
        else:
            Lemmatized_Token = token
        Lemmatized_Tokens.append(Lemmatized_Token)
    return Lemmatized_Tokens

def Remove_Stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    custom_stop_words = ['I']
    stop_words.update(custom_stop_words)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

### Importing Yelp Dataset

In [4]:
# Path to Yelp Dataset
Yelp_Path = "/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json"

# Read Yelp Dataset
Yelp_Data = pd.read_json(Yelp_Path, lines=True)

# Extract the text column from the Yelp Dataset
Tip_Text = Yelp_Data['text']

# Limit the number of samples to 10,000
Sub_Tip_Text = Tip_Text[:10000]

### Perform Text Preprocessing on the Extracted Text from Yelp Dataset

In [5]:
Filtered_Docs = []
for i in range (len(Sub_Tip_Text)):
    Cleaned_Text = Clean_Text(Sub_Tip_Text[i])
    Tokens = Tokenize_Text(Cleaned_Text)
    Lemmatized_Tokens = Lemmatize_Text(Tokens)
    Filtered_Tokens = Remove_Stopwords(Lemmatized_Tokens)
    Filtered_Words = [word for word in Filtered_Tokens if len(word) > 2] # Remove words with less than 3 characters
    Filtered_Docs.append(Filtered_Words)

### Training and Saving the FastText Model

In [10]:
FastText_Model = FastText(sentences=Filtered_Docs, vector_size=50, window=3, min_count=3, workers=4, sg=1)
FastText_Model.save("FastText_Model")

### Loading the FastText Model

In [11]:
FastTextModel = gensim.models.fasttext.FastText.load("FastText_Model")

### Selecting 500 More Samples Randomly to Compare them With the Previous 10000 Sample

In [5]:
Preprocessed_Words = []
random.seed(42)
Sub_Test = random.sample(Tip_Text[10001:].tolist(), 500) # Randomly select 500 text entries (samples) from `Tip_Text` starting from entry 10001

# Perform text preprocessing on the sampled text entries
for i in range (len(Sub_Test)):
    Cleaned_Text = Clean_Text(Sub_Test[i])
    Tokens = Tokenize_Text(Cleaned_Text)
    Lemmatized_Tokens = Lemmatize_Text(Tokens)
    Filtered_Tokens = Remove_Stopwords(Lemmatized_Tokens)
    Filtered_Words = [word for word in Filtered_Tokens if len(word) > 2] # Remove words with less than 3 characters
    Preprocessed_Words.append(Filtered_Words)

# Flatten the list of sentences into a single list of words
Words = [word for sentence in Preprocessed_Words for word in sentence]
random.seed(42)
Test_Words = random.sample(Words, 50) # Randomly select 50 words from the list `Words`

### Using the Trained FastText Model to Determine the Top 10 Similar and Opposite Words for Each Word

In [14]:
for word in Test_Words:
    print(f"Word: {(word)}")
    
    # Retrieve the top 10 most similar words
    Similar_Words = FastTextModel.wv.most_similar(positive=word, topn=10)
    print(f"\nThe top 10 similar words for the word ({word}):\n")
    for similar_word,_ in Similar_Words:
        print(similar_word)
        
    # Calculate the similarity between the target word and all other words
    similarity_scores = {w: FastTextModel.wv.similarity(word, w) for w in Test_Words if w != word}
    
    # Sort the words based on their similarity scores and select the top 10 least similar words
    Opposite_Words = sorted(similarity_scores, key=similarity_scores.get, reverse=True)[:10]
    
    print(f"\nThe top 10 opposite words for the word ({word}):\n")
    for opposite_word in Opposite_Words:
        print(opposite_word)
    
    print()

Word: check

The top 10 similar words for the word (check):

checkin
checkout
ticket
cause
case
ill
mill
shake
milkshake
fill

The top 10 opposite words for the word (check):

phone
milk
shuffleboard
wedding
heck
para
orange
intoxicate
asparagus
accolade

Word: replace

The top 10 similar words for the word (replace):

face
race
seafood
atm
heat
always
pace
real
niece
blood

The top 10 opposite words for the word (replace):

always
beer
place
slice
breast
pizza
amazing
coffee
music
dessert

Word: gmc

The top 10 similar words for the word (gmc):

happy
ice
staff
crappy
service
nice
happen
rice
pimms
advice

The top 10 opposite words for the word (gmc):

line
slice
heck
make
take
tre
check
rude
shuffleboard
back

Word: back

The top 10 similar words for the word (back):

terrible
outback
definitely
low
attack
lack
mention
background
rude
stock

The top 10 opposite words for the word (back):

rude
coop
tre
receive
southern
festival
dessert
vegetarian
like
accolade

Word: line

The top 10

### Importing and Loading the Pre-trained Facebook FastText Model

In [None]:
model_path = hf_hub_download(repo_id="facebook/fasttext-english-nearest-neighbors", filename="model.bin")
Facebook_Model = fasttext.load_model(model_path)

### Using the Pre-trained FastText Model to Determine the Top 10 Similar and Opposite Words for Each Word

In [21]:
for word in Test_Words:
    print(f"Word: {(word)}")
    
    # Retrieve the top 10 most similar words
    Similar_Words = Facebook_Model.get_nearest_neighbors(word, k=10)
    print(f"\nThe Top 10 Similar Words for The Word '{word}':\n")
    for similarity, similar_word in Similar_Words:
        print(similar_word)
        
    Opposite_Words = Facebook_Model.get_nearest_neighbors(word, k=len(Test_Words))
    Sorted_Opposite_Words = sorted(Opposite_Words, key=lambda x: x[0])
    print(f"\nThe Top 10 Opposite Words for The Word '{word}':\n")
    for similarity, opposite_word in Sorted_Opposite_Words[:10]:
        print(f"{opposite_word}: {similarity}")
        
    print()

Word: check

The Top 10 Similar Words for The Word 'check':

checking
Check
checked
chec
re-check
chekc
recheck
double-check
ckeck
doublecheck

The Top 10 Opposite Words for The Word 'check':

checkes: 0.5088723301887512
--check: 0.5102982521057129
checking.: 0.5119532346725464
Checked: 0.5119572877883911
out.Also: 0.5133094191551208
check.And: 0.5134046673774719
.check: 0.5149261355400085
CHeck: 0.5169249176979065
check.As: 0.5181552171707153
check.For: 0.5181910395622253

Word: replace

The Top 10 Similar Words for The Word 'replace':

replacing
replaced
Replace
repalce
replacement
repace
replaces
replce
Replacing
supplant

The Top 10 Opposite Words for The Word 'replace':

.Replace: 0.48206964135169983
replace.I: 0.48231473565101624
fix: 0.4883595108985901
repla: 0.4893805980682373
replacement.The: 0.4906591475009918
re-engineer: 0.49219995737075806
re-build: 0.4923125207424164
substitute: 0.4927825629711151
dismantle: 0.49668416380882263
re-tool: 0.4967217743396759

Word: gmc

The 