## **Importing Libraries**

In [97]:
import pandas as pd
import numpy as np
import gensim
import os
import string
import re
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from sklearn.decomposition import PCA
import plotly.express as px

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# removing punctuation mark
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [17]:
# remove \n in text
def replace_newlines(text):
    return re.sub(r'\n+', ' ', text).strip()

In [25]:
# remove stopwords
stop_words = set(stopwords.words('english'))  # Convert to set for faster lookup

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])

In [28]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# **Steps**
- **Importing Data**
- **Tokenize Data (Sentence)**
- **Remove Punctuation
- **Remove newlines**
- **Remove StopWords**
- **Convert into Simple Preprocess**

In [77]:
exclude = string.punctuation
story = []
f = open(os.path.join('data', 'Friends_Transcript.txt'),encoding='utf-8', errors='ignore')
corpus = f.read()
raw_sent = sent_tokenize(corpus)
for sent in raw_sent:
    removed_punc_newline = replace_newlines(remove_punc1(sent))
    processed_sent = simple_preprocess(remove_stopwords(removed_punc_newline.lower()))
    if processed_sent:  # Ensure the sentence isn't empty
                story.append(processed_sent)

In [78]:
print(f"Corpus lenght :  {len(corpus)}")
print(f"Number of sentences : {len(raw_sent)}")
print(f"story length : {len(story)}")

Corpus lenght :  4899189
Number of sentences : 117572
story length : 115646


In [79]:
story

[['one',
  'monica',
  'gets',
  'new',
  'roomate',
  'pilotthe',
  'uncut',
  'version',
  'written',
  'marta',
  'kauffman',
  'david',
  'crane',
  'scene',
  'central',
  'perk',
  'chandler',
  'joey',
  'phoebe',
  'monica'],
 ['monica', 'theres', 'nothing', 'tell'],
 ['hes', 'guy', 'work'],
 ['joey', 'cmon', 'youre', 'going', 'guy'],
 ['theres', 'gotta', 'something', 'wrong'],
 ['chandler', 'right', 'joey', 'nice'],
 ['hump'],
 ['hump', 'hairpiece'],
 ['phoebe', 'wait', 'eat', 'chalk'],
 ['stare', 'bemused'],
 ['phoebe', 'cause', 'dont', 'want', 'go', 'went', 'carl', 'oh'],
 ['monica', 'okay', 'everybody', 'relax'],
 ['even', 'date'],
 ['two', 'people', 'going', 'dinner', 'sex'],
 ['chandler', 'sounds', 'like', 'date'],
 ['time',
  'lapse',
  'chandler',
  'alright',
  'im',
  'back',
  'high',
  'school',
  'im',
  'standing',
  'middle',
  'cafeteria',
  'realize',
  'totally',
  'naked'],
 ['oh', 'yeah'],
 ['dream'],
 ['chandler', 'look', 'realize', 'theres', 'phone'],
 ['j

In [80]:
model = gensim.models.Word2Vec(
    window=5,
    min_count=3, # accept those sentence which have minimum 2 words
    workers=4,
)

In [81]:
model.build_vocab(story)
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(1935714, 2523405)

In [83]:
model.wv.most_similar('monica')

[('rachel', 0.9022611975669861),
 ('joey', 0.8894322514533997),
 ('phoebe', 0.8859913945198059),
 ('chandler', 0.8694720268249512),
 ('ross', 0.8503302931785583),
 ('richard', 0.8176605105400085),
 ('janice', 0.8173155188560486),
 ('director', 0.7943333983421326),
 ('charlie', 0.793239176273346),
 ('prepares', 0.7930054664611816)]

In [84]:
model.wv.doesnt_match(['monica', 'ross', 'rachel', 'god'])

'god'

In [86]:
model.wv['god']

array([ 0.12647986,  0.6324237 ,  0.23100482,  0.27130836, -0.54711837,
       -0.5107276 , -0.19341524,  0.53490585, -0.26771402, -0.992168  ,
        0.4614105 , -0.48921087, -0.11243579, -0.15063037,  0.03743075,
       -0.41749555, -0.75178134, -0.29678714,  0.87452304,  0.27141798,
        0.26156974,  0.629999  ,  0.70012945, -0.49424198, -0.13978851,
       -0.15837479, -0.02515627,  0.1013075 , -1.2913133 , -0.10765205,
        0.38299158, -0.41049555, -0.05912616, -0.64592266,  0.49805903,
        0.5199852 , -0.34234056, -0.51566625, -0.5690701 , -1.2143178 ,
       -0.7700002 ,  0.27824387, -0.67997146, -0.5456082 ,  1.251073  ,
        0.14815335, -0.2833409 ,  0.04296072,  1.0374486 , -0.06136849,
        0.19226089,  0.5729111 , -0.4096994 , -0.6800472 ,  0.93205684,
        0.20969357, -0.0200164 ,  0.33773163, -0.8448624 , -0.347364  ,
       -0.3375328 ,  0.6526577 , -0.18833412, -0.37180337, -1.1175246 ,
        0.29297027, -0.25829402, -0.1489798 , -0.59162116,  0.47

In [87]:
model.wv.similarity('monica', 'rachel')

0.90226114

In [88]:
model.wv.similarity('monica','prepares')

0.79300547

In [89]:
model.wv.get_normed_vectors()

array([[-0.11973665,  0.09132571,  0.04844841, ..., -0.06417389,
         0.03241246, -0.00636536],
       [-0.09749118,  0.09035764,  0.01769856, ..., -0.1217326 ,
         0.01883168, -0.03519715],
       [-0.04357055,  0.00378253, -0.0143761 , ..., -0.0910714 ,
         0.05450577, -0.06092392],
       ...,
       [-0.07375094,  0.02880119,  0.0659207 , ..., -0.06379627,
         0.0873814 , -0.01232543],
       [-0.133145  ,  0.15163696,  0.00059921, ..., -0.12095088,
         0.01514413,  0.0160448 ],
       [-0.08627254,  0.0618843 ,  0.0827368 , ..., -0.15616688,
        -0.02667997,  0.08258495]], dtype=float32)

In [90]:
model.wv.get_normed_vectors().shape

(8252, 100)

In [91]:
y = model.wv.index_to_key

In [92]:
y

['ross',
 'rachel',
 'monica',
 'chandler',
 'joey',
 'phoebe',
 'oh',
 'im',
 'yeah',
 'know',
 'okay',
 'well',
 'dont',
 'hey',
 'right',
 'like',
 'scene',
 'get',
 'gonna',
 'youre',
 'go',
 'thats',
 'one',
 'really',
 'look',
 'think',
 'uh',
 'yknow',
 'see',
 'mean',
 'back',
 'got',
 'want',
 'come',
 'good',
 'god',
 'cant',
 'guys',
 'going',
 'would',
 'sorry',
 'hi',
 'time',
 'little',
 'ok',
 'great',
 'hes',
 'say',
 'tell',
 'guy',
 'yes',
 'didnt',
 'door',
 'ill',
 'something',
 'room',
 'could',
 'shes',
 'love',
 'thing',
 'starts',
 'looks',
 'rachels',
 'take',
 'still',
 'apartment',
 'wait',
 'make',
 'way',
 'maybe',
 'said',
 'phone',
 'us',
 'umm',
 'mr',
 'enters',
 'man',
 'two',
 'theres',
 'whats',
 'around',
 'much',
 'never',
 'entering',
 'ive',
 'sure',
 'big',
 'wanna',
 'thank',
 'goes',
 'wow',
 'away',
 'first',
 'believe',
 'people',
 'need',
 'looking',
 'thought',
 'talk',
 'please',
 'lets',
 'give',
 'walks',
 'let',
 'woman',
 'joeys',
 'e

# **Reducing dimension to see visual graph**

In [94]:
pca = PCA(n_components=3)
X = pca.fit_transform(model.wv.get_normed_vectors())

In [95]:
X.shape

(8252, 3)

In [98]:
fig = px.scatter_3d(X[:100], x = 0, y=1, z=2, color=y[:100])
fig.show()