# Bag of Words

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({"text":["The dog makes child happy",
                           "The child makes dog happy",
                           "What makes child happy?",
                           "What makes dog happy?"],
                   "output":[1,1,0,0]})
df

Unnamed: 0,text,output
0,The dog makes child happy,1
1,The child makes dog happy,1
2,What makes child happy?,0
3,What makes dog happy?,0


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [4]:
bag_of_words = cv.fit_transform(df["text"])

In [6]:
# to print vocabulary
print(cv.vocabulary_)

{'the': 4, 'dog': 1, 'makes': 3, 'child': 0, 'happy': 2, 'what': 5}


In [9]:
bag_of_words.toarray()

array([[1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 1],
       [0, 1, 1, 1, 0, 1]])

In [10]:
print(bag_of_words.toarray())

[[1 1 1 1 1 0]
 [1 1 1 1 1 0]
 [1 0 1 1 0 1]
 [0 1 1 1 0 1]]


In [13]:
df['text'][2]

'What makes child happy?'

In [14]:
cv.transform([df['text'][2]]).toarray()

array([[1, 0, 1, 1, 0, 1]])

# N-grams

In [15]:
df = pd.DataFrame({"text":["The dog makes child happy",
                           "The child makes dog happy",
                           "What makes child happy?",
                           "What makes dog happy?"],
                   "output":[1,1,0,0]})

In [23]:
#BI gram
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

In [17]:
bag_of_words = cv.fit_transform(df["text"])

In [18]:
# to print vocabulary
print(cv.vocabulary_)

{'the dog': 7, 'dog makes': 3, 'makes child': 4, 'child happy': 0, 'the child': 6, 'child makes': 1, 'makes dog': 5, 'dog happy': 2, 'what makes': 8}


In [19]:
bag_of_words.toarray()

array([[1, 0, 0, 1, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 1]])

In [20]:
print(bag_of_words.toarray())

[[1 0 0 1 1 0 0 1 0]
 [0 1 1 0 0 1 1 0 0]
 [1 0 0 0 1 0 0 0 1]
 [0 0 1 0 0 1 0 0 1]]


In [21]:
df['text'][2]

'What makes child happy?'

In [22]:
cv.transform([df['text'][2]]).toarray()

array([[1, 0, 0, 0, 1, 0, 0, 0, 1]])

In [24]:
# Ti gram
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(3,3))

In [25]:
bag_of_words = cv.fit_transform(df["text"])

In [26]:
# to print vocabulary
print(cv.vocabulary_)

{'the dog makes': 5, 'dog makes child': 1, 'makes child happy': 2, 'the child makes': 4, 'child makes dog': 0, 'makes dog happy': 3, 'what makes child': 6, 'what makes dog': 7}


In [27]:
bag_of_words.toarray()

array([[0, 1, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 1]])

In [28]:
print(bag_of_words.toarray())

[[0 1 1 0 0 1 0 0]
 [1 0 0 1 1 0 0 0]
 [0 0 1 0 0 0 1 0]
 [0 0 0 1 0 0 0 1]]


In [29]:
df['text'][2]

'What makes child happy?'

In [30]:
cv.transform([df['text'][2]]).toarray()

array([[0, 0, 1, 0, 0, 0, 1, 0]])

# TFIDF (Term frequency - inverse document frequency)

In [32]:
df = pd.DataFrame({"text":["The dog makes child happy",
                           "The child makes dog happy",
                           "What makes child happy?",
                           "What makes dog happy?"],
                   "output":[1,1,0,0]})
df

Unnamed: 0,text,output
0,The dog makes child happy,1
1,The child makes dog happy,1
2,What makes child happy?,0
3,What makes dog happy?,0


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid= TfidfVectorizer()

In [40]:
vector = tfid.fit_transform(df['text']).toarray()
vector

array([[0.45349057, 0.45349057, 0.37075826, 0.37075826, 0.56015108,
        0.        ],
       [0.45349057, 0.45349057, 0.37075826, 0.37075826, 0.56015108,
        0.        ],
       [0.50881901, 0.        , 0.41599288, 0.41599288, 0.        ,
        0.6284927 ],
       [0.        , 0.50881901, 0.41599288, 0.41599288, 0.        ,
        0.6284927 ]])

In [41]:
print(tfid.idf_)

[1.22314355 1.22314355 1.         1.         1.51082562 1.51082562]


# Word2Vec

In [42]:
import gensim
import os

In [43]:
!pip install --upgrade gensim --user




In [44]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [46]:
data = "/content/simpsons_dataset.csv"


In [53]:
# Load the CSV file into a DataFrame
df = pd.read_csv('/content/data.csv')
df

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
...,...,...
158309,Miss Hoover,I'm back.
158310,Miss Hoover,"You see, class, my Lyme disease turned out to ..."
158311,Miss Hoover,Psy-cho-so-ma-tic.
158312,Ralph Wiggum,Does that mean you were crazy?


In [57]:
df.isnull().sum()

Unnamed: 0,0
raw_character_text,17814
spoken_words,26459


In [58]:
df.dropna(inplace=True)

In [59]:
df.isnull().sum()

Unnamed: 0,0
raw_character_text,0
spoken_words,0


In [60]:
df.duplicated().sum()

5207

In [61]:
df.drop_duplicates(inplace=True)

In [62]:
df.duplicated().sum()

0

In [63]:
df.shape

(126646, 2)

In [66]:
# Replace 'your_text_column' with the name of the column containing text
text_data = df['spoken_words'].tolist()

# Initialize an empty list to hold the sentences
sentence = []

# Process each text entry from the CSV
for corpus in text_data:
    raw_sent = sent_tokenize(corpus)  # Tokenize text into sentences
    for sent in raw_sent:
        sentence.append(simple_preprocess(sent))  # Preprocess and add to the list

sentence[:3]


[['no', 'actually', 'it', 'was', 'little', 'of', 'both'],
 ['sometimes',
  'when',
  'disease',
  'is',
  'in',
  'all',
  'the',
  'magazines',
  'and',
  'all',
  'the',
  'news',
  'shows',
  'it',
  'only',
  'natural',
  'that',
  'you',
  'think',
  'you',
  'have',
  'it'],
 ['where', 'mr', 'bergstrom']]

In [67]:
len(sentence)

204901

In [71]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [72]:
model.build_vocab(sentence)

In [73]:
model.train(sentence, total_examples=model.corpus_count, epochs=model.epochs)


(4738605, 6172800)

In [76]:
model.wv.most_similar('magazines')

[('conversion', 0.9214313626289368),
 ('sheets', 0.9124067425727844),
 ('players', 0.9050449728965759),
 ('ether', 0.9050155878067017),
 ('cane', 0.9039900302886963),
 ('nina', 0.902400553226471),
 ('vast', 0.9021453261375427),
 ('stains', 0.9018569588661194),
 ('lovers', 0.9012829065322876),
 ('germs', 0.9011884331703186)]

In [77]:
model.wv.similarity('natural','disease')

0.87638426

In [78]:
model.wv['deep'].shape

(100,)

In [80]:
vec = model.wv.get_normed_vectors()
vec

array([[-0.04367295,  0.19382648, -0.00710748, ...,  0.13561474,
         0.10295776,  0.03140843],
       [-0.01502088,  0.21162169,  0.08084115, ...,  0.1329336 ,
        -0.13974455, -0.00045861],
       [ 0.09317227,  0.02614755, -0.05969175, ..., -0.0312504 ,
         0.15366982, -0.12815921],
       ...,
       [ 0.13484345, -0.1489134 ,  0.04130427, ...,  0.00352725,
         0.08982968, -0.02108248],
       [-0.05007254,  0.00080226, -0.09245366, ..., -0.12916182,
         0.00122624, -0.04286993],
       [-0.06972565,  0.1579368 ,  0.10659361, ...,  0.0124564 ,
         0.17923728, -0.08780779]], dtype=float32)

In [81]:

model.wv.get_normed_vectors().shape

(22541, 100)

In [82]:

y = model.wv.index_to_key

In [83]:
len(y)

22541

In [84]:

y

['you',
 'the',
 'to',
 'it',
 'and',
 'that',
 'of',
 'in',
 'my',
 'we',
 'is',
 'this',
 'me',
 'your',
 'for',
 'what',
 're',
 'oh',
 'on',
 'can',
 'have',
 'but',
 'be',
 'no',
 'don',
 'all',
 'just',
 'with',
 'do',
 'are',
 'll',
 'well',
 'now',
 'not',
 'like',
 'so',
 'he',
 'was',
 'get',
 'here',
 'one',
 'there',
 'know',
 've',
 'up',
 'at',
 'out',
 'they',
 'hey',
 'if',
 'homer',
 'right',
 'go',
 'got',
 'how',
 'bart',
 'our',
 'about',
 'let',
 'from',
 'who',
 'good',
 'see',
 'uh',
 'as',
 'will',
 'think',
 'yeah',
 'want',
 'man',
 'why',
 'an',
 'look',
 'marge',
 'gonna',
 'back',
 'little',
 'dad',
 'time',
 'some',
 'when',
 'him',
 'could',
 'did',
 'come',
 'us',
 'okay',
 'take',
 'never',
 'his',
 'simpson',
 'say',
 'lisa',
 'she',
 'where',
 'make',
 'would',
 'or',
 'by',
 'more',
 'really',
 'been',
 'two',
 'yes',
 'her',
 'love',
 'going',
 'has',
 'way',
 'down',
 'off',
 'only',
 'then',
 'something',
 'too',
 'am',
 'were',
 'boy',
 'mr',
 'p

In [85]:
from sklearn.decomposition import PCA

In [86]:

pca = PCA(n_components=3)

In [87]:

X = pca.fit_transform(model.wv.get_normed_vectors())

In [88]:

X

array([[ 8.1867570e-01,  1.7170604e-02, -1.3848767e-03],
       [ 5.1030743e-01,  5.0984520e-01,  2.7120915e-01],
       [ 7.7053320e-01,  1.2716520e-01,  1.4616109e-01],
       ...,
       [ 6.6000366e-01, -1.5335263e-01,  1.0981601e-02],
       [ 1.6197681e-02, -1.6847283e-01, -6.7775279e-02],
       [ 8.5070789e-02, -5.6729093e-04, -1.2107987e-02]], dtype=float32)

In [89]:
X.shape

(22541, 3)

In [90]:
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()