In [67]:
corpus = [
    "I love NLP",
    "Machine learning is amazing",
    "I love machine learning"
]


In [68]:
#cleaning (lowercase+remove)
import re
def clean_text(text):
    text=text.lower()
    text=re.sub(r"[^a-z\s]",'',text)
    return text

In [69]:
#cleaned corpus
cleaned_corpus=[]
for sent in corpus:
    cleaned_corpus.append(clean_text(sent))

print(cleaned_corpus)


['i love nlp', 'machine learning is amazing', 'i love machine learning']


In [70]:
#tokenization
def tokenization(sentence):
    return sentence.split()

In [71]:
tokens=[]
for sent in cleaned_corpus:
    tokens.append(tokenization(sent))

print(tokens)


[['i', 'love', 'nlp'], ['machine', 'learning', 'is', 'amazing'], ['i', 'love', 'machine', 'learning']]


In [72]:
#stopwords
import nltk
from nltk.corpus import stopwords

stop_words=set(stopwords.words('english'))

def remove_stopwords(word_list):
    filtered=[]
    for word in word_list:
        if word not in stop_words:
            filtered.append(word)
    return filtered


In [73]:
tokens_no_stop=[]
for word_list in tokens:
    tokens_no_stop.append(remove_stopwords(word_list))

tokens_no_stop

[['love', 'nlp'],
 ['machine', 'learning', 'amazing'],
 ['love', 'machine', 'learning']]

In [93]:
#stemming
from nltk.stem import SnowballStemmer
stemmer=SnowballStemmer('english')

In [75]:
def stem_words(word_list):
    output=[]
    for word in word_list:
        output.append(stemmer.stem(word))
    return output

In [76]:
stemmed_tokens=[]
for word_list in tokens_no_stop:
    stemmed_tokens.append(stem_words(word_list))

print(stemmed_tokens)

[['love', 'nlp'], ['machin', 'learn', 'amaz'], ['love', 'machin', 'learn']]


In [77]:
vocab=[]
for word_list in stemmed_tokens:
    for word in word_list:
        if word not in vocab:
            vocab.append(word)

print(vocab)
print(len(vocab))

['love', 'nlp', 'machin', 'learn', 'amaz']
5


In [78]:
def bow(sentences,vocab):
    bow_dict_list=[]
    bow_vectors=[]

    for sent in sentences:
        freq={}
        for word in vocab:
            freq[word]=0

        for word in sent:
            if word in vocab:
                freq[word]+=1

        bow_dict_list.append(freq)

        row_vector=[]
        for word in vocab:
            row_vector.append(freq[word])

        bow_vectors.append(row_vector)

    return bow_dict_list,bow_vectors


In [79]:
bow_dict,bow_vec=bow(stemmed_tokens,vocab)

In [80]:
bow_dict

[{'love': 1, 'nlp': 1, 'machin': 0, 'learn': 0, 'amaz': 0},
 {'love': 0, 'nlp': 0, 'machin': 1, 'learn': 1, 'amaz': 1},
 {'love': 1, 'nlp': 0, 'machin': 1, 'learn': 1, 'amaz': 0}]

In [81]:
bow_vec

[[1, 1, 0, 0, 0], [0, 0, 1, 1, 1], [1, 0, 1, 1, 0]]

In [82]:
def compute_df(bow_vectors,vocab):
    df={}
    for word in vocab:
        df[word]=0

    for row in bow_vectors:
        for i in range(len(vocab)):
            if row[i]>0:
                df[vocab[i]]+=1

    return df


In [83]:
df=compute_df(bow_vec,vocab)

In [84]:
df

{'love': 2, 'nlp': 1, 'machin': 2, 'learn': 2, 'amaz': 1}

In [85]:
import math
def compute_idf(df,total_docs):
    idf={}
    for word,value in df.items():
        idf[word]=math.log(total_docs/(value+1))

    return idf

In [86]:
idf=compute_idf(df,len(bow_vec))
idf

{'love': 0.0,
 'nlp': 0.4054651081081644,
 'machin': 0.0,
 'learn': 0.0,
 'amaz': 0.4054651081081644}

In [87]:
def compute_tfidf(bow_vectors,vocab,idf):
    tfidf=[]
    for row in bow_vectors:
        tfidf_row=[]
        for i in range(len(vocab)):
            word=vocab[i]
            tf=row[i]
            tfidf_row.append(tf*idf[word])

        tfidf.append(tfidf_row)

    return tfidf

In [88]:
tfidf_vc=compute_tfidf(bow_vec,vocab,idf)
tfidf_vc

[[0.0, 0.4054651081081644, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.4054651081081644],
 [0.0, 0.0, 0.0, 0.0, 0.0]]

In [90]:
for i in range(len(vocab)):
    print(vocab[i])

love
nlp
machin
learn
amaz


In [94]:
def print_tfidf(tfidf_vectors,vocab):
    for i in range(len(tfidf_vectors)):
        print(f'\nSentence {i+1}:')

        row=tfidf_vectors[i]

        for j in range(len(vocab)):
            print(f'{vocab[j]}--> {row[j]}')

            

In [95]:
print_tfidf(tfidf_vc,vocab)


Sentence 1:
love--> 0.0
nlp--> 0.4054651081081644
machin--> 0.0
learn--> 0.0
amaz--> 0.0

Sentence 2:
love--> 0.0
nlp--> 0.0
machin--> 0.0
learn--> 0.0
amaz--> 0.4054651081081644

Sentence 3:
love--> 0.0
nlp--> 0.0
machin--> 0.0
learn--> 0.0
amaz--> 0.0
