In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Three toy documents used to demonstrate a raw bag-of-words count matrix.
X = (
    "Computers can analyze text to find sentiment, but they don't really understand meaning. ",
    "They do it using vectors and matrices",
    "Computers can process massive amounts of text data",
)
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)  # sparse document-term count matrix
print(vectorizer.vocabulary_)  # token -> column index in X_vec
# toarray() gives a plain ndarray (todense() returns the legacy np.matrix type)
# and matches how every other cell in this notebook displays sparse results.
print(X_vec.toarray())

{'computers': 5, 'can': 4, 'analyze': 1, 'text': 18, 'to': 20, 'find': 9, 'sentiment': 17, 'but': 3, 'they': 19, 'don': 8, 'really': 16, 'understand': 21, 'meaning': 13, 'do': 7, 'it': 10, 'using': 22, 'vectors': 23, 'and': 2, 'matrices': 12, 'process': 15, 'massive': 11, 'amounts': 0, 'of': 14, 'data': 6}
[[0 1 0 1 1 1 0 0 1 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0]
 [0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1]
 [1 0 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0]]


In [6]:
# BoW matrix from scratch using NLTK

import nltk

nltk.download("stopwords")
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np


def preprocess(
    texts,
    keep_list=None,
    stemming=False,
    stem_type=None,
    lemmatization=False,
    remove_stopwords=False,
):
    """Normalise raw documents into lower-cased, space-joined token strings.

    Each document is lower-cased and split into ``\\w+`` tokens (so punctuation
    is dropped), optionally stemmed or lemmatized, optionally stripped of
    English stop words, and re-joined with single spaces.

    Args:
        texts: Iterable of strings to process.
        keep_list: Words that must NOT be treated as stop words. Entries are
            lower-cased before comparison because the tokens themselves are
            lower-cased. ``None`` (the default) or an empty list keeps the
            stop-word set unchanged. Only consulted when ``remove_stopwords``
            is true.
        stemming: Apply a stemmer to every token.
        stem_type: ``"Porter"`` or ``"Snowball"``; required when ``stemming``
            is true.
        lemmatization: Apply WordNet lemmatization to every token. Ignored
            when ``stemming`` is true (stemming takes precedence).
        remove_stopwords: Drop NLTK English stop words. Filtering happens
            after stemming/lemmatization.

    Returns:
        A list with one processed string per input document, in input order.

    Raises:
        ValueError: If ``stemming`` is true and ``stem_type`` is not one of
            the supported names.

    Note:
        Stop-word removal and lemmatization presumably require the NLTK
        ``stopwords`` / ``wordnet`` corpora to be downloaded beforehand.
    """
    if stemming:
        if stem_type == "Porter":
            stemmer = PorterStemmer()
        elif stem_type == "Snowball":
            stemmer = SnowballStemmer("english")
        else:
            raise ValueError("Invalid stemmer type")
    else:
        stemmer = None

    lemmatizer = WordNetLemmatizer() if lemmatization else None

    stop_words = set()
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        # Tokens are lower-cased below, so keep-words must be lower-cased too
        # or a capitalised entry would never match anything.
        stop_words.difference_update(word.lower() for word in (keep_list or ()))

    preprocessed_text = []

    for text in texts:
        words = re.findall(r"\w+", text.lower())

        # A stemmer only exists when stemming was requested with a valid type.
        if stemmer is not None:
            words = [stemmer.stem(word) for word in words]
        elif lemmatizer is not None:
            words = [lemmatizer.lemmatize(word) for word in words]

        if remove_stopwords:
            words = [word for word in words if word not in stop_words]

        preprocessed_text.append(" ".join(words))

    return preprocessed_text


# Words handed to preprocess() as its keep_list (exempt from stop-word removal).
common_dot_words = [
    "U.S.",
    "U.N.",
    "E.U.",
    "U.K.",
]

# Toy corpus used by the bag-of-words / TF-IDF cells that follow.
sentences = [
    "We are reading about Natural Language Processing Here",
    "Natural Language Processing making computers understand language",
    "The field of Natural Language Processing is evolving every day",
]

# Work on a shallow copy so the original list is left untouched.
sentence_list = sentences.copy()

# Lemmatize (no stemming) and strip English stop words.
preprocess_options = {
    "keep_list": common_dot_words,
    "stemming": False,
    "stem_type": None,
    "lemmatization": True,
    "remove_stopwords": True,
}
preprocessed_corpus = preprocess(sentence_list, **preprocess_options)
# print(preprocessed_corpus)

# Vocabulary: every distinct token in the corpus, sorted alphabetically.
set_of_words = {
    token for document in preprocessed_corpus for token in document.split()
}
vocab = sorted(set_of_words)
print(vocab)

# Column index assigned to each vocabulary word.
position = dict(zip(vocab, range(len(vocab))))

# One row per document, one column per vocabulary word; cells hold raw counts.
bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab)))
for i, sentence in enumerate(preprocessed_corpus):
    for word in sentence.split():
        bow_matrix[i, position[word]] += 1
print(bow_matrix)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jasleen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jasleen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['computer', 'day', 'every', 'evolving', 'field', 'language', 'making', 'natural', 'processing', 'reading', 'understand']
[[0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0.]
 [1. 0. 0. 0. 0. 2. 1. 1. 1. 0. 1.]
 [0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0.]]


In [7]:
# Using CountVectorizer for BoW

from sklearn.feature_extraction.text import CountVectorizer

# Same bag-of-words counts as the hand-built matrix, via scikit-learn.
# NOTE: this rebinds `bow_matrix` from the dense array built by hand to the
# sparse matrix returned by fit_transform.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)
print(vectorizer.vocabulary_, bow_matrix.toarray(), sep="\n")

{'reading': 9, 'natural': 7, 'language': 5, 'processing': 8, 'making': 6, 'computer': 0, 'understand': 10, 'field': 4, 'evolving': 3, 'every': 2, 'day': 1}
[[0 0 0 0 0 1 0 1 1 1 0]
 [1 0 0 0 0 2 1 1 1 0 1]
 [0 1 1 1 1 1 0 1 1 0 0]]


In [8]:
# Bag-of-words over word unigrams, bigrams and trigrams (ngram_range=(1, 3))

vectorizer_ngram_range = CountVectorizer(
    ngram_range=(1, 3),
    analyzer="word",
)
bow_matrix_ngram = vectorizer_ngram_range.fit_transform(preprocessed_corpus)
print(
    vectorizer_ngram_range.vocabulary_,
    bow_matrix_ngram.toarray(),
    sep="\n",
)

{'reading': 27, 'natural': 19, 'language': 12, 'processing': 22, 'reading natural': 28, 'natural language': 20, 'language processing': 13, 'reading natural language': 29, 'natural language processing': 21, 'making': 16, 'computer': 0, 'understand': 30, 'processing making': 25, 'making computer': 17, 'computer understand': 1, 'understand language': 31, 'language processing making': 15, 'processing making computer': 26, 'making computer understand': 18, 'computer understand language': 2, 'field': 9, 'evolving': 6, 'every': 4, 'day': 3, 'field natural': 10, 'processing evolving': 23, 'evolving every': 7, 'every day': 5, 'field natural language': 11, 'language processing evolving': 14, 'processing evolving every': 24, 'evolving every day': 8}
[[0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 2 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1]
 [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0]]


In [13]:
# Bag-of-words over 1- to 3-grams, with the vocabulary capped at 6 features

vectorizer_max_features = CountVectorizer(
    max_features=6,
    ngram_range=(1, 3),
    analyzer="word",
)
bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)
print(
    vectorizer_max_features.vocabulary_,
    bow_matrix_max_features.toarray(),
    sep="\n",
)

{'natural': 2, 'language': 0, 'processing': 5, 'natural language': 3, 'language processing': 1, 'natural language processing': 4}
[[1 1 1 1 1 1]
 [2 1 1 1 1 1]
 [1 1 1 1 1 1]]


In [14]:
# Using TF-IDF for BoW

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the corpus with TfidfVectorizer's default settings.
# NOTE: this rebinds `vectorizer`, previously a CountVectorizer.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_corpus)
print(vectorizer.vocabulary_, tfidf_matrix.toarray(), sep="\n")
print("\n The shape of TF-IDF matrix is: ", tfidf_matrix.shape)

{'reading': 9, 'natural': 7, 'language': 5, 'processing': 8, 'making': 6, 'computer': 0, 'understand': 10, 'field': 4, 'evolving': 3, 'every': 2, 'day': 1}
[[0.         0.         0.         0.         0.         0.41285857
  0.         0.41285857 0.41285857 0.69903033 0.        ]
 [0.4431132  0.         0.         0.         0.         0.52341958
  0.4431132  0.26170979 0.26170979 0.         0.4431132 ]
 [0.         0.44514923 0.44514923 0.44514923 0.44514923 0.26291231
  0.         0.26291231 0.26291231 0.         0.        ]]

 The shape of TF-IDF matrix is:  (3, 11)


In [15]:
# TF-IDF with L1 row normalisation (norm="l1")

vectorizer_l1_norm = TfidfVectorizer(norm="l1")
tfidf_matrix_l1_norm = vectorizer_l1_norm.fit_transform(preprocessed_corpus)
print(
    vectorizer_l1_norm.vocabulary_,
    tfidf_matrix_l1_norm.toarray(),
    sep="\n",
)
print("\n The shape of TF-IDF matrix is: ", tfidf_matrix_l1_norm.shape)

{'reading': 9, 'natural': 7, 'language': 5, 'processing': 8, 'making': 6, 'computer': 0, 'understand': 10, 'field': 4, 'evolving': 3, 'every': 2, 'day': 1}
[[0.         0.         0.         0.         0.         0.21307663
  0.         0.21307663 0.21307663 0.3607701  0.        ]
 [0.18648142 0.         0.         0.         0.         0.22027787
  0.18648142 0.11013893 0.11013893 0.         0.18648142]
 [0.         0.17325473 0.17325473 0.17325473 0.17325473 0.10232703
  0.         0.10232703 0.10232703 0.         0.        ]]

 The shape of TF-IDF matrix is:  (3, 11)


In [16]:
# TF-IDF over 1- to 3-grams, L2-normalised rows, vocabulary capped at 6 features

vectorizer_ngram_max_features = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    max_features=6,
    norm="l2",
)
tfidf_matrix_ngram_max_features = vectorizer_ngram_max_features.fit_transform(
    preprocessed_corpus
)
print(
    vectorizer_ngram_max_features.vocabulary_,
    tfidf_matrix_ngram_max_features.toarray(),
    sep="\n",
)
print("\n The shape of TF-IDF matrix is: ", tfidf_matrix_ngram_max_features.shape)

{'natural': 2, 'language': 0, 'processing': 5, 'natural language': 3, 'language processing': 1, 'natural language processing': 4}
[[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]
 [0.66666667 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333]
 [0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]

 The shape of TF-IDF matrix is:  (3, 6)


In [17]:
# Implementing cosine similarity


def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Both arguments are converted with ``np.array``, so plain sequences are
    accepted. The result is the dot product divided by the product of the
    two Euclidean norms.
    """
    first = np.array(vector1)
    second = np.array(vector2)

    dot_product = np.dot(first, second)
    first_norm = np.sqrt(np.sum(first**2))
    second_norm = np.sqrt(np.sum(second**2))

    return dot_product / (first_norm * second_norm)


# `bow_matrix` is a dense ndarray after the from-scratch cell, but the
# CountVectorizer cell rebinds it to a sparse matrix. Sparse rows cannot go
# through np.array()/np.dot() in cosine_similarity, so densify first. This
# keeps the cell working on a fresh "Restart & Run All".
bow_dense = (
    bow_matrix.toarray() if hasattr(bow_matrix, "toarray") else np.asarray(bow_matrix)
)
for i in range(bow_dense.shape[0]):
    print(
        f"The cosine similarity between the first document and the {i+1}th document is {cosine_similarity(bow_dense[0], bow_dense[i])}"
    )

The cosine similarity between the first document and the 1th document is 1.0
The cosine similarity between the first document and the 2th document is 0.6666666666666666
The cosine similarity between the first document and the 3th document is 0.5669467095138409


In [24]:
# One hot encoding

from sklearn.preprocessing import OneHotEncoder
from seaborn import load_dataset

# One-hot encode the categorical `island` column of seaborn's penguins dataset.
df = load_dataset('penguins')
ohe = OneHotEncoder()
island_column = df[['island']]  # double brackets keep a 2-D frame for the encoder
transformed = ohe.fit_transform(island_column)
print(transformed.toarray())

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [30]:
# Basic Chatbot

import ast
import gzip
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Parallel lists: answers[k] is the reply stored alongside questions[k].
questions, answers = [], []
with gzip.open('qa_Electronics.json.gz', mode='rt', encoding='utf-8') as qa_file:
    for raw_line in qa_file:
        # Each line is parsed as a Python literal (ast.literal_eval, not json.loads).
        record = ast.literal_eval(raw_line)
        questions.append(record['question'].lower())
        answers.append(record['answer'].lower())

# Count question terms (English stop words removed), then re-weight the
# counts with L2-normalised TF-IDF.
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(questions)
tfidf = TfidfTransformer(norm='l2')
X_tfidf = tfidf.fit_transform(X_vec)

def conversation(im, max_angle=60):
    """Return the stored answer whose question is most similar to ``im``.

    The input is vectorised with the module-level ``vectorizer`` and ``tfidf``
    objects and compared against every row of ``X_tfidf`` using the pairwise
    ``cosine_similarity`` imported from scikit-learn in this cell.

    Args:
        im: The user's question as a string.
        max_angle: Largest angle, in degrees, between the query and its best
            match for which an answer is still returned. Defaults to 60.

    Returns:
        The entry of ``answers`` at the index of the best-matching question,
        or the fallback string "Sorry, I don't understand." when the best
        match is more than ``max_angle`` degrees away.
    """
    query_tfidf = tfidf.transform(vectorizer.transform([im]))

    # Compute the similarity row once and reuse it for both the threshold
    # check and the answer lookup.
    similarities = cosine_similarity(query_tfidf, X_tfidf)[0]
    best_index = int(np.argmax(similarities))

    # Floating-point rounding can push a cosine marginally outside [-1, 1],
    # where arccos returns NaN; clip before converting to an angle.
    best_similarity = np.clip(similarities[best_index], -1.0, 1.0)
    angle = np.rad2deg(np.arccos(best_similarity))

    if angle > max_angle:
        return "Sorry, I don't understand."
    return answers[best_index]

def main():
    """Run the interactive chatbot loop on standard input/output.

    Asks for a username, greets the user, then answers each entered question
    with ``conversation`` until the user types ``exit`` (any letter case,
    surrounding whitespace ignored). End-of-input or an interrupt while
    waiting for input also ends the session with the same farewell instead
    of raising.
    """
    try:
        usr = input("Enter your username: ")
    except (EOFError, KeyboardInterrupt):
        print("Bye!")
        return

    print(f"Hi {usr}, I am a chatbot. You can ask me questions about electronics.")
    while True:
        try:
            im = input("Enter your question: ")
        except (EOFError, KeyboardInterrupt):
            # No more input (closed stdin / interrupted kernel): leave cleanly.
            print("Bye!")
            break
        # Tolerate "Exit", " exit ", etc. rather than only the exact literal.
        if im.strip().lower() == 'exit':
            print("Bye!")
            break
        print(conversation(im))
        
# Start the interactive chat session when this code is executed directly.
if __name__ == "__main__":
    main()

Hi Jasleen, I am a chatbot. You can ask me questions about electronics.
so far after i charge the battery it will last about 90 minutes. i have not had any issues with the battery.
Bye!
