## Create an Environment for the chatbot

In [33]:
!pip install nltk wikipedia scikit-learn



In [34]:
# Import libraries

# Main Libraries
import nltk
import wikipedia as wk
import random
import string
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict

# For NLP
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# For TF-IDF and similarity
from sklearn.feature_extraction.text import TfidfVectorizer # Scikit-learn's TfidfVectorizer transforms text documents into a TF-IDF matrix, weighting terms by frequency and inverse document frequency.
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel


In [35]:
# Dataset
nltk.download('punkt')  # For tokenization
nltk.download('wordnet')  # For lemmatization
nltk.download('stopwords')  # To filter stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Read the data 
data = open("Text.txt", "r", errors="ignore")
raw = data.read()

raw = raw.lower()

print(raw[-1000:])

l of their previous records. their sixth album, the hunting party (2014), returned to a heavier rock sound, while their seventh album, one more light (2017), was a substantially more pop-oriented record. the band's eighth album, from zero, was released in november 2024.

linkin park is among both the best-selling bands of the 21st century and the world's best-selling music artists, having sold over 100 million records worldwide.[6] they have won two grammy awards, six american music awards, four billboard music awards, four mtv video music awards, 10 mtv europe music awards, and three world music awards. in 2003, mtv2 named linkin park the sixth-greatest band of the music video era and the third-best of the new millennium. billboard ranked linkin park no. 19 on the best artists of the decade list. in 2012, the band was voted as the greatest artist of the 2000s in a bracket madness poll on vh1. in 2014, the band was declared "the biggest rock band in the world right now" by kerrang!.




In [37]:
# Tokenization 

# Sentences
sent_tokens = nltk.sent_tokenize(raw)

# Words
word_tokens = nltk.word_tokenize(raw)

print(sent_tokens)
print(word_tokens)

["linkin park is an american rock band formed in agoura hills, california, in 1996. the band's current lineup consists of vocalist/rhythm guitarist/keyboardist mike shinoda, lead guitarist brad delson, dj/turntablist joe hahn, bassist dave farrell, co-lead vocalist emily armstrong, and drummer colin brittain.", "the lineup for the band's first seven studio albums included lead vocalist chester bennington and drummer rob bourdon until bennington's suicide in july 2017, which caused the band to enter an indefinite hiatus.", "in september 2024, linkin park's reformation was announced along with the addition of armstrong and brittain.", "categorized mainly as alternative rock and nu metal, linkin park's earlier music spanned a fusion of heavy metal and hip hop, while their later music features more electronica and pop elements.", 'linkin park rose to international fame with their debut studio album, hybrid theory (2000), which became certified diamond by the recording industry association 

In [38]:
# Normalizaztion
import unicodedata

lemmatizer = WordNetLemmatizer()

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

word_tokens = nltk.word_tokenize(raw.lower().translate(remove_punct_dict))
print(word_tokens)

new_words = [] 
for word in word_tokens:
    new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    new_words.append(new_word)

lemmatized_words = [lemmatizer.lemmatize(word) for word in new_words if word]

print(lemmatized_words[:5])

['linkin', 'park', 'is', 'an', 'american', 'rock', 'band', 'formed', 'in', 'agoura', 'hills', 'california', 'in', '1996', 'the', 'bands', 'current', 'lineup', 'consists', 'of', 'vocalistrhythm', 'guitaristkeyboardist', 'mike', 'shinoda', 'lead', 'guitarist', 'brad', 'delson', 'djturntablist', 'joe', 'hahn', 'bassist', 'dave', 'farrell', 'colead', 'vocalist', 'emily', 'armstrong', 'and', 'drummer', 'colin', 'brittain', 'the', 'lineup', 'for', 'the', 'bands', 'first', 'seven', 'studio', 'albums', 'included', 'lead', 'vocalist', 'chester', 'bennington', 'and', 'drummer', 'rob', 'bourdon', 'until', 'benningtons', 'suicide', 'in', 'july', '2017', 'which', 'caused', 'the', 'band', 'to', 'enter', 'an', 'indefinite', 'hiatus', 'in', 'september', '2024', 'linkin', 'parks', 'reformation', 'was', 'announced', 'along', 'with', 'the', 'addition', 'of', 'armstrong', 'and', 'brittain', 'categorized', 'mainly', 'as', 'alternative', 'rock', 'and', 'nu', 'metal', 'linkin', 'parks', 'earlier', 'music', '

In [47]:
# Chatbot Function
import numpy as np

def generate_response(user_input):
    global sent_tokens
    sent_tokens.append(user_input)
    
    # Calculate TF-IDF
    vectorizer = TfidfVectorizer(tokenizer=lemmatizer.lemmatize, stop_words='english')
    tfidf = vectorizer.fit_transform(sent_tokens)
    
    # Cosine similarity
    similarity_scores = cosine_similarity(tfidf[-1], tfidf[:-1])
    index = np.argmax(similarity_scores)
    flat = similarity_scores.flatten()
    max_score = flat[index]
    
    # Remove user input from tokens
    sent_tokens.pop(-1)
    
    # Response based on similarity
    if max_score > 0.1:
        return sent_tokens[index]
    else:
        return "I'm sorry, I didn't understand your question."

# Check: Example response
user_input = "Album Meteora"
print("Chatbot:", generate_response(user_input))


Chatbot: the band's fifth album, living things (2012), combined musical elements from all of their previous records.
