# TF-IDF + Cosine Similarity Chatbot (Cornell Movie Dialogs Corpus)
A simple retrieval-based NLP chatbot: it uses TF-IDF and cosine similarity to find the stored utterance most similar to the user's message and replies with the line that followed it.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# STEP 1: Load the Cornell Movie Dialogs Dataset
# Fields in both files are separated by the literal token " +++$+++ ".
# The separator is a regular expression, so it is written as a raw string to
# keep the backslash escapes intact for the regex engine.
# The corpus files are not valid UTF-8; ISO-8859-1 maps every byte to a
# character, so decoding cannot fail part-way through a file.
lines = pd.read_csv(
    'cornell movie-dialogs corpus/movie_lines.txt',
    sep=r' \+\+\+\$\+\+\+ ',
    engine='python',
    encoding='iso-8859-1',
    names=["lineID", "characterID", "movieID", "character", "text"],
)

conversations = pd.read_csv(
    'cornell movie-dialogs corpus/movie_conversations.txt',
    sep=r' \+\+\+\$\+\+\+ ',
    engine='python',
    encoding='iso-8859-1',
    names=["character1", "character2", "movieID", "utteranceIDs"],
)

# Convert utteranceIDs from string to list
# (literal_eval only parses Python literals; it never executes code).
conversations['utteranceIDs'] = conversations['utteranceIDs'].apply(ast.literal_eval)

In [None]:
# STEP 2: Extract Conversational Pairs (input-response)
# Map each line ID to its text. Only real strings are kept: a missing text
# cell is read as NaN, which is truthy and would otherwise slip through the
# emptiness check below and break string-only preprocessing later on.
line_dict = {
    line_id: text
    for line_id, text in zip(lines['lineID'], lines['text'])
    if isinstance(text, str)
}

pairs = []
for conv in conversations['utteranceIDs']:
    # Pair every utterance with the one that immediately follows it.
    for prompt_id, reply_id in zip(conv, conv[1:]):
        input_line = line_dict.get(prompt_id, "")
        response_line = line_dict.get(reply_id, "")
        # Skip the pair when either side is unknown or empty.
        if input_line and response_line:
            pairs.append((input_line, response_line))

# One row per distinct (input, response) pair, re-indexed from 0 so that
# positional lookups line up with the rows.
chat_df = pd.DataFrame(pairs, columns=["input", "response"])
chat_df.drop_duplicates(inplace=True)
chat_df.reset_index(drop=True, inplace=True)

In [None]:
# STEP 3: Preprocess the Text
def clean_text(text):
    """Normalize raw text before it is vectorized.

    Lowercases the text, removes every character that is not an ASCII
    letter, a digit or whitespace, collapses each run of whitespace into a
    single space and trims both ends.

    Args:
        text: The utterance to normalize. Non-string values (for example
            ``None`` or the NaN used for a missing cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Punctuation is deleted rather than replaced, so "don't" becomes "dont".
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Store a normalized copy of every input utterance; the raw text stays in 'input'.
chat_df['clean_input'] = chat_df['input'].map(clean_text)

In [None]:
# STEP 4: Create TF-IDF Matrix for User Inputs
# Features are unigrams and bigrams, with English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)
# Fit on the cleaned inputs and encode them in one pass: one matrix row per
# row of chat_df.
tfidf_matrix = vectorizer.fit_transform(chat_df['clean_input'])

In [None]:
# STEP 5: Chatbot Function Using Cosine Similarity
def chatbot_response(user_input):
    """Return the stored response whose input best matches ``user_input``.

    The message is cleaned with ``clean_text``, vectorized with the fitted
    ``vectorizer`` and compared against every row of ``tfidf_matrix`` using
    cosine similarity.

    Args:
        user_input: The raw message typed by the user.

    Returns:
        The ``response`` value of the most similar row in ``chat_df``, or a
        fixed fallback sentence when the best similarity is not above zero.
    """
    query_vec = vectorizer.transform([clean_text(user_input)])
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_idx = scores.argmax()

    # Nothing in the corpus shares a term with the query: use the fallback.
    if not scores[top_idx] > 0:
        return "I'm not sure how to respond to that."

    return chat_df['response'].iloc[top_idx]

In [None]:
# STEP 6: Simple CLI Chat Loop (Optional for local use)
print("Bot: Hello! Type 'bye' to exit.")
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C ends the chat cleanly instead of with a traceback.
        print("\nBot: Goodbye!")
        break
    # Ignore surrounding whitespace and case so "bye " or " Bye" still exits.
    if user_input.strip().lower() == 'bye':
        print("Bot: Goodbye!")
        break
    reply = chatbot_response(user_input)
    print("Bot:", reply)