In [1]:
import pandas as pd
import numpy as np
import pickle
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the interview Q&A dataset from CSV into the module-level DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine with this exact directory layout — consider a
# configurable data directory.
# NOTE(review): the preview output shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index; presumably `index_col=0` would absorb it —
# confirm before changing, since the frame is pickled later as-is.
df = pd.read_csv(r"C:\Users\jasim\Data Science\My Project\Interview Chatbot\data\Q&A_dataset.csv")

In [3]:
# Preview the first rows of the freshly loaded dataset (columns and sample values).
df.head()

Unnamed: 0,Topic,Difficulty,Question,Answer
0,python,beginner,What are Python’s main data types?,"int (integers), float (decimal numbers), str..."
1,python,beginner,What is the difference between a list and a tu...,List : Mutable (Can be changed after creation...
2,python,beginner,What are mutable and immutable data types? Giv...,Mutable: Objects whose state (value) can be m...
3,python,beginner,How do you check the data type of a variable i...,Use the built-in type() function. x = 10.5 y =...
4,python,beginner,"What is a Python dictionary, and how do you ac...",A Python dictionary is an unordered collection...


In [4]:
# Basic Preprocessing
# Translation table that deletes every character in string.punctuation.
# Built once instead of on every call, since clean_text runs per DataFrame row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize text for TF-IDF comparison: lowercase and strip ASCII punctuation.

    Parameters
    ----------
    text : str or any
        The raw text. ``None`` and float NaN (how pandas represents a missing
        CSV cell) are treated as empty text; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The lowercased text with every ``string.punctuation`` character
        removed. Whitespace is left untouched, so removing a punctuation
        character that sat between two spaces leaves both spaces in place.

    Notes
    -----
    Only ASCII punctuation is removed; typographic characters such as the
    curly apostrophe are kept as-is.
    """
    # Missing values would otherwise raise AttributeError on .lower().
    # `text != text` is the dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return text.lower().translate(_PUNCTUATION_TABLE)

In [5]:
# Derive a normalized copy of each reference answer (lowercased, punctuation
# removed) and store it alongside the original in a new 'clean_answer' column.
df['clean_answer'] = df['Answer'].map(clean_text)

In [6]:
# Preview the frame again to check the newly added 'clean_answer' column.
df.head()

Unnamed: 0,Topic,Difficulty,Question,Answer,clean_answer
0,python,beginner,What are Python’s main data types?,"int (integers), float (decimal numbers), str...",int integers float decimal numbers str strin...
1,python,beginner,What is the difference between a list and a tu...,List : Mutable (Can be changed after creation...,list mutable can be changed after creation a...
2,python,beginner,What are mutable and immutable data types? Giv...,Mutable: Objects whose state (value) can be m...,mutable objects whose state value can be modi...
3,python,beginner,How do you check the data type of a variable i...,Use the built-in type() function. x = 10.5 y =...,use the builtin type function x 105 y 1 2 3 ...
4,python,beginner,"What is a Python dictionary, and how do you ac...",A Python dictionary is an unordered collection...,a python dictionary is an unordered collection...


In [7]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned reference answers.
# The fitted `vectorizer` is what evaluate_answer uses to embed answers and what
# gets pickled further down, so its vocabulary comes from the dataset answers only.
vectorizer = TfidfVectorizer()
# TF-IDF matrix with one row per answer in df['clean_answer'].
# NOTE(review): `tfidf_matrix` is not referenced again in the visible cells.
tfidf_matrix = vectorizer.fit_transform(df['clean_answer'])

In [8]:
def get_question(topic=None, difficulty=None):
    """Pick one random question from the module-level ``df``.

    Parameters
    ----------
    topic : str, optional
        Keep only rows whose 'Topic' equals this value, ignoring case.
        A falsy value (``None``, ``""``) disables the filter.
    difficulty : str, optional
        Keep only rows whose 'Difficulty' equals this value, ignoring case.
        A falsy value disables the filter.

    Returns
    -------
    tuple or None
        ``(question, answer)`` taken from the 'Question' and 'Answer' columns
        of one randomly sampled matching row, or ``None`` when no row matches.
    """
    candidates = df
    # Apply the filters in a fixed order: topic first, then difficulty.
    for column, wanted in (('Topic', topic), ('Difficulty', difficulty)):
        if not wanted:
            continue
        candidates = candidates[candidates[column].str.lower() == wanted.lower()]

    if candidates.empty:
        return None

    chosen = candidates.sample(1).iloc[0]
    return chosen['Question'], chosen['Answer']

In [9]:
def evaluate_answer(user_answer, correct_answer):
    """Grade a user's answer against the reference answer by TF-IDF cosine similarity.

    Both texts are normalized with ``clean_text`` and embedded with the
    module-level fitted ``vectorizer``. The cosine similarity of the two
    vectors then selects the feedback message.

    Parameters
    ----------
    user_answer : str
        The answer typed by the user.
    correct_answer : str
        The reference answer to compare against.

    Returns
    -------
    str
        A feedback message that embeds the similarity rounded to two decimals:
        "Correct!" above 0.75, "Almost correct!" above 0.4, otherwise
        "Incorrect. Try again!".
    """
    cleaned_pair = [clean_text(user_answer), clean_text(correct_answer)]
    tfidf_pair = vectorizer.transform(cleaned_pair)
    # Row 0 is the user's answer, row 1 the reference; the result is a 1x1 matrix.
    sim = cosine_similarity(tfidf_pair[0], tfidf_pair[1])[0][0]

    if sim > 0.75:
        return f" Correct! (Similarity: {sim:.2f})"
    if sim > 0.4:
        return f" Almost correct! (Similarity: {sim:.2f})"
    return f" Incorrect. Try again! (Similarity: {sim:.2f})"

In [10]:
# Persist the fitted TF-IDF vectorizer so it can be reloaded without refitting.
# NOTE(review): this is an absolute, machine-specific path; open(..., "wb") does
# not create missing directories, so the `model` folder must already exist.
with open(r"C:\Users\jasim\Data Science\My Project\Interview Chatbot\model\vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [11]:
# Persist the Q&A DataFrame, including the derived 'clean_answer' column, next to
# the vectorizer.
# NOTE(review): this is an absolute, machine-specific path, as for the other
# artifacts. Pickle files should only ever be loaded from a trusted location,
# since unpickling can execute arbitrary code.
with open(r"C:\Users\jasim\Data Science\My Project\Interview Chatbot\model\qa_data.pkl", "wb") as f:
    pickle.dump(df, f)