In [2]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# optional for embeddings
# from sentence_transformers import SentenceTransformer

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [3]:
df = pd.read_csv("dataset_full.csv")
df.shape
df.columns

Index(['category', 'question_id', 'question', 'answer', 'score'], dtype='object')

In [4]:
# Cell 3: EDA
df.head(10)
df['category'].value_counts()
df['score'].value_counts()   # check distribution of labels


score
2    70
1    70
0    70
Name: count, dtype: int64

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print("Stopwords loaded:", len(stopwords.words('english')))


Stopwords loaded: 198


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Cell: create STOPWORDS and clean_text, then apply to df
from nltk.corpus import stopwords
import re

STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Lowercase, remove URLs/punctuation, collapse spaces, remove stopwords."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)              # remove URLs
    text = re.sub(r"[^a-z0-9\s]", " ", text)          # keep letters/numbers/space
    text = re.sub(r"\s+", " ", text).strip()          # collapse whitespace
    tokens = [w for w in text.split() if w not in STOPWORDS]
    return " ".join(tokens)

# Apply to your dataframe (adjust filename/column names if needed)
df['answer_clean'] = df['answer'].apply(clean_text)
df[['answer', 'answer_clean']].head(8)


Unnamed: 0,answer,answer_clean
0,I am a self-motivated BCA student passionate a...,self motivated bca student passionate web deve...
1,I am a BCA student.,bca student
2,I am a student.,student
3,I am a good communicator and a quick learner.,good communicator quick learner
4,I am honest.,honest
5,I like watching movies.,like watching movies
6,One of my weaknesses is that I sometimes focus...,one weaknesses sometimes focus much details le...
7,sometimes get nervous when speaking in front o...,sometimes get nervous speaking front large gro...


In [7]:
# Cell 5: target & X
X = df['answer_clean']
y = df['score'].astype(int)


In [15]:
print("Columns:", df.columns)

Columns: Index(['category', 'question_id', 'question', 'answer', 'score',
       'answer_clean'],
      dtype='object')


In [8]:
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))

# X must be a numeric matrix (sparse)
X = vectorizer.fit_transform(df['answer_clean'].astype(str))   # ensure strings
# y must be numeric labels
y = df['score'].astype(int)

print("X shape:", X.shape)
print("y dtype, unique:", y.dtype, y.unique()[:10])

# 3) Now split numeric data

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)
print("y_train sample:", y_train[:10])

X shape: (210, 741)
y dtype, unique: int32 [2 1 0]
X_train shape: (168, 741) X_test shape: (42, 741)
y_train sample: 114    2
188    0
164    0
72     2
201    2
127    1
171    2
55     1
88     1
49     1
Name: score, dtype: int32


In [9]:
# Step 1: Create the model
model = LogisticRegression(max_iter=1000, random_state=42)

# Step 2: Train (fit) the model on training data
model.fit(X_train, y_train)

# Step 3: Predict on test data
y_pred = model.predict(X_test)

# Step 4: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9047619047619048

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.86      0.92        14
           1       0.78      1.00      0.88        14
           2       1.00      0.86      0.92        14

    accuracy                           0.90        42
   macro avg       0.93      0.90      0.91        42
weighted avg       0.93      0.90      0.91        42


Confusion Matrix:
 [[12  2  0]
 [ 0 14  0]
 [ 0  2 12]]


In [11]:
import joblib

# Save the trained model
joblib.dump(model, "interview_model.joblib")

# Save the vectorizer used for text transformation
joblib.dump(vectorizer, "vectorizer.joblib")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
# Load model again to test
loaded_model = joblib.load("interview_model.joblib")
loaded_vectorizer = joblib.load("vectorizer.joblib")

# Try a sample prediction
sample_text = ["I am a hardworking and passionate learner."]
X_sample = loaded_vectorizer.transform(sample_text)
prediction = loaded_model.predict(X_sample)

print("Predicted score:", prediction)


Predicted score: [1]
