In [15]:
import pandas as pd

# Read the test-results dataset from disk and report how many rows it holds
df = pd.read_csv('test_data.csv')
row_count = len(df)
print("\nTotal rows:", row_count)


Total rows: 1000


In [16]:
# Preview the first five rows to sanity-check column names and value ranges
df.head(n=5)

Unnamed: 0,user_id,test_id,marks,accuracy,time_taken,attempts,difficulty_level,topic_coverage,consistency_score,skill_level
0,145,1,53,69,36,4,medium,64,80,Intermediate
1,173,2,44,47,53,4,hard,98,81,Beginner
2,140,3,73,78,26,2,hard,87,77,Advanced
3,49,4,20,44,52,3,easy,96,94,Beginner
4,183,5,42,31,65,6,medium,59,44,Beginner


In [3]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
# Used here to confirm there are no missing values and to spot the
# non-numeric (object) columns that need encoding before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            1000 non-null   int64 
 1   test_id            1000 non-null   int64 
 2   marks              1000 non-null   int64 
 3   accuracy           1000 non-null   int64 
 4   time_taken         1000 non-null   int64 
 5   attempts           1000 non-null   int64 
 6   difficulty_level   1000 non-null   object
 7   topic_coverage     1000 non-null   int64 
 8   consistency_score  1000 non-null   int64 
 9   skill_level        1000 non-null   object
dtypes: int64(8), object(2)
memory usage: 78.3+ KB


In [4]:
# Count rows that are exact duplicates (across all columns) of an earlier row;
# a non-zero result would mean the data should be de-duplicated before splitting.
df.duplicated().sum()

np.int64(0)

In [5]:
# Separate the label from the predictors. The identifier columns are dropped
# as well, since they are not used as features.
target_col = "skill_level"
id_cols = ["user_id", "test_id"]

y = df[target_col]
X = df.drop(columns=[target_col, *id_cols])

In [6]:
# Encode the categorical feature (difficulty_level) as integer codes
from sklearn.preprocessing import LabelEncoder

le_difficulty = LabelEncoder()
# Fit on the raw column in `df` rather than on `X`: `X` is overwritten on the
# next line, so fitting on it would make a second run of this cell learn the
# integer codes as classes, and the encoder saved later in the notebook would
# then reject string inputs such as "hard" at inference time.
X["difficulty_level"] = le_difficulty.fit_transform(df["difficulty_level"])
# LabelEncoder sorts its classes, so for the labels easy / medium / hard the
# mapping is: easy → 0, hard → 1, medium → 2 (alphabetical, NOT by difficulty).
# Inspect le_difficulty.classes_ to confirm the fitted order.

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The split is seeded so it is
# reproducible, and stratified on the label so class proportions are kept.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


* `stratify=y`
  * Keeps the Beginner / Intermediate / Advanced ratio the same in both splits.
  * It makes sure each class appears in the train and test sets in the same proportion as in the full dataset.
  * The model therefore sees a fair representation of every class.
  * In short: `stratify=y` means “split the data, but don’t disturb the class proportions”.

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Sanity check: (rows, features) of the scaled train and test matrices
for scaled_split in (X_train_scaled, X_test_scaled):
    print(scaled_split.shape)

(750, 7)
(250, 7)


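- Fit the scaler on the training data only, so that no statistics from the test set leak into training.
- Apply the already-fitted scaler to the test data with `transform` (not `fit_transform`).

#### We train more than one model and compare them properly.
### First: Logistic Regression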

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Baseline model: logistic regression trained on the standardized features.
# max_iter is raised to 1000 to give the solver room to converge.
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Evaluate on the held-out split
log_pred = log_model.predict(X_test_scaled)
log_acc = accuracy_score(y_test, log_pred)
log_cm = confusion_matrix(y_test, log_pred)

# NOTE(review): a perfect held-out score usually means the label is a
# deterministic function of the features — worth verifying before trusting it.
print("Logistic Regression Accuracy:", log_acc)
print("Logistic Regression Confusion Matrix:\n", log_cm)

Logistic Regression Accuracy: 1.0
Logistic Regression Confusion Matrix:
 [[63  0  0]
 [ 0 94  0]
 [ 0  0 93]]


### Second: Random Forest
Tree-based models do not need feature scaling, so the random forest is trained on the unscaled features.

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Second model: a seeded random forest of 100 depth-limited trees, trained on
# the unscaled features.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
rf_cm = confusion_matrix(y_test, rf_pred)

print("Random Forest Accuracy:", rf_acc)
print("Random Forest Confusion Matrix:\n", rf_cm)

Random Forest Accuracy: 1.0
Random Forest Confusion Matrix:
 [[63  0  0]
 [ 0 94  0]
 [ 0  0 93]]


In [12]:
import joblib

# Everything inference needs, keyed by the file it is written to:
# the classifier, the feature scaler and the difficulty encoder.
artifacts = {
    "skill_model.pkl": log_model,
    "scaler.pkl": scaler,
    "difficulty_encoder.pkl": le_difficulty,
}

for filename, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, filename)

print("ML pipeline saved successfully!")

ML pipeline saved successfully!


In [13]:
import joblib
import pandas as pd

# Load the saved pipeline: classifier, feature scaler and difficulty encoder
model = joblib.load("skill_model.pkl")
scaler = joblib.load("scaler.pkl")
difficulty_encoder = joblib.load("difficulty_encoder.pkl")

# New input (like frontend will send)
input_data = pd.DataFrame([{
    "marks": 78,
    "accuracy": 82,
    "time_taken": 28,
    "attempts": 1,
    "difficulty_level": "hard",
    "topic_coverage": 85,
    "consistency_score": 80
}])

# Encode difficulty with the encoder fitted at training time
# (an unseen label raises ValueError here)
input_data["difficulty_level"] = difficulty_encoder.transform(
    input_data["difficulty_level"]
)

# Put the columns in the exact order used at training time. The classifier was
# fitted on a plain array, so it matches features by position only; a payload
# whose keys arrive in a different order must not be fed through as-is.
# Selecting by name also raises KeyError if a required field is missing.
input_ordered = input_data[list(scaler.feature_names_in_)]

# Scale with the statistics learned from the training split
input_scaled = scaler.transform(input_ordered)

# Predict
prediction = model.predict(input_scaled)

print("Predicted Skill Level:", prediction[0])

Predicted Skill Level: Advanced
