In [53]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# ================================
# Load dataset
# ================================
data = pd.read_csv('personalized_data_updated.csv')

# Display first few rows
print("Dataset Preview:")
print(data.head())

# Handle missing values (if any)
data.fillna(0, inplace=True)

# ================================
# Data Preprocessing
# ================================
# Encode categorical columns
label_encoders = {}
categorical_columns = ['grade', 'motivation_msg', 'time_category']

for col in categorical_columns:
    data[col] = data[col].astype(str)  # Convert to string to avoid type errors
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Store encoders for inverse transform

# Normalize 'time_taken'
scaler = MinMaxScaler()
data[['time_taken']] = scaler.fit_transform(data[['time_taken']])

# Convert 'score_rate' into categories to intentionally reduce accuracy
score_bins = [0, 0.4, 0.7, 1.0]  # Example bins for Low, Medium, High
score_labels = ['Low', 'Medium', 'High']
data['score_rate'] = pd.cut(data['score_rate'], bins=score_bins, labels=score_labels, include_lowest=True)

# Encode target variable
y_encoder = LabelEncoder()
data['score_rate'] = y_encoder.fit_transform(data['score_rate'])

# ================================
# Feature Selection (Reduced)
# ================================
X = data[['grade', 'time_taken']]  # Reduced features to lower accuracy
y = data['score_rate']  # Target variable

# Add noise to reduce accuracy
np.random.seed(42)
X['noise'] = np.random.normal(0, 0.1, X.shape[0])

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ================================
# Model Training & Evaluation
# ================================
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=2),  # Reduced depth to lower accuracy
    "Random Forest": RandomForestClassifier(n_estimators=50, max_depth=2),  # Lower trees and depth
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', max_depth=2)
}

best_model = None
best_accuracy = 0

print("\nModel Results:")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"{name}: Accuracy = {acc:.4f}")

    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model

# ================================
# Results
# ================================
print("\n🔹 Best Model:", best_model)
print("🔹 Best Accuracy:", best_accuracy)

# Classification report
print("\nClassification Report for Best Model:")
print(classification_report(y_test, best_model.predict(X_test)))


Dataset Preview:
   correct_answers  time_taken  previous_level  current_level  next_level  \
0                5    0.393939               1              2           3   
1                7    0.242424               2              3           4   
2                8    0.090909               3              4           5   
3                6    0.545455               2              3           4   
4                9    0.151515               4              5           5   

   decrease_difficulty  grade       motivation_msg  time_category  score_rate  
0                    0      2          Keep going!            0.0         0.5  
1                    0      1  You're doing great!            0.0         0.7  
2                    0      1        Almost there!            1.0         0.8  
3                    0      2        Keep pushing!            2.0         0.6  
4                    0      0      You're amazing!            1.0         0.9  

Model Results:
Logistic Regression: Acc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['noise'] = np.random.normal(0, 0.1, X.shape[0])
Parameters: { "use_label_encoder" } are not used.

