In [None]:
# Colab-only setup: mount Google Drive into the runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build a synthetic student dataset: seven integer features plus a noisy,
# linearly generated MathScore target on a 0-100 scale.
np.random.seed(42)  # fixed seed so the generated data is reproducible

n = 2000  # number of synthetic rows

# Feature draws. np.random.randint excludes the upper bound, so the inclusive
# range of each feature is noted alongside. The draw order is significant:
# changing it changes the values produced under the fixed seed.
study_hours = np.random.randint(0, 16, n)       # 0..15
practice_hours = np.random.randint(0, 11, n)    # 0..10
test_prep = np.random.randint(0, 2, n)          # 0 or 1
attendance = np.random.randint(50, 101, n)      # 50..100
parent_edu = np.random.randint(0, 4, n)         # 0..3
sleep_hours = np.random.randint(4, 10, n)       # 4..9
previous_grade = np.random.randint(40, 96, n)   # 40..95

noise = np.random.normal(0, 3, n)  # Gaussian noise with standard deviation 3


def _weighted_score(study, practice, prep, attend, parent, sleep, prev_grade):
    """Return the noiseless weighted sum of the features (scalars or arrays)."""
    return (
        4 * study
        + 5 * practice
        + 8 * prep
        + 0.3 * attend
        + 6 * parent
        + 2 * sleep
        + 0.5 * prev_grade
    )


raw_score = _weighted_score(
    study_hours, practice_hours, test_prep, attendance,
    parent_edu, sleep_hours, previous_grade,
)

# All weights are positive, so the extremes of the weighted sum occur at the
# per-feature minima and maxima drawn above: [43.0, 231.5]. The midpoint of
# that range is 137.25, so clipping the raw sum straight to [0, 100] would pin
# most rows at exactly 100 and leave an almost constant target. Rescale the sum
# linearly onto 0-100 first; the relative feature weights are unchanged.
raw_min = _weighted_score(0, 0, 0, 50, 0, 4, 40)
raw_max = _weighted_score(15, 10, 1, 100, 3, 9, 95)

math_score = 100 * (raw_score - raw_min) / (raw_max - raw_min) + noise

# Noise can push scores near the extremes slightly outside the valid range.
math_score = np.clip(math_score, 0, 100)

df = pd.DataFrame({
    "StudyHours": study_hours,
    "PracticeHours": practice_hours,
    "TestPrep": test_prep,
    "AttendanceRate": attendance,
    "ParentEducationLevel": parent_edu,
    "SleepHours": sleep_hours,
    "PreviousGradeAverage": previous_grade,
    "MathScore": math_score
})

In [None]:
# Preview the first five rows of the generated frame.
df.head(5)

In [None]:
# Print a structural summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Fit a random-forest regressor on an 80/20 split and report R^2 on the
# held-out 20%.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

TARGET = "MathScore"

# Every column except the target is used as a feature.
X = df.drop(columns=[TARGET])
y = df[TARGET]

# A fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

preds = model.predict(X_test)
test_r2 = r2_score(y_test, preds)

print("R2 Score:", test_r2)


In [None]:
# Horizontal bar chart of the fitted forest's per-feature importances,
# written to feature_importance.png and then displayed.
features = X.columns
importances = model.feature_importances_

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(features, importances, color='teal')
ax.set_xlabel("Importance")
ax.set_title("Feature Importance - MathScore Prediction")
fig.tight_layout()
fig.savefig("feature_importance.png")
plt.show()


In [None]:
# Copy of the features with the model's prediction for every row of X
# (rows used for training are included) in a 'pred' column.
sample = X.assign(pred=model.predict(X))

# Predicted score against study hours, coloured by practice hours.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=sample,
    x='StudyHours',
    y='pred',
    hue='PracticeHours',
    palette='coolwarm',
    ax=ax,
)
ax.set_xlabel("Study Hours")
ax.set_ylabel("Predicted Math Score")
ax.set_title("Effect of Study Hours & Practice on MathScore")
fig.tight_layout()
fig.savefig("study_vs_score.png")
plt.show()


In [None]:
# Distribution of predicted scores for each practice-hours value, saved to
# practice_vs_score.png. Relies on `sample` (features + 'pred') built earlier.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=sample, x='PracticeHours', y='pred', ax=ax)
ax.set_xlabel("Practice Hours")
ax.set_ylabel("Predicted Math Score")
ax.set_title("Practice Hours vs Predicted MathScore")
fig.tight_layout()
fig.savefig("practice_vs_score.png")
plt.show()


In [None]:
# Pairwise correlations between the features and the model's predictions,
# drawn as an annotated heatmap and saved to correlation_heatmap.png.
corr_matrix = X.join(sample['pred']).corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation with Predicted MathScore")
fig.tight_layout()
fig.savefig("correlation_heatmap.png")
plt.show()


In [None]:
import pickle

# Persist the trained model and the feature columns it was fitted on.
# `with` guarantees each file is flushed and closed as soon as the dump
# finishes, rather than leaving open handles for the interpreter to clean up.
with open('math_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# X.columns is a pandas Index, so whoever loads this file needs pandas installed.
with open('model_columns.pkl', 'wb') as columns_file:
    pickle.dump(X.columns, columns_file)

In [None]:
# Colab-only: hand each saved pickle file to Colab's download helper.
from google.colab import files

for artifact_name in ('math_model.pkl', 'model_columns.pkl'):
    files.download(artifact_name)