## Data Import and Loading

In [None]:
import kagglehub

# Download the student-performance dataset through KaggleHub and keep the
# value returned by the download call for the loading step.
path = kagglehub.dataset_download("lainguyn123/student-performance-factors")
print(f"Path to dataset files: {path}")

In [None]:
import os

import pandas as pd

# Show what was downloaded, then load the CSV into a DataFrame.
print(os.listdir(path))
csv_path = os.path.join(path, "StudentPerformanceFactors.csv")
df = pd.read_csv(csv_path, sep=',')

## Data Cleaning and Preprocessing

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Columns used for the score analysis; 'Exam_Score' is the prediction target.
score_features = [
    'Attendance',
    'Previous_Scores',
    'Hours_Studied',
    'Exam_Score',
]

In [None]:
# Take an explicit copy of the selected columns so that later in-place
# cleaning (e.g. dropping rows with missing values) works on an independent
# frame and does not trigger pandas' SettingWithCopyWarning.
score_df = df[score_features].copy()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the pairwise correlations between the numeric score columns,
# with the coefficient written in each cell.
numeric_df = score_df.select_dtypes(include=['number'])
corr_matrix = numeric_df.corr()
sns.heatmap(data=corr_matrix, annot=True)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Every column except the target is plotted against Exam_Score.
features = [col for col in score_df.columns if col != 'Exam_Score']

# Derive the subplot grid from the number of features instead of hard-coding
# 2x3: a fixed grid leaves an empty row when there are three or fewer
# features and raises ValueError once there are more than six.
n_cols = 3
n_rows = max(1, (len(features) + n_cols - 1) // n_cols)  # ceiling division
plt.figure(figsize=(5 * n_cols, 5 * n_rows))  # 5x5 inches per subplot

for i, feature in enumerate(features, 1):
    plt.subplot(n_rows, n_cols, i)
    plt.scatter(score_df[feature], score_df['Exam_Score'], alpha=0.6)
    plt.xlabel(feature)
    plt.ylabel('Exam_Score')
    plt.title(f'{feature} vs Exam_Score')

plt.tight_layout()
plt.show()

## Model Training

In [None]:
from sklearn.model_selection import train_test_split  # split the data into training and testing sets

# Drop rows with missing values by rebinding the name rather than using
# inplace=True: score_df was derived from df by column selection, and
# mutating such a frame in place can raise SettingWithCopyWarning.
score_df = score_df.dropna(axis=0)

# Predictors are whatever remains after removing the target and the two other
# score columns; Exam_Score is the target.
X = score_df.drop(columns=['Exam_Score', 'Previous_Scores', 'Attendance'])
Y = score_df['Exam_Score']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [None]:
# Summary statistics of the predictor frame X.
X.describe()

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: a linear regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict exam scores for the held-out test rows with the fitted linear model.
predictions = model.predict(X_test)

In [None]:
# Preview the first five test rows that the predictions correspond to.
X_test.head(n=5)

## Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Compare the first few predictions of the linear model with the true scores.
print("Predictions for the first 5 students:")
print(predictions[:5])
print("Actual scores for the first 5 students:")
print(Y_test.head().values)

# Report each error metric of the linear model on the test split.
for label, metric in (
    ("Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error:", mean_absolute_error),
    ("R-squared:", r2_score),
):
    print(label, metric(Y_test, predictions))


## Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors into polynomial terms up to degree 3, without the
# constant (bias) column. The expansion is fitted on the training split only
# and then applied to both splits.
poly_features = PolynomialFeatures(include_bias=False, degree=3)
poly_features.fit(X_train)
X_poly_train = poly_features.transform(X_train)
X_poly_test = poly_features.transform(X_test)

# A linear regression on the expanded features gives the polynomial model.
poly_model = LinearRegression()
poly_model.fit(X=X_poly_train, y=Y_train)

In [None]:
# Predict exam scores for the polynomial-expanded test features.
poly_predictions = poly_model.predict(X_poly_test)

## Comparing Linear and Polynomial Regression

In [None]:
# Compare the first few predictions of the polynomial model with the true scores.
print("Predictions for the first 5 students:")
print(poly_predictions[:5])
print("Actual scores for the first 5 students:")
print(Y_test.head().values)

# Report each error metric of the polynomial model on the test split.
for label, metric in (
    ("Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error:", mean_absolute_error),
    ("R-squared:", r2_score),
):
    print(label, metric(Y_test, poly_predictions))


In [None]:
# Print the same three metrics for both models so they can be compared
# directly; the leading newline in the second heading separates the sections.
for heading, preds in (
    ("Linear Regression:", predictions),
    ("\nPolynomial Regression:", poly_predictions),
):
    print(heading)
    print("MSE:", mean_squared_error(Y_test, preds))
    print("MAE:", mean_absolute_error(Y_test, preds))
    print("R-squared:", r2_score(Y_test, preds))


In [None]:
import matplotlib.pyplot as plt
import numpy as np

hours_test = X_test['Hours_Studied'].values
actual_scores = Y_test.values
predicted_scores = poly_predictions

# Order all three arrays by hours studied so the prediction is drawn as one
# left-to-right line instead of zig-zagging between unsorted points.
sort_index = np.argsort(hours_test)
hours_sorted, actual_sorted, predicted_sorted = (
    values[sort_index]
    for values in (hours_test, actual_scores, predicted_scores)
)

# Actual scores as points, polynomial predictions as a line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(hours_sorted, actual_sorted, color='blue', label='Actual Scores')
ax.plot(hours_sorted, predicted_sorted, color='red', label='Polynomial Prediction', linewidth=2)
ax.set_xlabel('Hours Studied')
ax.set_ylabel('Exam Score')
ax.set_title('Polynomial Regression: Exam Score vs Hours Studied')
ax.legend()
plt.show()

In [None]:
hours_test = X_test['Hours_Studied'].values
actual_scores = Y_test.values
predicted_scores = predictions

# Order all three arrays by hours studied so the prediction line is drawn
# from left to right.
sort_index = np.argsort(hours_test)
hours_sorted, actual_sorted, predicted_sorted = (
    values[sort_index]
    for values in (hours_test, actual_scores, predicted_scores)
)

# Actual scores as points, linear predictions as a line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(hours_sorted, actual_sorted, color='blue', label='Actual Scores')
ax.plot(hours_sorted, predicted_sorted, color='green', label='Linear Prediction', linewidth=2)
ax.set_xlabel('Hours Studied')
ax.set_ylabel('Exam Score')
ax.set_title('Linear Regression: Exam Score vs Hours Studied')
ax.legend()
plt.show()