In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset into the notebook.
# The path is relative to the kernel's working directory: if "lms.csv" is not
# there, read_csv raises FileNotFoundError, so start the notebook next to the file.

data = pd.read_csv("lms.csv")

# Preview only the first rows instead of rendering the whole frame.
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'lms.csv'

In [None]:
# Descriptive statistics of the dataset (pandas' default describe() summary)

data_summary = data.describe()
data_summary

In [None]:
# Count the missing entries in every column (0 means the column is complete)

missing_values = data.isna().sum()
print(missing_values)

In [None]:
# Report the index labels of rows flagged by DataFrame.duplicated()
# (with its defaults, rows that repeat an earlier row across all columns)

duplicate_mask = data.duplicated()

print("\nIndices of duplicate rows:")
print(data.loc[duplicate_mask].index)

In [None]:
# Separate the predictors (X) from the value being predicted (y)

# NOTE(review): quiz_1, quiz_2, assignment, project and final_exam look like
# graded components that may add up to total_score. If so, including them leaks
# the target into the features -- confirm against the dataset definition.
feature_columns = [
    'assignments_viewed',
    'assignments_submitted',
    'quiz_started',
    'quiz_submitted',
    'quiz_reviewed',
    'quiz_viewed',
    'forums_viewed',
    'page_views',
    'resources_viewed',
    'quiz_1',
    'quiz_2',
    'assignment',
    'project',
    'final_exam',
]

X = data[feature_columns]
y = data['total_score']  # Target variable

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.

model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predict the target for the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
# Calculate regression evaluation metrics on the held-out test split

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived as the square root of MSE rather than by passing
# squared=False: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises TypeError. For the single target column used here
# the two computations give the same value.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print regression evaluation metrics
print("Regression Evaluation Metrics:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)

In [None]:
# Tabulate each feature with its fitted coefficient, largest coefficient first.
# NOTE(review): raw coefficients depend on the scale of each feature, so this
# ordering is not a statistical significance test -- interpret with care.

coef_df = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': model.coef_}
).sort_values(by='Coefficient', ascending=False)

print("\nFeature Significance:")
print(coef_df)

In [None]:
# Predicted vs. actual scores for the test split.
# Points close to the dashed red diagonal are predictions that match the actual score.

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred)

lowest, highest = min(y_test), max(y_test)
ax.plot([lowest, highest], [lowest, highest], '--', color='red')  # Perfect prediction line

ax.set_xlabel('Actual Scores')
ax.set_ylabel('Predicted Scores')
ax.set_title('Scatter Plot of Predicted vs. Actual Scores')
ax.grid(True)
plt.show()

In [None]:
# Residual plot
# A residual is the actual value minus the predicted value.
# Plotting residuals against predictions helps reveal patterns the model has not captured.

residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')  # Zero-error reference line
ax.set_xlabel('Predicted Scores')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
ax.grid(True)
plt.show()

In [None]:
# Feature importance plot
# A coefficient is the weight the model assigns to a feature (independent variable).
# Bars are ordered by absolute coefficient size, largest first, and show the signed coefficient.

sorted_indices = coef_df['Coefficient'].abs().sort_values(ascending=False).index
ranked = coef_df.loc[sorted_indices]
bar_positions = range(len(ranked))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, ranked['Coefficient'], color='skyblue')
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked['Feature'], rotation=90)
ax.set_xlabel('Feature')
ax.set_ylabel('Coefficient')
ax.set_title('Feature Importance Plot')
plt.show()


In [None]:
# Distribution of predicted scores
# Shows how the model's test-set predictions are spread across score ranges.

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(y_pred, bins=20, color='lightgreen', edgecolor='black')
ax.set_xlabel('Predicted Scores')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Predicted Scores')
ax.grid(True)
plt.show()

In [None]:
# Save the fitted model to disk using pickle.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code embedded in the file.
with open('linear_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)