In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from statsmodels.api import OLS, add_constant
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import gaussian_kde

# Read the preprocessed dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/processed/cleaned_data.csv')

# Target is the 'Exam_Score' column; every other column is used as a feature
y = df['Exam_Score']
X = df.drop(columns=['Exam_Score'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Backward elimination function
def backward_elimination(X, y, significance_level=0.05):
    """Fit an OLS model and repeatedly drop the least significant feature.

    Starting from all columns of ``X`` (plus the constant added by
    ``add_constant``), the feature with the largest p-value is removed and the
    model is refit, until no remaining feature has a p-value above
    ``significance_level``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features. The caller's frame is not modified; columns are
        dropped from a local reference only.
    y : array-like
        Target values passed to ``OLS``.
    significance_level : float, default 0.05
        A feature is dropped while its p-value is strictly greater than this.

    Returns
    -------
    tuple
        ``(model, columns)`` where ``model`` is the last fitted OLS result and
        ``columns`` is the index of feature names that survived elimination.
    """
    remaining = X
    model = OLS(y, add_constant(remaining)).fit()
    print("Initial p-values:")
    print(model.pvalues)
    while True:
        # Exclude the intercept by label rather than by position: add_constant
        # may skip adding 'const' when the data already contains a constant
        # column, in which case slicing off the first entry would wrongly
        # shield a real feature from elimination.
        p_values = model.pvalues.drop('const', errors='ignore')
        if p_values.empty:
            # Every feature has been eliminated; nothing left to test.
            break
        max_p_value = p_values.max()
        print(f"Max p-value: {max_p_value:.4f}")
        if max_p_value > significance_level:
            excluded_feature = p_values.idxmax()
            print(f"Dropping feature: {excluded_feature} with p-value: {max_p_value:.4f}")
            remaining = remaining.drop(columns=[excluded_feature])
            model = OLS(y, add_constant(remaining)).fit()
        else:
            break
    return model, remaining.columns

# Run backward elimination on the training split, then show the surviving model's summary
final_model, selected_features = backward_elimination(X=X_train, y=y_train)
print(final_model.summary())

# Model evaluation function
# Module-level placeholder; evaluate_model never writes to it (its results are returned instead)
predictions = 0
def evaluate_model(model, X_test, y_test, selected_features):
    """Score a fitted model on held-out data, print the metrics and return them.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict``. If it also exposes
        ``model.exog_names`` (as statsmodels results do), the test design
        matrix is reordered/restricted to exactly those columns.
    X_test : pandas.DataFrame
        Held-out features; only ``selected_features`` are used.
    y_test : array-like
        True target values for ``X_test``.
    selected_features : sequence of str
        Names of the columns the model was fitted on (without the constant).

    Returns
    -------
    dict
        Keys ``'r_squared'``, ``'mae'``, ``'mse'``, ``'rmse'`` and
        ``'predictions'``.
    """
    X_test_filtered = X_test[selected_features]
    # Force the intercept column: the default has_constant='skip' silently omits it
    # whenever some selected column happens to be constant within the test split.
    X_test_with_const = add_constant(X_test_filtered, has_constant='add')
    # When the fitted model records its design columns, match them exactly so the
    # test matrix has the same columns in the same order as at fit time.
    exog_names = getattr(getattr(model, 'model', None), 'exog_names', None)
    if exog_names is not None:
        X_test_with_const = X_test_with_const[list(exog_names)]
    y_hat = model.predict(X_test_with_const)
    r_squared = r2_score(y_test, y_hat)
    mae = mean_absolute_error(y_test, y_hat)
    mse = mean_squared_error(y_test, y_hat)
    rmse = np.sqrt(mse)
    print("\nModel Evaluation Metrics:")
    print(f"R-squared: {r_squared:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    return {
        'r_squared': r_squared,
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'predictions': y_hat,
    }

# Report hold-out metrics for the model produced by backward elimination
evaluate_model(
    model=final_model,
    X_test=X_test,
    y_test=y_test,
    selected_features=selected_features,
)

# Plot the regression line for Attendance vs. Exam_Score
# NOTE: X and y are rebound here to the single-feature data used for this figure,
# replacing the full feature matrix and target defined earlier in the cell.
X = df[['Attendance']]
y = df['Exam_Score']
# Create a one-feature linear regression model
model = LinearRegression()
model.fit(X, y)
# Predict values for every observed row
y_pred = model.predict(X)
# Calculate point density using Gaussian KDE over the (Attendance, Exam_Score) pairs
xy = np.vstack([X['Attendance'], y])
# Evaluate the estimated density at each data point
z = gaussian_kde(xy)(xy)
# Normalize the density values for color mapping
z = (z - z.min()) / (z.max() - z.min())  # Normalize to [0, 1]
# Plotting the data points with density-based color
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X['Attendance'], y, c=z, s=100, edgecolor='black', alpha=0.7, cmap='viridis', label='Data Points', marker='o')
# Plotting the regression line through its two endpoints only: drawing it through
# every (unsorted) row makes the path double back on itself, so the dash pattern
# overlaps and the '--' style smears into an uneven solid-looking line.
line_x = pd.DataFrame({'Attendance': [X['Attendance'].min(), X['Attendance'].max()]})
line_y = model.predict(line_x)
plt.plot(line_x['Attendance'], line_y, color='tomato', linewidth=2.5, linestyle='--', label='Regression Line')
# Adding labels and title with enhanced aesthetics
plt.xlabel('Attendance (%)', fontsize=16)
plt.ylabel('Exam Score', fontsize=16)
plt.title('Regression Line with Data Points (Density Colored)', fontsize=20)
plt.legend(fontsize=14)
plt.grid(color='gray', linestyle='--', linewidth=0.5)
# Show color bar for density representation
cbar = plt.colorbar(scatter)
cbar.set_label('Point Density')
# Save the figure to disk and close it (the plot is not displayed inline)
plt.tight_layout()  # Adjust layout to prevent clipping of labels
plt.savefig('../reports/figures/regression_line.png')
plt.close()

Initial p-values:
const                          0.000000e+00
Hours_Studied                  0.000000e+00
Attendance                     0.000000e+00
Parental_Involvement          2.252884e-121
Access_to_Resources           1.045109e-123
Extracurricular_Activities     3.309220e-22
Sleep_Hours                    5.428518e-01
Previous_Scores               5.852143e-120
Motivation_Level               2.261610e-33
Internet_Access                1.221407e-17
Tutoring_Sessions              1.112693e-95
Family_Income                  3.494113e-42
Teacher_Quality                5.548306e-01
School_Type                    9.769793e-01
Peer_Influence                 8.338584e-14
Physical_Activity              2.505874e-10
Learning_Disabilities          1.156969e-20
Parental_Education_Level       1.884156e-31
Distance_from_Home             1.656248e-24
Gender                         7.878864e-01
dtype: float64
Max p-value: 0.9770
Dropping feature: School_Type with p-value: 0.9770
Max p-value: 0.7