# Loading the dataset

In [17]:
import pandas as pd

# Location of the raw CSV. The default is the original hard-coded local path
# (note the literal contains a space before the final underscore); set the
# STUDENT_DATA_CSV environment variable to load the file from another location
# without editing the notebook.
import os

DATA_CSV_PATH = os.environ.get(
    "STUDENT_DATA_CSV",
    "D://Student_Performance//Student_performance_data _.csv",
)
student_df = pd.read_csv(DATA_CSV_PATH)

In [18]:
# Show the column labels of the freshly loaded frame to confirm its schema.
column_labels = student_df.columns
print(column_labels)

Index(['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
       'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA',
       'GradeClass'],
      dtype='object')


In [19]:
# Report the frame's dimensions as a (rows, columns) tuple.
row_count, column_count = student_df.shape
print((row_count, column_count))

(2392, 15)


In [20]:
# Preview the first five rows. `head` is a method and has to be called:
# printing the bare attribute shows the bound-method repr (which embeds the
# whole frame) rather than a five-row preview.
print(student_df.head())

<bound method NDFrame.head of       StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
0          1001   17       1          0                  2        19.833723   
1          1002   18       0          0                  1        15.408756   
2          1003   15       0          2                  3         4.210570   
3          1004   17       1          0                  3        10.028829   
4          1005   17       1          0                  2         4.672495   
...         ...  ...     ...        ...                ...              ...   
2387       3388   18       1          0                  3        10.680555   
2388       3389   17       0          0                  1         7.583217   
2389       3390   16       1          0                  2         6.805500   
2390       3391   16       1          1                  0        12.416653   
2391       3392   16       1          0                  2        17.819907   

      Absences  Tutor

In [21]:
# Remove GradeClass so it is not carried into the feature matrix built later.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
student_df = student_df.drop(columns='GradeClass', errors='ignore')

In [22]:
# Re-list the column labels to verify which columns remain at this point.
remaining_columns = student_df.columns
print(remaining_columns)

Index(['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
       'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA'],
      dtype='object')


# Check for missing values

In [23]:
# Count null entries per column: build the boolean null mask first, then sum
# it column-wise (True counts as 1).
null_mask = student_df.isnull()
missing_values = null_mask.sum()
print(missing_values)


StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
dtype: int64


In [24]:
# Number of distinct values in each column (per-column, NaN excluded -- the
# pandas defaults, spelled out here for the reader).
unique_counts = student_df.nunique(axis=0, dropna=True)
print(unique_counts)


StudentID            2392
Age                     4
Gender                  2
Ethnicity               4
ParentalEducation       5
StudyTimeWeekly      2392
Absences               30
Tutoring                2
ParentalSupport         5
Extracurricular         2
Sports                  2
Music                   2
Volunteering            2
GPA                  2371
dtype: int64


<u>Demographic Details</u><br>
<b>Age</b>: The age of the students ranges from 15 to 18 years.<br>
<b>Gender</b>: Gender of the students, where 0 represents Male and 1 represents Female.<br>
<b>Ethnicity</b>: The ethnicity of the students, coded as follows:<br>
0: Caucasian
1: African American
2: Asian
3: Other<br>
<b>ParentalEducation</b>: The education level of the parents, coded as follows:<br>
0: None
1: High School
2: Some College
3: Bachelor's
4: Higher<br>
<u>Study Habits</u><br>
<b>StudyTimeWeekly</b>: Weekly study time in hours, ranging from 0 to 20.<br>
<b>Absences</b>: Number of absences during the school year, ranging from 0 to 30 (the values observed in this dataset are 0 to 29).<br>
<b>Tutoring</b>: Tutoring status, where 0 indicates No and 1 indicates Yes.<br>
<u>Parental Involvement</u><br>
<b>ParentalSupport</b>: The level of parental support, coded as follows:<br>
0: None
1: Low
2: Moderate
3: High
4: Very High<br>
<u>Extracurricular Activities</u><br>
<b>Extracurricular</b>: Participation in extracurricular activities, where 0 indicates No and 1 indicates Yes.<br>
<b>Sports</b>: Participation in sports, where 0 indicates No and 1 indicates Yes.<br>
<b>Music</b>: Participation in music activities, where 0 indicates No and 1 indicates Yes.<br>
<b>Volunteering</b>: Participation in volunteering, where 0 indicates No and 1 indicates Yes.

In [25]:
# Print the distinct values of every column so the coded categories can be
# checked against the data dictionary. Iterating a DataFrame yields its
# column labels.
for column in student_df:
    print(f"{column}: {student_df[column].unique()}")


StudentID: [1001 1002 1003 ... 3390 3391 3392]
Age: [17 18 15 16]
Gender: [1 0]
Ethnicity: [0 2 1 3]
ParentalEducation: [2 1 3 4 0]
StudyTimeWeekly: [19.83372281 15.40875606  4.21056977 ...  6.80549964 12.41665266
 17.81990749]
Absences: [ 7  0 26 14 17 10 22  1 11 15 21  9 16 29  2 25 20  5  8 12 27 23  3 13
  6 28  4 18 19 24]
Tutoring: [1 0]
ParentalSupport: [2 1 3 4 0]
Extracurricular: [0 1]
Sports: [0 1]
Music: [1 0]
Volunteering: [0 1]
GPA: [2.92919559 3.04291483 0.11260225 ... 1.14233288 1.80329676 2.14001388]


# Using GPA as the target variable

In [27]:
# Separate the regression target from the predictors. StudentID is excluded
# from the features together with the target itself.
target_column = 'GPA'
non_feature_columns = [target_column, 'StudentID']

X = student_df.drop(columns=non_feature_columns)  # Feature columns
y = student_df[target_column]                     # Target column

# Using RandomForestRegressor to measure feature importance

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the complete feature matrix (no train/test split at
# this stage); the fixed seed makes the importances reproducible.
rf = RandomForestRegressor(random_state=42).fit(X, y)

# Pair each feature name with its importance and rank from most to least
# important. The original positional index is kept, so the left-hand index in
# the printout is the feature's column position in X.
feature_importance = rf.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table
print(importance_df)

              Feature  Importance
5            Absences    0.853711
4     StudyTimeWeekly    0.056966
7     ParentalSupport    0.034201
6            Tutoring    0.012999
8     Extracurricular    0.008380
9              Sports    0.007364
0                 Age    0.006554
3   ParentalEducation    0.006382
2           Ethnicity    0.005112
10              Music    0.003896
1              Gender    0.002578
11       Volunteering    0.001856


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the seed fixes which rows land in
# each split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of every piece of the split.
for label, part in (
    ("Training data shape:", X_train),
    ("Test data shape:", X_test),
    ("Training target shape:", y_train),
    ("Test target shape:", y_test),
):
    print(label, part.shape)

Training data shape: (1913, 12)
Test data shape: (479, 12)
Training target shape: (1913,)
Test target shape: (479,)


# Using Linear Regression to create the ML model
GPA is a continuous target, so this is a regression problem. Linear Regression is a simple, fast, and interpretable model for predicting it.

In [30]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model and fit it on the training split in one
# step (`fit` returns the fitted estimator itself).
gpa_model = LinearRegression().fit(X_train, y_train)

# Show the learned parameters: the intercept, then one coefficient per
# feature column of X_train.
for label, value in (
    ("Intercept:", gpa_model.intercept_),
    ("Coefficients:", gpa_model.coef_),
):
    print(label, value)


Intercept: 2.6051186552967116
Coefficients: [-5.73424324e-03  1.05158594e-02  4.74448753e-03  1.25102133e-04
  2.90388035e-02 -9.95173088e-02  2.58261545e-01  1.47838494e-01
  1.89546339e-01  1.84990827e-01  1.52484274e-01 -5.28171569e-03]


In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict GPA for the held-out rows.
y_pred = gpa_model.predict(X_test)

# Score the predictions: MAE is in GPA points, R² is the share of variance
# explained.
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("R-squared (R²):", r2)


Mean Absolute Error (MAE): 0.15529483485748088
R-squared (R²): 0.9532471681022928


In [32]:
import numpy as np

# Keep only the test rows whose true GPA is non-zero, since the percentage
# error divides by the true value.
non_zero_indices = y_test != 0
y_test_non_zero, y_pred_non_zero = y_test[non_zero_indices], y_pred[non_zero_indices]

# Mean Absolute Percentage Error (MAPE) and the derived "accuracy" figure.
# NOTE(review): 100 - MAPE is not a classification accuracy and can go
# negative; true GPAs close to (but not exactly) zero still inflate MAPE.
if len(y_test_non_zero) == 0:
    # Nothing left to average over: mark both figures as undefined.
    mape, accuracy = np.inf, -np.inf
else:
    relative_errors = np.abs((y_test_non_zero - y_pred_non_zero) / y_test_non_zero)
    mape = np.mean(relative_errors) * 100
    accuracy = 100 - mape

print("Mean Absolute Percentage Error (MAPE):", mape)
print("Model Accuracy (%):", accuracy)


Mean Absolute Percentage Error (MAPE): 15.052965225012446
Model Accuracy (%): 84.94703477498756


In [33]:
import pickle
# Serialize the fitted regression model to disk in binary mode; the file is
# written relative to the notebook's working directory.
with open("gpa_predictor_model.pkl", "wb") as model_file:
    pickle.dump(gpa_model, model_file)

# Confirm the save for the reader.
print("Model saved as gpa_predictor_model.pkl")

Model saved as gpa_predictor_model.pkl
