In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

**Reading the data into a DataFrame**

In [2]:
# Load the raw student-performance records; the path is relative to the
# notebook's working directory.
csv_path = "student-performance-data.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4


**Creating a DataFrame for the features under observation**

In [3]:
# Restrict the analysis to the three candidate predictors plus the GPA column.
observed_columns = ["Age", "StudyTimeWeekly", "Absences", "GPA"]
df_observation = df.loc[:, observed_columns]

# Preview the reduced frame.
df_observation.head()

Unnamed: 0,Age,StudyTimeWeekly,Absences,GPA
0,17,19.833723,7,2.929196
1,18,15.408756,0,3.042915
2,15,4.21057,26,0.112602
3,17,10.028829,14,2.054218
4,17,4.672495,17,1.288061


**Statistical Analysis of 'df_observation'**

In [4]:
df_observation.describe()

Unnamed: 0,Age,StudyTimeWeekly,Absences,GPA
count,2392.0,2392.0,2392.0,2392.0
mean,16.468645,9.771992,14.541388,1.906186
std,1.123798,5.652774,8.467417,0.915156
min,15.0,0.001057,0.0,0.0
25%,15.0,5.043079,7.0,1.174803
50%,16.0,9.705363,15.0,1.893393
75%,17.0,14.40841,22.0,2.622216
max,18.0,19.978094,29.0,4.0


**Splitting "df_observation" into Feature Matrix & Target Vector**

In [5]:
# Name of the column the model is trained to predict.
target = "GPA"

# Feature matrix: every observed column except the target. Dropping via
# `target` (instead of repeating the literal) keeps X and y consistent if the
# target column is ever changed, so the target can never leak into X.
X = df_observation.drop(columns=target)

# Target vector: the single column named by `target`.
y = df_observation[target]

**Splitting dataset into training and test sets**

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition as a quick sanity check.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}: shape ", part.shape)

X_train: shape  (1913, 3)
X_test: shape  (479, 3)
y_train: shape  (1913,)
y_test: shape  (479,)


**Building the Model**

In [7]:
# Configure a random forest with 100 trees and a fixed random_state.
forest_settings = {"n_estimators": 100, "random_state": 42}
rf = RandomForestRegressor(**forest_settings)

# Fit the forest on the training partition only.
rf.fit(X_train, y_train)

**Extract Feature Importance from the Model**

In [8]:
# Read the importance scores exposed by the fitted forest and display them.
# The model was trained on three feature columns, and the array shown in the
# cell output holds three scores that sum to 1.
feature_importances = rf.feature_importances_
feature_importances

array([0.01772996, 0.12034362, 0.86192642])

**Print Feature Importances with Feature Names**

In [9]:
# Take the names directly from the training matrix instead of hard-coding them,
# so the labels cannot drift out of sync with the columns the model was fitted
# on (and therefore with the order of its importance scores).
feature_names = list(X_train.columns)

In [11]:
# Walk the names and scores in parallel and emit one "name: score" line each,
# with the score rendered to four decimal places.
for feature, importance in zip(feature_names, feature_importances):
    line = f"{feature}: {importance: .4f}"
    print(line)

Age:  0.0177
StudyTimeWeekly:  0.1203
Absences:  0.8619


**Interpretation of Result**

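From the result above, 'Absences' has by far the highest feature importance in the model (about 0.86), followed by 'StudyTimeWeekly' (about 0.12) and 'Age' (about 0.02). This suggests that, among the three features considered, the number of classes a student misses is the strongest predictor of GPA. Note that feature importance measures how much the model relies on a feature for prediction; it indicates association, not proof that absences cause lower academic performance.  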