In [245]:
# Load the student performance dataset from CSV into a DataFrame
import pandas as pd

csv_path = 'study_performance.csv'
students_df = pd.read_csv(csv_path)
students_df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [246]:
# Collect the names of all columns whose name ends with 'score'
score_columns = list(filter(lambda column_name: column_name.endswith('score'), students_df.columns))
score_columns

['math_score', 'reading_score', 'writing_score']

In [247]:
# Add an overall 'score' column on a 0-10 scale: the mean of the columns listed in
# score_columns divided by 10 (the displayed data suggests each column is on a 0-100 scale).
# The divisor is derived from len(score_columns) instead of a hard-coded 30, so it stays
# correct if the list of score columns ever changes.
# Note: .round() rounds exact halves to the nearest even value (e.g. 195 / 30 = 6.5 -> 6.0).
points_per_column = 10  # each score column contributes a factor of 10 to the divisor
score_divisor = points_per_column * len(score_columns)
students_df['score'] = (students_df[score_columns].sum(axis=1) / score_divisor).round()
students_df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,score
0,female,group B,bachelor's degree,standard,none,72,72,74,7.0
1,female,group C,some college,standard,completed,69,90,88,8.0
2,female,group B,master's degree,standard,none,90,95,93,9.0
3,male,group A,associate's degree,free/reduced,none,47,57,44,5.0
4,male,group C,some college,standard,none,76,78,75,8.0
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,9.0
996,male,group C,high school,free/reduced,none,62,55,55,6.0
997,female,group C,high school,free/reduced,completed,59,71,65,6.0
998,female,group D,some college,standard,completed,68,78,77,7.0


In [248]:
# Remove the individual score columns (named in score_columns); only the combined 'score' is kept
students_df = students_df.drop(score_columns, axis='columns')
students_df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,score
0,female,group B,bachelor's degree,standard,none,7.0
1,female,group C,some college,standard,completed,8.0
2,female,group B,master's degree,standard,none,9.0
3,male,group A,associate's degree,free/reduced,none,5.0
4,male,group C,some college,standard,none,8.0
...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,9.0
996,male,group C,high school,free/reduced,none,6.0
997,female,group C,high school,free/reduced,completed,6.0
998,female,group D,some college,standard,completed,7.0


In [249]:
# Transform categorical columns into numeric columns the machine can understand.
# sparse_output=False asks the encoder for a dense array instead of a sparse matrix,
# so the result can be passed straight to pd.DataFrame in the following cells.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

In [250]:
# One-hot encode 'gender' into two indicator columns: female -> (1, 0), male -> (0, 1).
# Double square brackets select a one-column DataFrame (2-D input) rather than a Series.
encoded_data = encoder.fit_transform(students_df[['gender']])
# Reuse students_df's index so that a later pd.concat(axis=1) aligns rows by label
# even if students_df does not have the default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['gender']),
    index=students_df.index,
)
encoded_df

Unnamed: 0,gender_female,gender_male
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0
...,...,...
995,1.0,0.0
996,0.0,1.0
997,1.0,0.0
998,1.0,0.0


In [251]:
# Replace the original categorical 'gender' column with its 1/0 indicator columns
students_df = pd.concat([students_df.drop(columns='gender'), encoded_df], axis=1)
students_df

Unnamed: 0,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,score,gender_female,gender_male
0,group B,bachelor's degree,standard,none,7.0,1.0,0.0
1,group C,some college,standard,completed,8.0,1.0,0.0
2,group B,master's degree,standard,none,9.0,1.0,0.0
3,group A,associate's degree,free/reduced,none,5.0,0.0,1.0
4,group C,some college,standard,none,8.0,0.0,1.0
...,...,...,...,...,...,...,...
995,group E,master's degree,standard,completed,9.0,1.0,0.0
996,group C,high school,free/reduced,none,6.0,0.0,1.0
997,group C,high school,free/reduced,completed,6.0,1.0,0.0
998,group D,some college,standard,completed,7.0,1.0,0.0


In [252]:
# 'test_preparation_course' has two categories, like gender, so it is one-hot encoded the same way.
# Double square brackets select a one-column DataFrame (2-D input) rather than a Series.
encoded_data = encoder.fit_transform(students_df[['test_preparation_course']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['test_preparation_course']),
    index=students_df.index,  # keep row labels so concat(axis=1) aligns rows correctly
)
# Swap the categorical column for its indicator columns
students_df = pd.concat([students_df.drop(columns='test_preparation_course'), encoded_df], axis=1)
students_df

Unnamed: 0,race_ethnicity,parental_level_of_education,lunch,score,gender_female,gender_male,test_preparation_course_completed,test_preparation_course_none
0,group B,bachelor's degree,standard,7.0,1.0,0.0,0.0,1.0
1,group C,some college,standard,8.0,1.0,0.0,1.0,0.0
2,group B,master's degree,standard,9.0,1.0,0.0,0.0,1.0
3,group A,associate's degree,free/reduced,5.0,0.0,1.0,0.0,1.0
4,group C,some college,standard,8.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...
995,group E,master's degree,standard,9.0,1.0,0.0,1.0,0.0
996,group C,high school,free/reduced,6.0,0.0,1.0,0.0,1.0
997,group C,high school,free/reduced,6.0,1.0,0.0,1.0,0.0
998,group D,some college,standard,7.0,1.0,0.0,1.0,0.0


In [253]:
# Inspect the distinct values of the remaining categorical columns
for categorical_column in ('lunch', 'parental_level_of_education', 'race_ethnicity'):
    print(students_df[categorical_column].unique())

['standard' 'free/reduced']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
['group B' 'group C' 'group A' 'group D' 'group E']


In [254]:
# 'lunch' also has only two categories, so it is one-hot encoded the same way as before.
# Double square brackets select a one-column DataFrame (2-D input) rather than a Series.
encoded_data = encoder.fit_transform(students_df[['lunch']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['lunch']),
    index=students_df.index,  # keep row labels so concat(axis=1) aligns rows correctly
)
# Swap the categorical column for its indicator columns
students_df = pd.concat([students_df.drop(columns='lunch'), encoded_df], axis=1)
students_df

Unnamed: 0,race_ethnicity,parental_level_of_education,score,gender_female,gender_male,test_preparation_course_completed,test_preparation_course_none,lunch_free/reduced,lunch_standard
0,group B,bachelor's degree,7.0,1.0,0.0,0.0,1.0,0.0,1.0
1,group C,some college,8.0,1.0,0.0,1.0,0.0,0.0,1.0
2,group B,master's degree,9.0,1.0,0.0,0.0,1.0,0.0,1.0
3,group A,associate's degree,5.0,0.0,1.0,0.0,1.0,1.0,0.0
4,group C,some college,8.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
995,group E,master's degree,9.0,1.0,0.0,1.0,0.0,0.0,1.0
996,group C,high school,6.0,0.0,1.0,0.0,1.0,1.0,0.0
997,group C,high school,6.0,1.0,0.0,1.0,0.0,1.0,0.0
998,group D,some college,7.0,1.0,0.0,1.0,0.0,0.0,1.0


In [255]:
# The race/ethnicity groups have no inherent order, so this column is one-hot encoded like the previous ones.
# Double square brackets select a one-column DataFrame (2-D input) rather than a Series.
encoded_data = encoder.fit_transform(students_df[['race_ethnicity']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['race_ethnicity']),
    index=students_df.index,  # keep row labels so concat(axis=1) aligns rows correctly
)
# Swap the categorical column for its indicator columns
students_df = pd.concat([students_df.drop(columns='race_ethnicity'), encoded_df], axis=1)
students_df

Unnamed: 0,parental_level_of_education,score,gender_female,gender_male,test_preparation_course_completed,test_preparation_course_none,lunch_free/reduced,lunch_standard,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E
0,bachelor's degree,7.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,some college,8.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,master's degree,9.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,associate's degree,5.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,some college,8.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,master's degree,9.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
996,high school,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
997,high school,6.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
998,some college,7.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [256]:
# Parental level of education may be an important factor and its levels have a natural order,
# so first look at how many students fall into each level
parental_education = students_df['parental_level_of_education']
parental_education.value_counts()

parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [257]:
# Encode parental education as an ordered integer code using OrdinalEncoder.
# The encoded column gets a new name so that dropping the original column does not remove it too.
# NOTE: this rebinds `encoder`, replacing the OneHotEncoder instance created earlier in the notebook.
from sklearn.preprocessing import OrdinalEncoder
# Categories listed from lowest to highest level; the encoder maps them to 0.0 .. 5.0 in this order
ordinal_categories = ["some high school", "high school", "some college", "associate's degree", "bachelor's degree", "master's degree"]
encoder = OrdinalEncoder(categories=[ordinal_categories])
encoded_data = encoder.fit_transform(students_df[['parental_level_of_education']])
df_encoded = pd.DataFrame(
    encoded_data,
    columns=['encoded_parental_level_of_education'],
    index=students_df.index,  # keep row labels so concat(axis=1) aligns rows correctly
)
# Swap the categorical column for its ordinal code
students_df = pd.concat([students_df.drop(columns='parental_level_of_education'), df_encoded], axis=1)
students_df

Unnamed: 0,score,gender_female,gender_male,test_preparation_course_completed,test_preparation_course_none,lunch_free/reduced,lunch_standard,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E,encoded_parental_level_of_education
0,7.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,4.0
1,8.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0
2,9.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,5.0
3,5.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
4,8.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,9.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,5.0
996,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
997,6.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
998,7.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0


In [273]:
# With the data frame corrected, we move on to the machine learning part
# Import models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [322]:
# Separate the target ('score') from the predictor columns
target_column = 'score'
y = students_df[target_column]
X = students_df.drop(columns=target_column)
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state acts as a seed: the same value always yields the same train/test rows,
# while a different value can change which rows land in each set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)
X_train

Unnamed: 0,gender_female,gender_male,test_preparation_course_completed,test_preparation_course_none,lunch_free/reduced,lunch_standard,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E,encoded_parental_level_of_education
108,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0
134,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0
585,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0
42,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0
990,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
48,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
772,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
824,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
207,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [324]:
# Fit a random forest regressor and a linear regression on the training data
# (fit() returns the fitted estimator itself, so construction and fitting can be chained)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

lr_model = LinearRegression().fit(X_train, y_train)
lr_model

In [326]:
# Predict on the test set and round to whole points so predictions look like real, human-assigned scores
rf_predictions, lr_predictions = (
    fitted_model.predict(X_test).round() for fitted_model in (rf_model, lr_model)
)

In [334]:
# Compare both models by mean squared error on the test set (lower is better)
rf_mse, lr_mse = (
    mean_squared_error(y_test, model_predictions)
    for model_predictions in (rf_predictions, lr_predictions)
)

for label, mse_value in (('Random Forest MSE: ', rf_mse), ('Linear Regression MSE: ', lr_mse)):
    print(label, mse_value)

Random Forest MSE:  2.225
Linear Regression MSE:  1.865


In [332]:
# Compare each model's rounded predictions with the actual scores for the first few test rows
print("\n Samplee Predictions: ")
num_samples = 10

# Slicing plus zip stops at the shortest input, so a test set with fewer than
# num_samples rows prints what is available instead of raising IndexError.
sample_rows = zip(
    rf_predictions[:num_samples],
    lr_predictions[:num_samples],
    y_test.iloc[:num_samples],
)
for rf_predicted, lr_predicted, actual in sample_rows:
    print(f" - Random Forest Predicted: {rf_predicted}, Actual: {actual}")
    print(f" - Linear Regression Predicted: {lr_predicted}, Actual: {actual}")
    print()
# Linear regression seems to do better, but in both cases the error is too high, so this procedure is not good for assessing students


 Samplee Predictions: 
 - Random Forest Predicted: 7.0, Actual: 9.0
 - Linear Regression Predicted: 7.0, Actual: 9.0

 - Random Forest Predicted: 6.0, Actual: 4.0
 - Linear Regression Predicted: 6.0, Actual: 4.0

 - Random Forest Predicted: 5.0, Actual: 7.0
 - Linear Regression Predicted: 6.0, Actual: 7.0

 - Random Forest Predicted: 5.0, Actual: 5.0
 - Linear Regression Predicted: 6.0, Actual: 5.0

 - Random Forest Predicted: 6.0, Actual: 6.0
 - Linear Regression Predicted: 6.0, Actual: 6.0

 - Random Forest Predicted: 7.0, Actual: 10.0
 - Linear Regression Predicted: 7.0, Actual: 10.0

 - Random Forest Predicted: 6.0, Actual: 7.0
 - Linear Regression Predicted: 6.0, Actual: 7.0

 - Random Forest Predicted: 7.0, Actual: 7.0
 - Linear Regression Predicted: 7.0, Actual: 7.0

 - Random Forest Predicted: 7.0, Actual: 7.0
 - Linear Regression Predicted: 7.0, Actual: 7.0

 - Random Forest Predicted: 7.0, Actual: 7.0
 - Linear Regression Predicted: 7.0, Actual: 7.0

