In [1]:
# import Required Modules

import pandas as pd
import math

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Reading the StudentsPerformance.csv file

df = pd.read_csv('StudentsPerformance.csv')
df_mod = df.copy()
df_mod.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [8]:
print(df_mod.shape)

(1000, 8)


# Strategy:

* We have 5 independent features and trying to analysis the 3 features such as math score, 
reading score and writing score for those we only have 1000 data points.So rather than predicting the score in each subject
we will try to predict the grade of the student.

* We will calculate the percentage mark of each student and on the basis of the percentage mark , we will devide the students into
7 categories and then with our model we will try to predict those categories.


## Calculating the percentage score by each student

In [10]:
df_mod['percentage'] = (df['math score'] + df['reading score'] + df['writing score'])/3
for i in range(0, 1000):
    df_mod['percentage'][i] = math.ceil(df_mod['percentage'][i])
df_mod.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,percentage
0,female,group B,bachelor's degree,standard,none,72,72,74,73.0
1,female,group C,some college,standard,completed,69,90,88,83.0
2,female,group B,master's degree,standard,none,90,95,93,93.0
3,male,group A,associate's degree,free/reduced,none,47,57,44,50.0
4,male,group C,some college,standard,none,76,78,75,77.0


In [11]:
def getgrade(per):
    
      if(per >= 90):
        return 'O'
      elif(per >= 80):
        return 'E'
      elif(per >= 70):
        return 'A'
      elif(per >= 60):
        return 'B'
      elif(per >= 45):
        return 'C'
      elif(per >= 30):
        return 'D'
      else :
        return 'F'
    

In [12]:
df_mod['grade'] = df_mod.apply(lambda x: getgrade(x['percentage']), axis = 1)
df_mod.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,percentage,grade
0,female,group B,bachelor's degree,standard,none,72,72,74,73.0,A
1,female,group C,some college,standard,completed,69,90,88,83.0,E
2,female,group B,master's degree,standard,none,90,95,93,93.0,O
3,male,group A,associate's degree,free/reduced,none,47,57,44,50.0,C
4,male,group C,some college,standard,none,76,78,75,77.0,A


In [13]:
df_mod['grade'].value_counts()

A    260
B    252
C    225
E    156
O     58
D     43
F      6
Name: grade, dtype: int64

### One hot Encoding of Categorical Variable:

In [14]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

df_mod['gender'] = ohe.fit_transform(df['gender'].values.reshape(-1,1)).toarray()

df_race_ohe = pd.DataFrame(ohe.fit_transform(df[['race/ethnicity']]).toarray())

df_race_ohe.columns = ohe.get_feature_names(['race/ethnicity'])

df_par_ohe = pd.DataFrame(ohe.fit_transform(df[['parental level of education']]).toarray())

df_par_ohe.columns = ohe.get_feature_names(['parental level of education'])

df_lunch_ohe = pd.DataFrame(ohe.fit_transform(df[['lunch']]).toarray())

df_lunch_ohe.columns = ohe.get_feature_names(['lunch'])

df_course_ohe = pd.DataFrame(ohe.fit_transform(df[['test preparation course']]).toarray())

df_course_ohe.columns = ohe.get_feature_names(['test preparation course'])

df_course_ohe.head()

Unnamed: 0,test preparation course_completed,test preparation course_none
0,0.0,1.0
1,1.0,0.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [18]:
list_df_oh = [df_race_ohe, df_par_ohe, df_lunch_ohe, df_course_ohe]

In [19]:
merged_df = df_mod.copy()

for _ in list_df_oh:
    merged_df = merged_df.join(_)
    

merged_df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,percentage,grade,...,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,1.0,group B,bachelor's degree,standard,none,72,72,74,73.0,A,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,group C,some college,standard,completed,69,90,88,83.0,E,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.0,group B,master's degree,standard,none,90,95,93,93.0,O,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,group A,associate's degree,free/reduced,none,47,57,44,50.0,C,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,group C,some college,standard,none,76,78,75,77.0,A,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [28]:
#df.columns

list_of_col_drop = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 
                   'math score', 'reading score', 'writing score', 'percentage']

In [31]:
final_df = merged_df.drop(columns = list_of_col_drop)
final_df.head()

Unnamed: 0,grade,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,A,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,E,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,O,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,C,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,A,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [32]:
# Saving the prepared data frame to a csv file for further proceeding 

final_df.to_csv('prepared.csv')