In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [22]:
df = pd.read_csv("./StudentsPerformance.csv")
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,,57.0,44.0
4,male,group C,some college,standard,none,76.0,78.0,75.0
5,female,group B,associate's degree,standard,none,71.0,83.0,78.0
6,female,group B,some college,standard,completed,88.0,95.0,92.0
7,male,group B,some college,free/reduced,none,40.0,43.0,39.0
8,male,group D,high school,free/reduced,completed,64.0,64.0,67.0
9,female,group B,high school,free/reduced,,38.0,,50.0


In [23]:
df.shape

(50, 8)

In [24]:
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        2
math score                     3
reading score                  7
writing score                  1
dtype: int64

In [25]:
df.dtypes

gender                          object
race/ethnicity                  object
parental level of education     object
lunch                           object
test preparation course         object
math score                     float64
reading score                   object
writing score                  float64
dtype: object

In [26]:
df_obj = df.select_dtypes('object')
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

In [27]:
null_rows = df.isnull().any(axis=1)
df = df.replace(np.nan, 0.0)

In [28]:
df["reading score"] = df["reading score"].replace("","0")
df["reading score"] = df["reading score"].astype(float)
df["test preparation course"].replace(0.0, "none", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["test preparation course"].replace(0.0, "none", inplace=True)


In [29]:
df[null_rows == True]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
1,female,group C,some college,standard,completed,69.0,0.0,88.0
3,male,group A,associate's degree,free/reduced,none,0.0,57.0,44.0
9,female,group B,high school,free/reduced,none,38.0,0.0,50.0
18,male,group C,master's degree,free/reduced,completed,46.0,0.0,46.0
19,female,group C,associate's degree,free/reduced,none,58.0,0.0,61.0
23,female,group C,some high school,standard,none,69.0,73.0,0.0
28,male,group C,high school,standard,none,70.0,0.0,65.0
29,female,group D,master’s degree,standard,none,0.0,0.0,75.0
30,female,group D,some college,standard,none,0.0,0.0,74.0
44,female,group E,associate’s degree,free/reduced,none,50.0,0.0,54.0


In [30]:
le = LabelEncoder()
gender_label = le.fit_transform(df['gender'])
df["gender"] = gender_label
print(gender_label)
df.head()

[0 0 0 1 1 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 1 1 1 0
 0 0 1 1 0 0 1 0 1 0 0 0 1]


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,0,group C,some college,standard,completed,69.0,0.0,88.0
2,0,group B,master's degree,standard,none,90.0,95.0,93.0
3,1,group A,associate's degree,free/reduced,none,0.0,57.0,44.0
4,1,group C,some college,standard,none,76.0,78.0,75.0


In [31]:
race_label = le.fit_transform(df["race/ethnicity"])
test_label = le.fit_transform(df["test preparation course"])
test_label = list(map(lambda x: 1 if x == 0 else 0, test_label))
df["race/ethnicity"] = race_label
df["test preparation course"] = test_label
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,bachelor's degree,standard,0,72.0,72.0,74.0
1,0,2,some college,standard,1,69.0,0.0,88.0
2,0,1,master's degree,standard,0,90.0,95.0,93.0
3,1,0,associate's degree,free/reduced,0,0.0,57.0,44.0
4,1,2,some college,standard,0,76.0,78.0,75.0


In [32]:
df.drop(columns=['parental level of education', 'lunch'],axis=1,inplace=True)
df.head()

Unnamed: 0,gender,race/ethnicity,test preparation course,math score,reading score,writing score
0,0,1,0,72.0,72.0,74.0
1,0,2,1,69.0,0.0,88.0
2,0,1,0,90.0,95.0,93.0
3,1,0,0,0.0,57.0,44.0
4,1,2,0,76.0,78.0,75.0


In [None]:
scaler = MinMaxScaler()
mmscaled = scaler.fit(df)
