In [9]:
import pandas as pd

# Small hand-made sample: five students, two numeric score columns and two
# categorical columns, used by every scaling/encoding cell below.
df = pd.DataFrame(
    {
        'student_id': [1, 2, 3, 4, 5],
        'math_score': [75, 88, 95, 65, 50],
        'english_score': [82, 79, 91, 70, 60],
        'gender': ['F', 'M', 'M', 'F', 'F'],
        'school_type': ['public', 'private', 'private', 'public', 'public'],
    }
)

df.head()

Unnamed: 0,student_id,math_score,english_score,gender,school_type
0,1,75,82,F,public
1,2,88,79,M,private
2,3,95,91,M,private
3,4,65,70,F,public
4,5,50,60,F,public


In [23]:
# Standardizing (z-score)
from sklearn.preprocessing import StandardScaler

# Create the scaler, then fit and transform in one call:
# fitting learns each score column's mean and std, transforming standardizes the values.
scaler_z = StandardScaler()
scaled_z = scaler_z.fit_transform(
    df[['math_score', 'english_score']]
)

# assign() returns a new frame, so the original df stays untouched.
df_scaled_z = df.assign(
    math_z=scaled_z[:, 0],
    english_z=scaled_z[:, 1],
)
print(df_scaled_z)

   student_id  math_score  english_score gender school_type    math_z  \
0           1          75             82      F      public  0.024872   
1           2          88             79      M     private  0.833215   
2           3          95             91      M     private  1.268476   
3           4          65             70      F      public -0.596930   
4           5          50             60      F      public -1.529633   

   english_z  
0   0.528584  
1   0.245414  
2   1.378095  
3  -0.604096  
4  -1.547997  


In [29]:
# Normalizing (0-1 interval)
from sklearn.preprocessing import MinMaxScaler

# Create the scaler, then fit and transform in one call:
# fitting learns each score column's min and max, transforming rescales to the 0-1 interval.
scaler_mm = MinMaxScaler()
scaled_mm = scaler_mm.fit_transform(
    df[['math_score', 'english_score']]
)

# assign() returns a new frame, so the original df stays untouched.
df_scaled_mm = df.assign(
    math_mm=scaled_mm[:, 0],
    english_mm=scaled_mm[:, 1],
)
print(df_scaled_mm)

   student_id  math_score  english_score gender school_type   math_mm  \
0           1          75             82      F      public  0.555556   
1           2          88             79      M     private  0.844444   
2           3          95             91      M     private  1.000000   
3           4          65             70      F      public  0.333333   
4           5          50             60      F      public  0.000000   

   english_mm  
0    0.709677  
1    0.612903  
2    1.000000  
3    0.322581  
4    0.000000  


In [73]:
# Combine the raw data with both scaled versions into one wide frame.
# concat(axis=1) aligns rows by index label, so the helper frames reuse
# df.index rather than relying on a fresh 0..n-1 RangeIndex; this keeps the
# rows lined up even if df is later filtered or re-indexed.
df_scaled = pd.concat(
    [
        df,
        pd.DataFrame(scaled_z, columns=['math_z', 'english_z'], index=df.index),
        pd.DataFrame(scaled_mm, columns=['math_mm', 'english_mm'], index=df.index),
    ],
    axis=1,
)
print(df_scaled)

   student_id  math_score  english_score gender school_type    math_z  \
0           1          75             82      F      public  0.024872   
1           2          88             79      M     private  0.833215   
2           3          95             91      M     private  1.268476   
3           4          65             70      F      public -0.596930   
4           5          50             60      F      public -1.529633   

   english_z   math_mm  english_mm  
0   0.528584  0.555556    0.709677  
1   0.245414  0.844444    0.612903  
2   1.378095  1.000000    1.000000  
3  -0.604096  0.333333    0.322581  
4  -1.547997  0.000000    0.000000  


In [75]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Create the encoder, then fit and transform in one call:
# .fit() maps each letter to an integer code, .transform() replaces the letters with those codes.
le_gender = LabelEncoder()
le_gender_encode = le_gender.fit_transform(df['gender'])

# assign() returns a new frame, so the original df stays untouched.
df_encoded_le = df.assign(gender_label=le_gender_encode)
print(df_encoded_le)

   student_id  math_score  english_score gender school_type  gender_label
0           1          75             82      F      public             0
1           2          88             79      M     private             1
2           3          95             91      M     private             1
3           4          65             70      F      public             0
4           5          50             60      F      public             0


In [81]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

# Two school types need only one indicator column, hence drop='first'.
# 'first' removes the first category in sorted order ('private'), so the column
# that remains is 'school_type_public' (1.0 = public, 0.0 = private).
ohe_school = OneHotEncoder(sparse_output = False, drop = 'first')
# Call the encoder created on the line above; the previous code referenced an
# undefined name `ohe`, which only worked with leftover kernel state.
ohe_school_encode = ohe_school.fit_transform(df[['school_type']]) # .fit() = scan all types, .transform() = turn each type into 0/1 columns

# Reuse df.index so the axis=1 concat below lines rows up by label.
encoded_df_ohe = pd.DataFrame(
    ohe_school_encode,
    columns = ohe_school.get_feature_names_out(['school_type']),
    index = df.index,
)
df_encoded_ohe = pd.concat([df, encoded_df_ohe], axis = 1)
print(df_encoded_ohe)

   student_id  math_score  english_score gender school_type  \
0           1          75             82      F      public   
1           2          88             79      M     private   
2           3          95             91      M     private   
3           4          65             70      F      public   
4           5          50             60      F      public   

   school_type_public  
0                 1.0  
1                 0.0  
2                 0.0  
3                 1.0  
4                 1.0  


In [83]:
# Combine the raw data with the label-encoded gender and the one-hot school type.
# concat(axis=1) aligns rows by index label, so both helper frames are pinned
# to df.index instead of depending on matching default RangeIndexes.
df_encoded = pd.concat(
    [
        df,
        pd.DataFrame(le_gender_encode, columns=['gender_label'], index=df.index),
        encoded_df_ohe.set_axis(df.index, axis=0),
    ],
    axis=1,
)
print(df_encoded)

   student_id  math_score  english_score gender school_type  gender_label  \
0           1          75             82      F      public             0   
1           2          88             79      M     private             1   
2           3          95             91      M     private             1   
3           4          65             70      F      public             0   
4           5          50             60      F      public             0   

   school_type_public  
0                 1.0  
1                 0.0  
2                 0.0  
3                 1.0  
4                 1.0  
