# importing Necessary libraries

In [191]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [192]:
# Reading csv file
df = pd.read_csv("study_performance.csv")

In [193]:
df.sample(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
742,female,group C,high school,standard,none,81,84,82
977,male,group C,associate's degree,standard,none,62,65,58
0,female,group B,bachelor's degree,standard,none,72,72,74
729,male,group C,some college,standard,none,53,39,37
416,male,group C,bachelor's degree,standard,completed,71,74,68


In [194]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [195]:
df.gender.value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [196]:
df.race_ethnicity.value_counts()

race_ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

In [197]:
df.lunch.value_counts()

lunch
standard        645
free/reduced    355
Name: count, dtype: int64

In [198]:
df.test_preparation_course.value_counts()

test_preparation_course
none         642
completed    358
Name: count, dtype: int64

In [199]:
df1 = df.copy()

In [200]:
df1.parental_level_of_education.unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

# ENCODING

In [201]:
from sklearn.preprocessing import OrdinalEncoder

In [202]:
from sklearn.model_selection import train_test_split

In [203]:
X_train,X_test,y_train,y_test = train_test_split(df1[['gender','race_ethnicity','parental_level_of_education','lunch','math_score','reading_score','writing_score']],
                                                 df1['test_preparation_course'],
                                                 test_size=3,
                                                 random_state = 0)

# ordinal encoding for ordinal features

#### "parental_level_of_education" and	"lunch" are my independent feautures beacause i am go with OrdinalEncoder

In [204]:
oe = OrdinalEncoder(categories=[['high school','some high school','some college',"bachelor's degree","associate's degree","master's degree"],
                                                
                            ['free/reduced','standard']])

# X_train

In [205]:
X_train

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,math_score,reading_score,writing_score
553,male,group D,some college,free/reduced,77,62,64
672,female,group C,some college,standard,69,78,76
971,male,group C,some high school,standard,78,72,69
27,female,group C,bachelor's degree,standard,67,69,75
231,male,group C,associate's degree,standard,46,43,42
...,...,...,...,...,...,...,...
835,female,group C,high school,standard,60,64,74
192,female,group B,some high school,standard,62,64,66
629,female,group C,some high school,standard,44,51,55
559,male,group D,some high school,standard,73,66,62


In [206]:
X_train_oe = X_train[['parental_level_of_education','lunch']]

# Training data

In [207]:
oe.fit(X_train_oe)

In [208]:
X_train_scaled_oe = oe.transform(X_train_oe)

###  After scaling the x_train data

In [209]:
X_train_scaled_oe

array([[2., 0.],
       [2., 1.],
       [1., 1.],
       ...,
       [1., 1.],
       [1., 1.],
       [2., 1.]])

# Testing data

# X_test

In [210]:
X_test

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,math_score,reading_score,writing_score
993,female,group D,bachelor's degree,free/reduced,62,72,74
859,male,group C,associate's degree,free/reduced,87,73,72
298,male,group C,high school,free/reduced,40,46,50


In [211]:
X_test_oe = X_test[['parental_level_of_education','lunch']]

In [212]:
# x_test data
oe.fit(X_test_oe)

In [213]:
X_test_scaled_oe=oe.transform(X_test_oe)

### After scaling the x_test data

In [214]:
X_test_scaled_oe

array([[3., 0.],
       [4., 0.],
       [0., 0.]])

# LabelEncoding

In [215]:
from sklearn.preprocessing import LabelEncoder

### LabelEncoder doing with independent feautures here "test_preparation_course" is independent  

In [216]:
le = LabelEncoder()

# y_train

In [217]:
# training the y_train data
le.fit(y_train)

In [218]:
le.classes_

array(['completed', 'none'], dtype=object)

In [219]:
# y_train_scaled=le.fit_transform(y_train)
y_train_scaled_le=le.transform(y_train)

In [220]:
# After scaling the y_train data
y_train_scaled_le

array([1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,

# y_test

In [221]:
y_test_scaled_le  = le.fit_transform(y_test)

In [222]:
# After scaling the y_test data
y_test_scaled_le

array([1, 1, 0])

# Nominal Encoding

### i found "gender" and "race_ethnicity" are nominal feautures 

In [223]:
from sklearn.preprocessing import OneHotEncoder

In [224]:
ohe = OneHotEncoder(drop='first',sparse_output=False,dtype=np.int32)

# X_train

In [225]:
X_train_ne = X_train[['gender','race_ethnicity']]

In [251]:
df.race_ethnicity.value_counts()

race_ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

In [226]:
ohe.fit(X_train_ne)

In [227]:
X_train_scaled_ne = ohe.fit_transform(X_train_ne)

In [228]:
X_train_scaled_ne

array([[1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 1, 0, 0, 0]])

# X_test

In [229]:
X_test_ne = X_test[['gender','race_ethnicity']]

In [230]:
ohe.fit(X_test_ne)

In [231]:
X_test_scaled_ne = ohe.fit_transform(X_test_ne)

In [232]:
X_test_scaled_ne

array([[0, 1],
       [1, 0],
       [1, 0]])

# StandardScaling

In [233]:
from sklearn.preprocessing import StandardScaler

### scaling "math_score","reading_score","writing_score" beacause those are all numerical feautures 

# X_train

In [234]:
X_train

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,math_score,reading_score,writing_score
553,male,group D,some college,free/reduced,77,62,64
672,female,group C,some college,standard,69,78,76
971,male,group C,some high school,standard,78,72,69
27,female,group C,bachelor's degree,standard,67,69,75
231,male,group C,associate's degree,standard,46,43,42
...,...,...,...,...,...,...,...
835,female,group C,high school,standard,60,64,74
192,female,group B,some high school,standard,62,64,66
629,female,group C,some high school,standard,44,51,55
559,male,group D,some high school,standard,73,66,62


In [235]:
x_train_nm = X_train[['math_score','reading_score','writing_score']]

In [236]:
x_train_nm # x_train numerical feautures

Unnamed: 0,math_score,reading_score,writing_score
553,77,62,64
672,69,78,76
971,78,72,69
27,67,69,75
231,46,43,42
...,...,...,...
835,60,64,74
192,62,64,66
629,44,51,55
559,73,66,62


In [237]:
scaler = StandardScaler()

# Training

In [238]:
scaler.fit(x_train_nm)

In [239]:
X_train_scaled_nm = scaler.transform(x_train_nm)

# After standard scaling values

In [240]:
X_train_scaled_nm

array([[ 0.72002446, -0.49230952, -0.26727623],
       [ 0.19164879,  0.60391067,  0.52227755],
       [ 0.78607141,  0.1928281 ,  0.06170451],
       ...,
       [-1.45952515, -1.2459609 , -0.85944157],
       [ 0.45583663, -0.21825447, -0.39886853],
       [-0.27067991, -0.21825447, -0.00409164]])

# X_test

In [241]:
X_test

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,math_score,reading_score,writing_score
993,female,group D,bachelor's degree,free/reduced,62,72,74
859,male,group C,associate's degree,free/reduced,87,73,72
298,male,group C,high school,free/reduced,40,46,50


In [242]:
X_test_nm = X_test[['math_score','reading_score','writing_score']]

In [243]:
x_test_nm

Unnamed: 0,math_score,reading_score,writing_score
993,62,72,74
859,87,73,72
298,40,46,50


In [244]:
scaler.fit(x_test_nm)

# Training

In [245]:
X_test_scaled_nm = scaler.transform(x_test_nm)

# After standard scaling values

In [246]:
X_test_scaled_nm

array([[-0.05208145,  0.66672593,  0.79708114],
       [ 1.24995479,  0.74673305,  0.61313934],
       [-1.19787334, -1.41345898, -1.41022048]])

In [247]:
X_train_transformed = np.concatenate((X_train_scaled_oe,X_train_scaled_ne,X_train_scaled_nm),axis=1)

In [248]:
# also the test data
X_test_transformed = np.concatenate((X_test_scaled_oe,X_test_scaled_ne,X_test_scaled_nm),axis=1)

In [249]:
X_train_transformed.shape

(997, 10)

In [250]:
X_train_transformed 

array([[ 2.        ,  0.        ,  1.        , ...,  0.72002446,
        -0.49230952, -0.26727623],
       [ 2.        ,  1.        ,  0.        , ...,  0.19164879,
         0.60391067,  0.52227755],
       [ 1.        ,  1.        ,  1.        , ...,  0.78607141,
         0.1928281 ,  0.06170451],
       ...,
       [ 1.        ,  1.        ,  0.        , ..., -1.45952515,
        -1.2459609 , -0.85944157],
       [ 1.        ,  1.        ,  1.        , ...,  0.45583663,
        -0.21825447, -0.39886853],
       [ 2.        ,  1.        ,  1.        , ..., -0.27067991,
        -0.21825447, -0.00409164]])