# Data Preprocessing 

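Data preprocessing is the process of preparing raw data so that it is suitable for machine learning models. It is the first step in building a machine-learning model, and it is often the most complex and time-consuming part of a data science project. Preprocessing is required because it reduces the complexity the learning algorithm has to deal with: missing values are filled in, categorical values are encoded as numbers, and numeric features are brought to a comparable scale.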


## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the raw table, then split it by position: every column except the last
# becomes the feature matrix X, and the last column becomes the target vector y.
dataset = pd.read_csv('gender.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [4]:
# Preview the first five rows (five is the default row count for head()).
dataset.head()

Unnamed: 0,Age,Height (cm),Education Level,Gender
0,35.0,178.0,Master's Degree,male
1,28.0,165.0,Bachelor's Degree,female
2,42.0,185.0,Doctorate Degree,male
3,31.0,163.0,Associate's Degree,female
4,30.0,182.0,Bachelor's Degree,male


In [3]:
# Inspect the raw feature matrix (all dataset columns except the last one).
print(X)

[[35.0 178.0 "Master's Degree"]
 [28.0 165.0 "Bachelor's Degree"]
 [42.0 185.0 'Doctorate Degree']
 [31.0 163.0 "Associate's Degree"]
 [30.0 182.0 "Bachelor's Degree"]
 [nan 170.0 "Bachelor's Degree"]
 [44.0 188.0 "Master's Degree"]
 [29.0 167.0 "Bachelor's Degree"]
 [37.0 179.0 "Bachelor's Degree"]
 [26.0 162.0 "Bachelor's Degree"]
 [43.0 nan "Bachelor's Degree"]
 [34.0 168.0 "Associate's Degree"]
 [36.0 174.0 "Bachelor's Degree"]
 [27.0 166.0 "Bachelor's Degree"]
 [41.0 180.0 'Doctorate Degree']
 [30.0 170.0 "Bachelor's Degree"]
 [38.0 nan "Bachelor's Degree"]
 [29.0 164.0 'Doctorate Degree']
 [40.0 182.0 "Master's Degree"]
 [33.0 169.0 "Bachelor's Degree"]
 [39.0 181.0 "Bachelor's Degree"]
 [32.0 168.0 "Bachelor's Degree"]
 [nan 186.0 "Master's Degree"]
 [28.0 163.0 "Associate's Degree"]
 [34.0 177.0 "Bachelor's Degree"]
 [31.0 166.0 "Bachelor's Degree"]
 [42.0 184.0 "Bachelor's Degree"]
 [30.0 170.0 "Associate's Degree"]
 [37.0 179.0 "Bachelor's Degree"]
 [37.0 175.0 "Bachelor's De

In [4]:
# Inspect the raw target vector (the last dataset column).
print(y)

['male' 'female' 'male' 'female' 'male' 'female' 'male' 'female' 'male'
 'female' 'male' 'female' 'male' 'female' 'male' 'female' 'male' 'female'
 'male' 'female' 'male' 'female' 'male' 'female' 'male' 'female' 'male'
 'female' 'male' 'male' 'female' 'male' 'female' 'male' 'female' 'male'
 'female' 'male' 'female' 'male' 'female' 'male' 'female' 'male' 'female'
 'male' 'female' 'male' 'female' 'male' 'female' 'male' 'female' 'male'
 'male' 'female' 'male' 'female' 'male' 'female' 'male' 'female' 'male'
 'female' 'male' 'female' 'male' 'female' 'male' 'female' 'male' 'female'
 'male' 'female' 'male' 'female' 'male' 'female' 'male' 'female']


In [5]:
# List the distinct class labels that occur in the target vector.
np.unique(y)

array(['female', 'male'], dtype=object)

## Taking care of missing data

In [6]:
# Boolean mask with the same shape as X; True marks a missing entry.
X_null_values = pd.isna(X)

In [7]:
# Show the missing-value mask for X; each True is an entry that needs imputing.
print(X_null_values)

[[False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [ True False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False  True False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False  True False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [ True False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False

In [8]:
# Boolean mask over the target vector; True marks a missing label.
y_null_values = pd.isna(y)

In [9]:
# Show the missing-value mask for y.
print(y_null_values)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False]


In [24]:
# Replace missing numeric entries with the mean of their column.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# further down, so the column means also reflect rows that end up in the test set.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Columns 0 and 1 are Age and Height; the remaining column is left untouched.
X[:, :2] = imputer.fit_transform(X[:, :2])

In [11]:
# Re-inspect X; after mean imputation the first two columns should contain no NaN.
print(X)

[[35.0 178.0 "Master's Degree"]
 [28.0 165.0 "Bachelor's Degree"]
 [42.0 185.0 'Doctorate Degree']
 [31.0 163.0 "Associate's Degree"]
 [30.0 182.0 "Bachelor's Degree"]
 [34.52564102564103 170.0 "Bachelor's Degree"]
 [44.0 188.0 "Master's Degree"]
 [29.0 167.0 "Bachelor's Degree"]
 [37.0 179.0 "Bachelor's Degree"]
 [26.0 162.0 "Bachelor's Degree"]
 [43.0 173.07692307692307 "Bachelor's Degree"]
 [34.0 168.0 "Associate's Degree"]
 [36.0 174.0 "Bachelor's Degree"]
 [27.0 166.0 "Bachelor's Degree"]
 [41.0 180.0 'Doctorate Degree']
 [30.0 170.0 "Bachelor's Degree"]
 [38.0 173.07692307692307 "Bachelor's Degree"]
 [29.0 164.0 'Doctorate Degree']
 [40.0 182.0 "Master's Degree"]
 [33.0 169.0 "Bachelor's Degree"]
 [39.0 181.0 "Bachelor's Degree"]
 [32.0 168.0 "Bachelor's Degree"]
 [34.52564102564103 186.0 "Master's Degree"]
 [28.0 163.0 "Associate's Degree"]
 [34.0 177.0 "Bachelor's Degree"]
 [31.0 166.0 "Bachelor's Degree"]
 [42.0 184.0 "Bachelor's Degree"]
 [30.0 170.0 "Associate's Degree"]
 [3

## Encoding categorical data

### Encoding the Independent Variable

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 2 (Education Level) and pass the other columns through
# unchanged; the encoded columns come first in the result.
# sparse_threshold=0 forces a dense array. OneHotEncoder emits a sparse matrix by
# default, and with the default threshold (0.3) the stacked result becomes sparse
# whenever the overall density is low. That would break the in-place column
# slicing and assignment used by the feature-scaling step.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [2])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = ct.fit_transform(X)

In [13]:
# Inspect X after encoding: one-hot columns first, then the passthrough columns.
print(X)

[[0.0 0.0 0.0 1.0 35.0 178.0]
 [0.0 1.0 0.0 0.0 28.0 165.0]
 [0.0 0.0 1.0 0.0 42.0 185.0]
 [1.0 0.0 0.0 0.0 31.0 163.0]
 [0.0 1.0 0.0 0.0 30.0 182.0]
 [0.0 1.0 0.0 0.0 34.52564102564103 170.0]
 [0.0 0.0 0.0 1.0 44.0 188.0]
 [0.0 1.0 0.0 0.0 29.0 167.0]
 [0.0 1.0 0.0 0.0 37.0 179.0]
 [0.0 1.0 0.0 0.0 26.0 162.0]
 [0.0 1.0 0.0 0.0 43.0 173.07692307692307]
 [1.0 0.0 0.0 0.0 34.0 168.0]
 [0.0 1.0 0.0 0.0 36.0 174.0]
 [0.0 1.0 0.0 0.0 27.0 166.0]
 [0.0 0.0 1.0 0.0 41.0 180.0]
 [0.0 1.0 0.0 0.0 30.0 170.0]
 [0.0 1.0 0.0 0.0 38.0 173.07692307692307]
 [0.0 0.0 1.0 0.0 29.0 164.0]
 [0.0 0.0 0.0 1.0 40.0 182.0]
 [0.0 1.0 0.0 0.0 33.0 169.0]
 [0.0 1.0 0.0 0.0 39.0 181.0]
 [0.0 1.0 0.0 0.0 32.0 168.0]
 [0.0 0.0 0.0 1.0 34.52564102564103 186.0]
 [1.0 0.0 0.0 0.0 28.0 163.0]
 [0.0 1.0 0.0 0.0 34.0 177.0]
 [0.0 1.0 0.0 0.0 31.0 166.0]
 [0.0 1.0 0.0 0.0 42.0 184.0]
 [1.0 0.0 0.0 0.0 30.0 170.0]
 [0.0 1.0 0.0 0.0 37.0 179.0]
 [0.0 1.0 0.0 0.0 37.0 175.0]
 [0.0 1.0 0.0 0.0 27.0 160.0]
 [0.0 0.0 1.0 0.0 

### Encoding the Dependent Variable

In [14]:
from sklearn.preprocessing import LabelEncoder
# Map the string class labels to integer codes. `le` is kept so the mapping
# can be inspected or inverted later.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [15]:
# Inspect the target after label encoding (integer class codes).
print(y)

[1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0
 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
 1 0 1 0 1 0]


## Splitting the dataset into the Training set and Test set

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [17]:
# Inspect the training features produced by the split.
print(X_train)

[[0.0 1.0 0.0 0.0 28.0 162.0]
 [0.0 0.0 1.0 0.0 42.0 185.0]
 [0.0 0.0 0.0 1.0 43.0 183.0]
 [0.0 1.0 0.0 0.0 39.0 178.0]
 [0.0 1.0 0.0 0.0 35.0 177.0]
 [0.0 1.0 0.0 0.0 30.0 170.0]
 [0.0 1.0 0.0 0.0 35.0 178.0]
 [0.0 0.0 1.0 0.0 44.0 182.0]
 [0.0 1.0 0.0 0.0 33.0 169.0]
 [1.0 0.0 0.0 0.0 29.0 160.0]
 [1.0 0.0 0.0 0.0 27.0 162.0]
 [0.0 1.0 0.0 0.0 28.0 162.0]
 [0.0 1.0 0.0 0.0 42.0 184.0]
 [0.0 1.0 0.0 0.0 33.0 175.0]
 [0.0 1.0 0.0 0.0 33.0 169.0]
 [0.0 1.0 0.0 0.0 34.0 168.0]
 [0.0 1.0 0.0 0.0 30.0 163.0]
 [0.0 0.0 0.0 1.0 42.0 183.0]
 [0.0 1.0 0.0 0.0 29.0 164.0]
 [0.0 1.0 0.0 0.0 32.0 168.0]
 [0.0 1.0 0.0 0.0 37.0 180.0]
 [1.0 0.0 0.0 0.0 31.0 163.0]
 [0.0 1.0 0.0 0.0 31.0 167.0]
 [0.0 1.0 0.0 0.0 32.0 178.0]
 [0.0 0.0 1.0 0.0 42.0 184.0]
 [0.0 1.0 0.0 0.0 37.0 179.0]
 [0.0 1.0 0.0 0.0 34.0 177.0]
 [0.0 1.0 0.0 0.0 37.0 179.0]
 [0.0 0.0 1.0 0.0 29.0 164.0]
 [0.0 0.0 0.0 1.0 35.0 178.0]
 [0.0 0.0 1.0 0.0 29.0 165.0]
 [1.0 0.0 0.0 0.0 28.0 163.0]
 [0.0 1.0 0.0 0.0 29.0 167.0]
 [0.0 1.0 

In [18]:
# Inspect the held-out test features produced by the split.
print(X_test)

[[0.0 1.0 0.0 0.0 31.0 166.0]
 [1.0 0.0 0.0 0.0 30.0 170.0]
 [0.0 0.0 1.0 0.0 44.0 182.0]
 [0.0 1.0 0.0 0.0 34.0 167.0]
 [0.0 0.0 1.0 0.0 29.0 166.0]
 [0.0 0.0 0.0 1.0 40.0 183.0]
 [0.0 1.0 0.0 0.0 34.0 176.0]
 [0.0 0.0 0.0 1.0 43.0 184.0]
 [0.0 1.0 0.0 0.0 41.0 181.0]
 [0.0 1.0 0.0 0.0 31.0 168.0]
 [0.0 1.0 0.0 0.0 29.0 165.0]
 [0.0 1.0 0.0 0.0 36.0 175.0]
 [1.0 0.0 0.0 0.0 31.0 170.0]
 [0.0 1.0 0.0 0.0 33.0 170.0]
 [0.0 1.0 0.0 0.0 38.0 179.0]
 [0.0 1.0 0.0 0.0 43.0 173.07692307692307]]


In [19]:
# Inspect the training labels produced by the split.
print(y_train)

[0 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1
 0 0 0 1 1 0 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 0 0 0 1 1 1]


In [20]:
# Inspect the held-out test labels produced by the split.
print(y_test)

[0 0 1 0 0 1 1 1 1 0 0 1 0 0 1 1]


## Feature Scaling

In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Only columns 4 onward are scaled; columns 0-3 are the one-hot encoded
# columns and are left as they are.
# Learn the scaling statistics from the training rows only...
sc.fit(X_train[:, 4:])
X_train[:, 4:] = sc.transform(X_train[:, 4:])
# ...and reuse those same training statistics for the test rows.
X_test[:, 4:] = sc.transform(X_test[:, 4:])

In [22]:
# Inspect the training features after scaling columns 4 onward.
print(X_train)

[[0.0 1.0 0.0 0.0 -1.2392229929341387 -1.3517522853374644]
 [0.0 0.0 1.0 0.0 1.5156219886226097 1.478349654730117]
 [0.0 0.0 0.0 1.0 1.7123966301623774 1.2322538338546751]
 [0.0 1.0 0.0 0.0 0.9252980640033064 0.6170142816660704]
 [0.0 1.0 0.0 0.0 0.1381994978442354 0.4939663712283495]
 [0.0 1.0 0.0 0.0 -0.8456737098546033 -0.367369001835697]
 [0.0 1.0 0.0 0.0 0.1381994978442354 0.6170142816660704]
 [0.0 0.0 1.0 0.0 1.9091712717021452 1.1092059234169542]
 [0.0 1.0 0.0 0.0 -0.25534978523530005 -0.49041691227341794]
 [1.0 0.0 0.0 0.0 -1.042448351394371 -1.5978481062129064]
 [1.0 0.0 0.0 0.0 -1.4359976344739065 -1.3517522853374644]
 [0.0 1.0 0.0 0.0 -1.2392229929341387 -1.3517522853374644]
 [0.0 1.0 0.0 0.0 1.5156219886226097 1.355301744292396]
 [0.0 1.0 0.0 0.0 -0.25534978523530005 0.24787055035290764]
 [0.0 1.0 0.0 0.0 -0.25534978523530005 -0.49041691227341794]
 [0.0 1.0 0.0 0.0 -0.05857514369553233 -0.6134648227111389]
 [0.0 1.0 0.0 0.0 -0.8456737098546033 -1.2287043748997435]
 [0.0 0.0

In [23]:
# Inspect the test features after applying the scaler fitted on the training set.
print(X_test)

[[0.0 1.0 0.0 0.0 -0.6488990683148356 -0.8595606435865807]
 [1.0 0.0 0.0 0.0 -0.8456737098546033 -0.367369001835697]
 [0.0 0.0 1.0 0.0 1.9091712717021452 1.1092059234169542]
 [0.0 1.0 0.0 0.0 -0.05857514369553233 -0.7365127331488598]
 [0.0 0.0 1.0 0.0 -1.042448351394371 -0.8595606435865807]
 [0.0 0.0 0.0 1.0 1.1220727055430741 1.2322538338546751]
 [0.0 1.0 0.0 0.0 -0.05857514369553233 0.37091846079062857]
 [0.0 0.0 0.0 1.0 1.7123966301623774 1.355301744292396]
 [0.0 1.0 0.0 0.0 1.318847347082842 0.9861580129792332]
 [0.0 1.0 0.0 0.0 -0.6488990683148356 -0.6134648227111389]
 [0.0 1.0 0.0 0.0 -1.042448351394371 -0.9826085540243017]
 [0.0 1.0 0.0 0.0 0.33497413938400317 0.24787055035290764]
 [1.0 0.0 0.0 0.0 -0.6488990683148356 -0.367369001835697]
 [0.0 1.0 0.0 0.0 -0.25534978523530005 -0.367369001835697]
 [0.0 1.0 0.0 0.0 0.7285234224635386 0.7400621921037913]
 [0.0 1.0 0.0 0.0 1.7123966301623774 0.011239953357289126]]
