**Data Cleaning**

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats

In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV loaded in the next cell is read from /content, not from
# the mounted Drive -- confirm whether this mount is actually required.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Read the sample data set into a DataFrame and display it.
df = pd.read_csv("/content/CountryAgeSalary.csv")
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [7]:
# Split the frame into a feature matrix (every column except the last) and a
# target vector (the last column). Both selections use the same "last column"
# convention instead of mixing `:-1` with a hard-coded index 3, so they cannot
# drift apart. `to_numpy()` is the pandas-recommended way to get an ndarray.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()
type(X)

numpy.ndarray

In [8]:
from sklearn.impute import SimpleImputer

# Fill the missing entry in column 1 of X (Age) with that column's most
# frequent value. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, 1:2] = imputer.fit_transform(X[:, 1:2])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' 27.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [9]:
# Apply the same most-frequent imputation to the Age column of the DataFrame.
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# The imputer expects a 2-D input, hence the reshape to a single column;
# [:, 0] flattens the result back to 1-D for assignment.
df['Age'] = imputer.fit_transform(df['Age'].values.reshape(-1, 1))[:, 0]
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [10]:
# Fill the missing entry in column 2 of X (Salary) with the column mean.
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 2:3] = imputer.fit_transform(X[:, 2:3])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 27.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [11]:
# Apply the same mean imputation to the Salary column of the DataFrame.
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# The imputer expects a 2-D input, hence the reshape to a single column;
# [:, 0] flattens the result back to 1-D for assignment.
df['Salary'] = imputer.fit_transform(df['Salary'].values.reshape(-1, 1))[:, 0]
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn an integer code for every distinct value in column 0 of X (Country),
# then overwrite that column with the codes.
labelencoder = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder.transform(X[:, 0])
print(X)

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 27.0 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [14]:
# One-hot encode column 0 and pass every other column through unchanged.
columnTransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [15]:
# Run the column transformer and store the result as an array of strings.
# NOTE(review): dtype=str turns the numeric Age/Salary values into text;
# confirm this is intended, since numeric estimators would need dtype=float.
X = np.array(
    columnTransformer.fit_transform(X),
    dtype=str,
)

In [16]:
# Show the transformed matrix: the one-hot columns come first, followed by the
# passed-through columns.
print(X)

[['1.0' '0.0' '0.0' '44.0' '72000.0']
 ['0.0' '0.0' '1.0' '27.0' '48000.0']
 ['0.0' '1.0' '0.0' '30.0' '54000.0']
 ['0.0' '0.0' '1.0' '38.0' '61000.0']
 ['0.0' '1.0' '0.0' '40.0' '63777.77777777778']
 ['1.0' '0.0' '0.0' '35.0' '58000.0']
 ['0.0' '0.0' '1.0' '27.0' '52000.0']
 ['1.0' '0.0' '0.0' '48.0' '79000.0']
 ['0.0' '1.0' '0.0' '50.0' '83000.0']
 ['1.0' '0.0' '0.0' '37.0' '67000.0']]


Conclusion:   
The `SimpleImputer` class from `sklearn.impute` can be used to impute (replace) missing values in both
numerical and categorical features. For numerical features, the `mean`, `median`, `most_frequent`,
and `constant` strategies can be used. For categorical features, only the `most_frequent` and
`constant` strategies apply. Categorical variables can then be converted
to numerical form using label encoding or one-hot encoding.


One-hot encoding creates a separate binary column for each category; it imposes no order on the categories and simply records presence/absence, which makes it the preferred choice for nominal data. Label encoding assigns a unique integer to each category; it is compact (a single column) and suits ordinal data, provided the integers follow the real order of the categories, but on nominal data the numbers can suggest an ordering or distance between categories that does not exist.

In [17]:
import pandas as pd
import numpy as np
# Sample student records, one row per student; np.nan and None mark the
# missing entries. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
students = [
    [86, 'M', 'very good'],
    [95, 'F', 'excellent'],
    [75, None, 'good'],
    [np.nan, 'M', 'average'],
    [71, 'M', 'good'],
    [np.nan, None, 'very good'],
    [92, 'F', 'very good'],
    [99, 'M', 'excellent'],
]

In [18]:
# Build a DataFrame from the raw student records (default integer column
# labels) and display it.
dfstud = pd.DataFrame(data=students)
dfstud

Unnamed: 0,0,1,2
0,86.0,M,very good
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,71.0,M,good
5,,,very good
6,92.0,F,very good
7,99.0,M,excellent


In [19]:
# Give the three columns descriptive names and display the frame again.
dfstud.columns = [
    'marks',
    'gender',
    'result',
]
dfstud

Unnamed: 0,marks,gender,result
0,86.0,M,very good
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,71.0,M,good
5,,,very good
6,92.0,F,very good
7,99.0,M,excellent


In [20]:
# Total number of missing cells across the whole frame.
dfstud.isna().values.sum()

4

In [21]:
# Features: the first two columns (marks, gender). Target: the third column
# (result). Note that this rebinds the X and y used earlier in the notebook.
X = dfstud.iloc[:, :2].values
y = dfstud.iloc[:, 2].values
type(X)

numpy.ndarray

In [22]:
from sklearn.impute import SimpleImputer
# Disabled alternative: mean imputation of column 0 of X. The code is wrapped
# in a bare string literal, so running this cell executes nothing and only
# echoes the text as the cell output.
'''imputer = SimpleImputer(missing_values=np.NaN,strategy='mean')
X[:,0:1]=imputer.fit_transform(X[:,0:1])
print(X)'''

"imputer = SimpleImputer(missing_values=np.NaN,strategy='mean')\nX[:,0:1]=imputer.fit_transform(X[:,0:1])\nprint(X)"

In [23]:
# Disabled alternative: mean imputation of the marks column of dfstud. The code
# is wrapped in a bare string literal, so running this cell executes nothing
# and only echoes the text as the cell output.
'''imputer = SimpleImputer(missing_values=np.NaN,strategy='mean')
dfstud.marks = imputer.fit_transform(dfstud['marks'].values.reshape(-1,1))[:,0]
dfstud'''

"imputer = SimpleImputer(missing_values=np.NaN,strategy='mean')\ndfstud.marks = imputer.fit_transform(dfstud['marks'].values.reshape(-1,1))[:,0]\ndfstud"

In [24]:
# Disabled alternative: most-frequent imputation of column 1 of X, treating
# None as the missing marker. The code is wrapped in a bare string literal, so
# running this cell executes nothing and only echoes the text as the output.
'''imputer = SimpleImputer(missing_values=None,strategy='most_frequent')
X[:,1:2]=imputer.fit_transform(X[:,1:2])
print(X)'''

"imputer = SimpleImputer(missing_values=None,strategy='most_frequent')\nX[:,1:2]=imputer.fit_transform(X[:,1:2])\nprint(X)"

In [25]:
# Disabled alternative: most-frequent imputation of the gender column of
# dfstud, treating None as the missing marker. The code is wrapped in a bare
# string literal, so running this cell executes nothing and only echoes the
# text as the cell output.
'''imputer = SimpleImputer(missing_values=None,strategy='most_frequent')
dfstud.gender = imputer.fit_transform(dfstud['gender'].values.reshape(-1,1))[:,0]
dfstud'''

"imputer = SimpleImputer(missing_values=None,strategy='most_frequent')\ndfstud.gender = imputer.fit_transform(dfstud['gender'].values.reshape(-1,1))[:,0]\ndfstud"

In [26]:
# Replace the missing marks in column 0 of X with the constant 80.
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=80)
X[:, 0:1] = imputer.fit_transform(X[:, 0:1])
print(X)

[[86.0 'M']
 [95.0 'F']
 [75.0 None]
 [80 'M']
 [71.0 'M']
 [80 None]
 [92.0 'F']
 [99.0 'M']]


In [27]:
# Replace the missing marks in the DataFrame with the constant 80.
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=80)
# The imputer expects a 2-D input, hence the reshape to a single column;
# [:, 0] flattens the result back to 1-D for assignment.
dfstud['marks'] = imputer.fit_transform(dfstud['marks'].values.reshape(-1, 1))[:, 0]
dfstud

Unnamed: 0,marks,gender,result
0,86.0,M,very good
1,95.0,F,excellent
2,75.0,,good
3,80.0,M,average
4,71.0,M,good
5,80.0,,very good
6,92.0,F,very good
7,99.0,M,excellent


In [28]:
# Replace missing gender entries (stored as None) with the constant 'F'.
imputer = SimpleImputer(
    missing_values=None,
    strategy='constant',
    fill_value='F',
)
# Feed the column to the imputer as an (n, 1) array, then take the single
# output column back as a 1-D array for assignment.
dfstud['gender'] = imputer.fit_transform(dfstud['gender'].values.reshape(-1, 1))[:, 0]
dfstud

Unnamed: 0,marks,gender,result
0,86.0,M,very good
1,95.0,F,excellent
2,75.0,F,good
3,80.0,M,average
4,71.0,M,good
5,80.0,F,very good
6,92.0,F,very good
7,99.0,M,excellent


Conclusion:   
The `SimpleImputer` class from `sklearn.impute` can be used to impute (replace) missing values in both
numerical and categorical features. For numerical features, the `mean`, `median`, `most_frequent`,
and `constant` strategies can be used. For categorical features, only the `most_frequent` and
`constant` strategies apply. Categorical variables can then be converted
to numerical form using label encoding or one-hot encoding.


One-hot encoding creates a separate binary column for each category; it imposes no order on the categories and simply records presence/absence, which makes it the preferred choice for nominal data. Label encoding assigns a unique integer to each category; it is compact (a single column) and suits ordinal data, provided the integers follow the real order of the categories, but on nominal data the numbers can suggest an ordering or distance between categories that does not exist.


