# 02.2 Pre Processing - Fix Missing Inputs

In [59]:
import pandas
import numpy as np
from sklearn.impute import SimpleImputer

In [60]:
# Load the raw dataset (the path is relative to the current working directory)
# and display it so the gaps are visible right away.
df = pandas.read_csv(filepath_or_buffer="02-pre-process-data.csv")
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Raw DataFrame observation

In [61]:
# Summarise each column's dtype and non-null count to spot columns with gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


**Conclusion:**
* some values are missing: the DataFrame has 10 rows, so every column should report 10 non-null values
    * 10 non-null values in the Country column
    * only <span style="color: red;">**9**</span> non-null values in the Age column
    * only <span style="color: red;">**9**</span> non-null values in the Salary column
    * 10 non-null values in the Purchased column
* the Age and Salary columns need to be fixed

## Check for missing values

In [62]:
# Count the missing (NaN) entries per column.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

**Conclusion:**
* 2 missing values
    * one in the Age column
    * one in the Salary column

## More Raw DataFrame observation

In [63]:
# Feature matrix: take all the rows and every column except the last one,
# i.e. the values of the columns 'Country', 'Age' and 'Salary'.
# to_numpy() is the accessor pandas recommends over .values; it always
# returns a NumPy ndarray.
X = df.iloc[:, :-1].to_numpy()
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [64]:
# Target vector: take all the rows but only the last column,
# i.e. the values of the column 'Purchased'.
# to_numpy() is the accessor pandas recommends over .values; it always
# returns a NumPy ndarray.
y = df.iloc[:, -1].to_numpy()
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Up close observation

In [65]:
# Show the two rows of X that contain a NaN before imputation:
# row 4 has no Salary and row 6 has no Age.
# Each call prints a separator, a label and the row, one item per line.

print("-" * 50, "first row (Salary value is Missing):", X[4, :], sep="\n")
print("-" * 50, "second row (Age value is Missing):", X[6, :], sep="\n")

--------------------------------------------------
first row (Salary value is Missing):
['Germany' 40.0 nan]
--------------------------------------------------
second row (Age value is Missing):
['Spain' nan 52000.0]


## Fixing the missing inputs

In [66]:
# Mean imputation: every NaN is replaced by the average of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Work on all the rows but only on columns 1 and 2 of X ('Age' and 'Salary');
# column 0 ('Country') holds text and is left untouched.
# fit_transform learns the column means and fills the gaps in a single call;
# the result is written back into the same slice of X.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# only the 'Age' and 'Salary' columns were modified
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [67]:
# Re-display the two rows of X that previously contained a NaN (rows 4 and 6)
# to confirm that the gaps were filled by the imputer.
# Each call prints a separator, a label and the row, one item per line.

print("-" * 50, "first row (Salary value was Missing):", X[4, :], sep="\n")
print("-" * 50, "second row (Age value was Missing):", X[6, :], sep="\n")

--------------------------------------------------
first row (Salary value was Missing):
['Germany' 40.0 63777.77777777778]
--------------------------------------------------
second row (Age value was Missing):
['Spain' 38.77777777777778 52000.0]
