# Data Preprocessing Tools

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [48]:
path = r'/content/Data.csv'
data_set = pd.read_csv(path)
X=data_set.iloc[:,:-1]   #X=data_set.iloc[:,:-1] #type data frame if .values added it becomes nd.array
Y=data_set.iloc[:,-1]    #Y=data_set.iloc[:,-1] 

In [49]:
print(type(X))

<class 'pandas.core.frame.DataFrame'>


In [50]:
print(Y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


## Taking care of missing data

In [51]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy = 'mean')   
imputer.fit(X.iloc[:,1:])                                          #finds mean and missing values
X.iloc[:,1:] = imputer.transform(X.iloc[:,1:])                     #replace the missing values with the average of the salary
print(data_set)

#a=data_set.fillna(data_set.median(), inplace=True)                #to replce missing values with median
#data_set.isnull().sum()                                           #no of missing values
#num_missing = (dataset[[1,2,3,4,5]] == 0).sum()                   # 1,2,3,4,5 are column names
#dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)       #replace '0' values with 'nan' 
#dataset.dropna(inplace=True)                                      #drop rows with missing values 
#dataset.fillna(dataset.mean(), inplace=True)                      #filling missing values
#a=data_set.dropna(inplace=False).copy()                           #this is to drop missing rows
#a=data_set['Age'].fillna(data_set['Age'].median()).copy()         #this is to fill the missing values


   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0  61000.0       Yes
5   France  35.0  58000.0       Yes
6    Spain  38.0  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [43]:
print(a)

None


## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# transformers = [(encoding,which type, to which column)] | remaider is used for whether to keeo pr drop the other columns which are not transformed.
ct = ColumnTransformer(transformers=[('encoders',OneHotEncoder(),[0])],remainder='passthrough') 
X=np.array(ct.fit_transform(X))  #Fit transform won't repeat the output as numpy array
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
print(X)

[[ 1.000  0.000  0.000  44.000  72000.000]
 [ 0.000  0.000  1.000  27.000  48000.000]
 [ 0.000  1.000  0.000  30.000  54000.000]
 [ 0.000  0.000  1.000  38.000  61000.000]
 [ 0.000  1.000  0.000  40.000  63777.778]
 [ 1.000  0.000  0.000  35.000  58000.000]
 [ 0.000  0.000  1.000  38.778  52000.000]
 [ 1.000  0.000  0.000  48.000  79000.000]
 [ 0.000  1.000  0.000  50.000  83000.000]
 [ 1.000  0.000  0.000  37.000  67000.000]]


### Encoding the Dependent Variable

In [None]:
#Label encoders create 0 and 1 using yes and no's easily
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() #no need of extra parameters as we are using only 1 column
y = le.fit_transform(Y) #dependent variable need not be in array form

In [None]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=0.2,random_state=1)

In [None]:
print(xtrain)

[[ 0.000  0.000  1.000  38.778  52000.000]
 [ 0.000  1.000  0.000  40.000  63777.778]
 [ 1.000  0.000  0.000  44.000  72000.000]
 [ 0.000  0.000  1.000  38.000  61000.000]
 [ 0.000  0.000  1.000  27.000  48000.000]
 [ 1.000  0.000  0.000  48.000  79000.000]
 [ 0.000  1.000  0.000  50.000  83000.000]
 [ 1.000  0.000  0.000  35.000  58000.000]]


In [None]:
print(xtest)

[[ 0.000  1.000  0.000  30.000  54000.000]
 [ 1.000  0.000  0.000  37.000  67000.000]]


## Feature Scaling

In [None]:
#should not feature scale before splitting as we are supposed to have new xtest values which we dont know so we cannot apply to whole data. they should be done differently
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
xtrain[:,3:] = sc.fit_transform(xtrain[:,3:]) #uses mean and standard deviation
xtest[:,3:] = sc.transform(xtest[:,3:])       #here we are using transform not fit_transform because for thenew x values we wont have the mean and standard deviation. we will use the mean and standard deviation from the xtrain intsel

In [None]:
print(xtrain)

[[ 0.000  0.000  1.000 -0.192 -1.078]
 [ 0.000  1.000  0.000 -0.014 -0.070]
 [ 1.000  0.000  0.000  0.567  0.634]
 [ 0.000  0.000  1.000 -0.305 -0.308]
 [ 0.000  0.000  1.000 -1.902 -1.420]
 [ 1.000  0.000  0.000  1.148  1.233]
 [ 0.000  1.000  0.000  1.438  1.575]
 [ 1.000  0.000  0.000 -0.740 -0.565]]


In [None]:
print(xtest)

[[ 0.000  1.000  0.000 -1.000 -1.000]
 [ 1.000  0.000  0.000  1.000  1.000]]
