# **This experiment applies data pre-processing techniques.**

# **Importing the Libraries**


In [2]:
import numpy as np #importing the numpy
import pandas as pd #importing the pandas
import matplotlib.pyplot as plt #importing the matplotlib


# **Importing the DataSet**

In [3]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be read.
# The google.colab module is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the raw CSV from the mounted Drive into a DataFrame.
# The path is specific to this Colab/Drive layout and must exist under MyDrive.
data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Preprocessing.csv')

In [5]:
# Display the loaded DataFrame to inspect its columns and spot missing values.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [6]:
# Split the frame into features and target by position.
# iloc[rows, cols]: the first slice selects rows, the second selects columns.
# X = every column except the last (Country, Age, Salary).
# y = the last column (Purchased).
X = data.iloc[:, :-1].values
# Index the target as "the last column" rather than a hard-coded 3, so it always
# matches the column that ':-1' leaves out of X.
y = data.iloc[:, -1].values

In [7]:
# Feature matrix X: one row per sample, with the Country, Age and Salary columns.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [8]:
# Target vector y: the Purchased value ('Yes'/'No') for each row.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# **Taking Care of the missing Data**

In [9]:
# Flag each row that exactly repeats an earlier row (True = duplicate).
data.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,False


In [10]:
# Remove exact duplicate rows, then rebuild X and y from the de-duplicated frame.
# X and y were sliced out of `data` before this step, so without the re-slice any
# dropped row would still be present in the arrays used by the later cells.
data = data.drop_duplicates()
X = data.iloc[:, :-1].values  # all columns except the last (features)
y = data.iloc[:, -1].values   # last column (target)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [11]:
# Count the missing (NaN) values in each column.
data.isnull().sum()

Unnamed: 0,0
Country,0
Age,1
Salary,1
Purchased,0


In [12]:
from sklearn.impute import SimpleImputer

# SimpleImputer fills missing entries with a statistic computed per column.
# With strategy='mean', every NaN in columns 1-2 of X (Age and Salary) is
# replaced by the mean of the non-missing values in that column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and write the filled values back in one step.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [13]:
# X after imputation: the former NaN entries in Age and Salary now hold the column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# **Encoding the Categorical Data**

Encoding categorical data means transforming non-numeric (categorical) values into a numerical format that machine learning models can process.

# Encoding on the Independent Variables

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ColumnTransformer applies a transformer to selected columns only.
# Column 0 (Country) is nominal data: categories with no inherent order, so it is
# one-hot encoded. 'encoder' is just the name given to this transformer step.
# remainder='passthrough' keeps the other columns (Age, Salary) unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))


In [15]:
# X after one-hot encoding: three Country indicator columns first, then Age and Salary.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


# Encoding on the Dependent Variables

In [19]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct class label of the target to an integer code.
# Here 'No' becomes 0 and 'Yes' becomes 1. The codes are identifiers only and
# do not imply any ranking between the classes.
Le=LabelEncoder()
y=Le.fit_transform(y)

In [21]:
# y after label encoding: 'No' -> 0, 'Yes' -> 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


# **Splitting the DataSet into Training and Testing**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# The split is random, so results would vary between runs. Fixing random_state
# makes the split identical on every run, which helps debugging and collaboration.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [24]:
# Training features: 8 of the 10 rows.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [25]:
# Test features: the 2 held-out rows.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [26]:
# Training labels, aligned row-for-row with X_train.
print(y_train)

[0 1 0 0 1 1 0 1]


In [27]:
# Test labels, aligned row-for-row with X_test.
print(y_test)

[0 1]


# **Feature Scaling**

Feature scaling is a preprocessing technique used in machine learning to standardize or normalize the range of independent variables (features) in the dataset. It ensures that all features contribute equally to the model's learning process, preventing features with larger ranges from dominating those with smaller ranges.

In [29]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Learn the scaling parameters from the training rows only. Columns 3 onward are
# Age and Salary; the one-hot Country columns (0-2) are left untouched.
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
# The test set is scaled with the parameters learned from X_train, never refitted,
# so no information from the test rows influences the scaling.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [30]:
# Training features after scaling: Age and Salary are standardized, one-hot columns unchanged.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [31]:
# Test features scaled with the parameters fitted on the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
