In [16]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
#Loading the data
data = {
    'age': [25, np.nan, 35, 45, 55, 65, 75, 85, 95, 105],
    'salary': [50000, 60000, np.nan, 80000, 90000, 100000, 110000, 120000, np.nan, 140000],
    'gender': ['Male', 'Female', np.nan, 'Female', 'Male', 'Female', 'Male', 'Male', 'Female', 'Male'],
    'purchased': ['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes']
}

In [18]:
df = pd.DataFrame(data)
df

Unnamed: 0,age,salary,gender,purchased
0,25.0,50000.0,Male,No
1,,60000.0,Female,Yes
2,35.0,,,No
3,45.0,80000.0,Female,No
4,55.0,90000.0,Male,Yes
5,65.0,100000.0,Female,Yes
6,75.0,110000.0,Male,No
7,85.0,120000.0,Male,Yes
8,95.0,,Female,No
9,105.0,140000.0,Male,Yes


In [19]:
# Handling missing values
df.isnull().sum()

age          1
salary       2
gender       1
purchased    0
dtype: int64

In [20]:
# Filling the numerical values with mean value
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])
df['salary'] = imputer.fit_transform(df[['salary']])

In [21]:
df

Unnamed: 0,age,salary,gender,purchased
0,25.0,50000.0,Male,No
1,65.0,60000.0,Female,Yes
2,35.0,93750.0,,No
3,45.0,80000.0,Female,No
4,55.0,90000.0,Male,Yes
5,65.0,100000.0,Female,Yes
6,75.0,110000.0,Male,No
7,85.0,120000.0,Male,Yes
8,95.0,93750.0,Female,No
9,105.0,140000.0,Male,Yes


In [22]:
#Filling the categorical values with most frequent values
imputer = SimpleImputer(strategy='most_frequent')
df['gender']= imputer.fit_transform(df[['gender']])
df

Unnamed: 0,age,salary,gender,purchased
0,25.0,50000.0,Male,No
1,65.0,60000.0,Female,Yes
2,35.0,93750.0,Male,No
3,45.0,80000.0,Female,No
4,55.0,90000.0,Male,Yes
5,65.0,100000.0,Female,Yes
6,75.0,110000.0,Male,No
7,85.0,120000.0,Male,Yes
8,95.0,93750.0,Female,No
9,105.0,140000.0,Male,Yes


In [23]:
#Encoding the Categorical data
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
df['gender'] = label.fit_transform(df['gender'])
df['purchased'] = label.fit_transform(df['purchased'])

In [24]:
df

Unnamed: 0,age,salary,gender,purchased
0,25.0,50000.0,1,0
1,65.0,60000.0,0,1
2,35.0,93750.0,1,0
3,45.0,80000.0,0,0
4,55.0,90000.0,1,1
5,65.0,100000.0,0,1
6,75.0,110000.0,1,0
7,85.0,120000.0,1,1
8,95.0,93750.0,0,0
9,105.0,140000.0,1,1


In [25]:
#One Hot encoding
from sklearn.preprocessing import OneHotEncoder
df = pd.get_dummies(df, columns=['gender'],drop_first=True)


In [26]:
df

Unnamed: 0,age,salary,purchased,gender_1
0,25.0,50000.0,0,1
1,65.0,60000.0,1,0
2,35.0,93750.0,0,1
3,45.0,80000.0,0,0
4,55.0,90000.0,1,1
5,65.0,100000.0,1,0
6,75.0,110000.0,0,1
7,85.0,120000.0,1,1
8,95.0,93750.0,0,0
9,105.0,140000.0,1,1


In [27]:
# Feature Scaling
#Method 1 : Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['age', 'salary']] = scaler.fit_transform(df[['age', 'salary']])
df

Unnamed: 0,age,salary,purchased,gender_1
0,-1.632993,-1.731062,0,1
1,0.0,-1.335391,1,0
2,-1.224745,0.0,0,1
3,-0.816497,-0.544048,0,0
4,-0.408248,-0.148377,1,1
5,0.0,0.247295,1,0
6,0.408248,0.642966,0,1
7,0.816497,1.038637,1,1
8,1.224745,0.0,0,0
9,1.632993,1.82998,1,1


In [28]:
#Method 2 : Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['age', 'salary']] = scaler.fit_transform(df[['age', 'salary']])
df

Unnamed: 0,age,salary,purchased,gender_1
0,0.0,0.0,0,1
1,0.5,0.111111,1,0
2,0.125,0.486111,0,1
3,0.25,0.333333,0,0
4,0.375,0.444444,1,1
5,0.5,0.555556,1,0
6,0.625,0.666667,0,1
7,0.75,0.777778,1,1
8,0.875,0.486111,0,0
9,1.0,1.0,1,1


In [29]:
#Splitting the data
X = df.drop('purchased',axis=1)
y = df['purchased']

In [30]:
X

Unnamed: 0,age,salary,gender_1
0,0.0,0.0,1
1,0.5,0.111111,0
2,0.125,0.486111,1
3,0.25,0.333333,0
4,0.375,0.444444,1
5,0.5,0.555556,0
6,0.625,0.666667,1
7,0.75,0.777778,1
8,0.875,0.486111,0
9,1.0,1.0,1


In [31]:
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: purchased, dtype: int32

In [32]:
#Split the data in train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=1/4,random_state=1)

In [33]:
X_train

Unnamed: 0,age,salary,gender_1
4,0.375,0.444444,1
0,0.0,0.0,1
3,0.25,0.333333,0
1,0.5,0.111111,0
7,0.75,0.777778,1
8,0.875,0.486111,0
5,0.5,0.555556,0


In [34]:
X_test

Unnamed: 0,age,salary,gender_1
2,0.125,0.486111,1
9,1.0,1.0,1
6,0.625,0.666667,1


In [35]:
y_train

4    1
0    0
3    0
1    1
7    1
8    0
5    1
Name: purchased, dtype: int32

In [36]:
y_test

2    0
9    1
6    0
Name: purchased, dtype: int32

# Ex 2 : Data.csv

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [45]:
dataset = pd.read_csv('Data.csv')

In [57]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [58]:
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [59]:
# Handling missing data with mean values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  
 
imputer = imputer.fit(x[:, 1:3])  

x[:, 1:3] = imputer.transform(x[:, 1:3]) 

In [60]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [61]:
# Encoding categorical data for Country Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

label_encoder_x = LabelEncoder()  
x[:, 0] = label_encoder_x.fit_transform(x[:, 0])  

In [62]:
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [63]:
# Creating a ColumnTransformer to apply OneHotEncoder
column_transformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = column_transformer.fit_transform(x)

In [64]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [53]:
# Encoding for purchased variable
labelencoder_y = LabelEncoder()  
y = labelencoder_y.fit_transform(y)  

In [54]:
# Splitting the dataset into training and test set
from sklearn.model_selection import train_test_split  
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0) 

In [55]:
# Feature Scaling of datasets
from sklearn.preprocessing import StandardScaler  
st_x = StandardScaler()  
x_train = st_x.fit_transform(x_train)  
x_test = st_x.transform(x_test) 

In [56]:
# Displaying the final processed data
print("X_train:\n", x_train)
print("X_test:\n", x_test)
print("y_train:\n", y_train)
print("y_test:\n", y_test)


X_train:
 [[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
X_test:
 [[-1.          2.64575131 -0.77459667 -1.45882927 -0.90166297]
 [-1.          2.64575131 -0.77459667  1.98496442  2.13981082]]
y_train:
 [1 1 1 0 1 0 0 1]
y_test:
 [0 0]


# Practice Application 1

In [67]:
df = pd.read_csv('D:/Test/Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [68]:
#Independent variable
x = df.iloc[:, :-1].values

#Dependent variable 
y = df.iloc[:,-1].values

In [69]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [70]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [71]:
#Handling missing values

In [72]:
#Numerical data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:,1:3]= imputer.fit_transform(x[:,1:3])

In [73]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [74]:
#Categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label = LabelEncoder()
x[:,0] = label.fit_transform(x[:,0])

In [75]:
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [77]:
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([
    ('encoder',OneHotEncoder(),[0])   
],remainder='passthrough')

x = ct.fit_transform(x)

In [78]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [79]:
label = LabelEncoder()
y = label.fit_transform(y)

In [80]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [81]:
#Data spliting
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=0)

In [86]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X_train = st.fit_transform(X_train)
X_test = st.fit_transform(X_test)

In [87]:
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [89]:
X_test

array([[ 0.,  0.,  0., -1., -1.],
       [ 0.,  0.,  0.,  1.,  1.]])

In [90]:
y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [91]:
y_test

array([0, 0])

# Practice Application 2

In [106]:
data = {
'client_id': [46109, 46109, 46109],
'loan_type': ['home', 'credit', 'home'],
'loan_amount': [13672, 9794, 12734],
'repaid': [0, 0, 1],
'loan_id': [10243, 10984, 10990],
'loan_start': ['2002-04-16', '2003-10-21', '2006-02-01'],
'loan_end': ['2003-12-20', '2005-07-17', '2007-07-05'],
'rate': [2.15, 1.25, 0.68]
}

In [107]:
df = pd.DataFrame(data)
df

Unnamed: 0,client_id,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
0,46109,home,13672,0,10243,2002-04-16,2003-12-20,2.15
1,46109,credit,9794,0,10984,2003-10-21,2005-07-17,1.25
2,46109,home,12734,1,10990,2006-02-01,2007-07-05,0.68


In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   client_id    3 non-null      int64  
 1   loan_type    3 non-null      object 
 2   loan_amount  3 non-null      int64  
 3   repaid       3 non-null      int64  
 4   loan_id      3 non-null      int64  
 5   loan_start   3 non-null      object 
 6   loan_end     3 non-null      object 
 7   rate         3 non-null      float64
dtypes: float64(1), int64(4), object(3)
memory usage: 320.0+ bytes


In [109]:
#Handling Missing values
from sklearn.impute import SimpleImputer


In [110]:
#Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    transformers=[
        ('loan_type',OneHotEncoder(),['loan_type'])
    ], remainder='passthrough')
df = ct.fit_transform(df)

In [111]:
df

array([[0.0, 1.0, 46109, 13672, 0, 10243, '2002-04-16', '2003-12-20',
        2.15],
       [1.0, 0.0, 46109, 9794, 0, 10984, '2003-10-21', '2005-07-17',
        1.25],
       [0.0, 1.0, 46109, 12734, 1, 10990, '2006-02-01', '2007-07-05',
        0.68]], dtype=object)

In [112]:
ecolumns = ct.get_feature_names_out()
ecolumns

array(['loan_type__loan_type_credit', 'loan_type__loan_type_home',
       'remainder__client_id', 'remainder__loan_amount',
       'remainder__repaid', 'remainder__loan_id', 'remainder__loan_start',
       'remainder__loan_end', 'remainder__rate'], dtype=object)

In [113]:
# Convert into DataFrame
#ecolumns = ct.get_feature_names_out()
df1 = pd.DataFrame(df,columns=ecolumns)

In [114]:
df1

Unnamed: 0,loan_type__loan_type_credit,loan_type__loan_type_home,remainder__client_id,remainder__loan_amount,remainder__repaid,remainder__loan_id,remainder__loan_start,remainder__loan_end,remainder__rate
0,0.0,1.0,46109,13672,0,10243,2002-04-16,2003-12-20,2.15
1,1.0,0.0,46109,9794,0,10984,2003-10-21,2005-07-17,1.25
2,0.0,1.0,46109,12734,1,10990,2006-02-01,2007-07-05,0.68


In [119]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   loan_type__loan_type_credit  3 non-null      object
 1   loan_type__loan_type_home    3 non-null      object
 2   remainder__client_id         3 non-null      object
 3   remainder__loan_amount       3 non-null      object
 4   remainder__repaid            3 non-null      object
 5   remainder__loan_id           3 non-null      object
 6   remainder__loan_start        3 non-null      object
 7   remainder__loan_end          3 non-null      object
 8   remainder__rate              3 non-null      object
dtypes: object(9)
memory usage: 344.0+ bytes


In [121]:
df1['remainder__loan_start'] = pd.to_datetime(df1['remainder__loan_start'])
df1['remainder__loan_end'] = pd.to_datetime(df1['remainder__loan_end'])

In [122]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   loan_type__loan_type_credit  3 non-null      object        
 1   loan_type__loan_type_home    3 non-null      object        
 2   remainder__client_id         3 non-null      object        
 3   remainder__loan_amount       3 non-null      object        
 4   remainder__repaid            3 non-null      object        
 5   remainder__loan_id           3 non-null      object        
 6   remainder__loan_start        3 non-null      datetime64[ns]
 7   remainder__loan_end          3 non-null      datetime64[ns]
 8   remainder__rate              3 non-null      object        
dtypes: datetime64[ns](2), object(7)
memory usage: 344.0+ bytes


In [125]:
df1['loan_duration'] = (df1['remainder__loan_end'] - df1['remainder__loan_start']).dt.days

In [126]:
df1['loan_duration']

0    613
1    635
2    519
Name: loan_duration, dtype: int64

In [129]:
df1

Unnamed: 0,loan_type__loan_type_credit,loan_type__loan_type_home,remainder__client_id,remainder__loan_amount,remainder__repaid,remainder__loan_id,remainder__loan_start,remainder__loan_end,remainder__rate,loan_duration
0,0.0,1.0,46109,13672,0,10243,2002-04-16,2003-12-20,2.15,613
1,1.0,0.0,46109,9794,0,10984,2003-10-21,2005-07-17,1.25,635
2,0.0,1.0,46109,12734,1,10990,2006-02-01,2007-07-05,0.68,519


In [130]:
df1 = df1.drop(columns= df1[['remainder__loan_start','remainder__loan_end']])

In [135]:
df1

array([[-0.70710678,  0.70710678,  0.        ,  0.97174366, -0.70710678,
        -1.41417907,  1.30547308,  0.47708281],
       [ 1.41421356, -1.41421356,  0.        , -1.37569524, -0.70710678,
         0.69853603, -0.18177473,  0.91440872],
       [-0.70710678,  0.70710678,  0.        ,  0.40395158,  1.41421356,
         0.71564304, -1.12369835, -1.39149153]])

In [132]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
df1 = st.fit_transform(df1)

In [133]:
df1

array([[-0.70710678,  0.70710678,  0.        ,  0.97174366, -0.70710678,
        -1.41417907,  1.30547308,  0.47708281],
       [ 1.41421356, -1.41421356,  0.        , -1.37569524, -0.70710678,
         0.69853603, -0.18177473,  0.91440872],
       [-0.70710678,  0.70710678,  0.        ,  0.40395158,  1.41421356,
         0.71564304, -1.12369835, -1.39149153]])