## Data Normalization Techniques

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('loan.csv')
df=df[['disbursed_amount','interest']]
df.head()

Unnamed: 0,disbursed_amount,interest
0,23201.5,15.484
1,7425.0,11.2032
2,11150.0,8.51
3,7600.0,5.8656
4,31960.0,18.7392


### Normalizing data using Sklearn

In [3]:
from sklearn import preprocessing
d = preprocessing.normalize(df)
names=['disbursed_amount','interest']
scaled_df = pd.DataFrame(d, columns=names)
scaled_df.head()

Unnamed: 0,disbursed_amount,interest
0,1.0,0.000667
1,0.999999,0.001509
2,1.0,0.000763
3,1.0,0.000772
4,1.0,0.000586


### Min Max Normalization

In [4]:
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(df)
scaled_df = pd.DataFrame(d, columns=names)
scaled_df.head()

Unnamed: 0,disbursed_amount,interest
0,0.653901,0.461951
1,0.191112,0.276143
2,0.300381,0.159244
3,0.196245,0.044464
4,0.910824,0.603243


### Z-Score Normalization

In [5]:
import scipy.stats as stats

d=df.apply(stats.zscore)
d.head()

Unnamed: 0,disbursed_amount,interest
0,1.125506,0.665558
1,-0.832384,-0.350106
2,-0.370105,-0.989096
3,-0.810666,-1.616507
4,2.212451,1.437888


## To see if Classification Improves with Normalization

### Without Normalization

In [6]:
data=pd.read_csv('TrainT.csv')
data=data[['Survived','Pclass','Sex','Age','Fare']]

data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [7]:
data['Sex']=data['Sex'].apply(lambda sex:1 if sex=="male" else 0)
data['Age']=data['Age'].fillna(data['Age'].median())

In [8]:
X=data.drop('Survived', axis=1)
Y=data['Survived']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.3, random_state=25)

In [10]:
from sklearn.linear_model import LogisticRegression
logit=LogisticRegression()
logit.fit(X_train,Y_train)


LogisticRegression()

In [11]:
Y_pred=logit.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
cm=confusion_matrix(Y_test,Y_pred)
report=classification_report(Y_test,Y_pred)
print(report)
print("Without Data Normalization")
print()
print("Accuracy score",accuracy_score(Y_test,Y_pred))
print()
print("Confusion Matrix")
print(cm)
print()
print("Classification Report")
print(report)


              precision    recall  f1-score   support

           0       0.81      0.82      0.82       165
           1       0.71      0.70      0.71       103

    accuracy                           0.78       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268

Without Data Normalization

Accuracy score 0.7761194029850746

Confusion Matrix
[[136  29]
 [ 31  72]]

Classification Report
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       165
           1       0.71      0.70      0.71       103

    accuracy                           0.78       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268



### With Normalization

In [13]:
scaler = preprocessing.MinMaxScaler(feature_range=(0, 2))
names = data.columns
d = scaler.fit_transform(data)
scaled_df = pd.DataFrame(d, columns=names)
scaled_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0.0,2.0,2.0,0.542347,0.028302
1,2.0,0.0,0.0,0.944458,0.278271
2,2.0,2.0,0.0,0.642875,0.030937
3,2.0,0.0,0.0,0.869063,0.207289
4,0.0,2.0,2.0,0.869063,0.031425


In [14]:
X1=scaled_df.drop('Survived', axis=1)
Y1=scaled_df['Survived']

In [15]:
#split the dataset into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(X1,Y1, test_size=0.3, random_state=25)

In [16]:
#The Logistic Regression
from sklearn.linear_model import LogisticRegression
logit=LogisticRegression()
logit.fit(X_train,Y_train)


LogisticRegression()

In [17]:
Y_pred1=logit.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
cm1=confusion_matrix(Y_test,Y_pred1)
report1=classification_report(Y_test,Y_pred1)
print(report)
print("Without Data Normalization")
print()
print("Accuracy score",accuracy_score(Y_test,Y_pred1))
print()
print("Confusion Matrix")
print(cm1)
print()
print("Classification Report")
print(report1)


              precision    recall  f1-score   support

           0       0.81      0.82      0.82       165
           1       0.71      0.70      0.71       103

    accuracy                           0.78       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268

Without Data Normalization

Accuracy score 0.7798507462686567

Confusion Matrix
[[139  26]
 [ 33  70]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.81      0.84      0.82       165
         2.0       0.73      0.68      0.70       103

    accuracy                           0.78       268
   macro avg       0.77      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268

