<style>
    /* Styles for the title card of this practical: bordered container,
       circular logo, underlined heading and full-width task description. */

    /* NOTE(review): `*` and `h1` are not scoped to #container, so they can
       match elements outside the title card as well. White text presumably
       assumes a dark notebook theme — confirm this is intended. */
    * {
        box-sizing: border-box;
        color: white;
    }

    /* Outer card: vertical flex column with its children centred horizontally. */
    #container {
        padding: 1em;
        border: 5px solid white;
        display: flex;
        flex-direction: column;
        align-items: center;
    }

    /* Fixed 160x160 logo; border-radius 50% crops it to a circle. */
    #logo {
        width: 160px;
        height: 160px;
        border-radius: 50%;
        border: 2px solid white;
    }

    /* Heading with a rule underneath. */
    h1 {
        border-bottom: 3px solid white;
        font-weight: bold;
    }

    /* Task description spans the full card width instead of being shrunk
       by the centred flex layout. */
    #inner-content {
        width: 100%;
    }
</style>
<div id="container">
    <img src="https://imageio.forbes.com/specials-images/dam/imageserve/1129869424/960x0.jpg?format=jpg&width=960" id="logo" alt="ML Logo" />
    <h1>ML - Practical 2</h1>
    <div id="inner-content">
        Implement the data preprocessing techniques on the same selected data of your chosen domain and provide a detailed inference for each preprocessing task, such as:
        <ol>
            <li>Identify unique values</li>
            <li>Finding missing values</li>
            <li>Replacing missing values and null values</li>
            <li>Normalising data using standard scaler or minmax scaler</li>
            <li>Apply KNN to the preprocessed dataset using 5 different K values and 3 different distance measures</li>
            <li>Display the confusion matrix, accuracy measures, classification report</li>
            <li>Provide appropriate inference for every task performed</li>
        </ol>
    </div>
</div>

In [1]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd
import numpy as np
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [2]:
# Load the All-India index CSV (path is relative to the notebook's working directory).
index_csv_path = "./dataset/All_India_Index_july2019_20Aug2020.csv"
df = pd.read_csv(index_csv_path)

In [3]:
# 1. Identifying unique values
# Show the distinct values present in the `Sector` column.
df["Sector"].unique()

array(['Rural', 'Urban', 'Rural+Urban'], dtype=object)

In [4]:
# 2. Finding missing values
# Element-wise boolean mask with the same shape as `df`:
# True where a cell is missing, False otherwise.
df.isna()

Unnamed: 0,Sno,Sector,Year,Month,Cereals and products,Meat and fish,Egg,Milk and products,Oils and fats,Fruits,...,Housing,Fuel and light,Household goods and services,Health,Transport and communication,Recreation and amusement,Education,Personal care and effects,Miscellaneous,General index
0,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,False,False,False,False,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
263,False,False,False,False,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
264,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
265,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# 3. Replacing missing and null values
# `DataFrame.replace` returns a new frame and leaves `df` untouched, so the
# filled frame is bound to its own name instead of being discarded after it
# is displayed. `df` itself is deliberately not overwritten here.
# NOTE(review): 0 is presumably not a plausible value for these index
# columns; confirm whether a different imputation (e.g. interpolation or a
# column statistic) is more appropriate.
df_filled = df.replace(np.nan, 0)
df_filled

Unnamed: 0,Sno,Sector,Year,Month,Cereals and products,Meat and fish,Egg,Milk and products,Oils and fats,Fruits,...,Housing,Fuel and light,Household goods and services,Health,Transport and communication,Recreation and amusement,Education,Personal care and effects,Miscellaneous,General index
0,1,Rural,2013,January,107.5,106.3,108.1,104.9,106.1,103.9,...,0.0,105.5,104.8,104.0,103.3,103.4,103.8,104.7,104.0,105.1
1,2,Urban,2013,January,110.5,109.1,113.0,103.6,103.4,102.3,...,100.3,105.4,104.8,104.1,103.2,102.9,103.5,104.3,103.7,104.0
2,3,Rural+Urban,2013,January,108.4,107.3,110.0,104.4,105.1,103.2,...,100.3,105.5,104.8,104.0,103.2,103.1,103.6,104.5,103.9,104.6
3,4,Rural,2013,February,109.2,108.7,110.2,105.4,106.7,104.0,...,0.0,106.2,105.2,104.4,103.9,104.0,104.1,104.6,104.4,105.8
4,5,Urban,2013,February,112.9,112.9,116.9,104.0,103.5,103.1,...,100.4,105.7,105.2,104.7,104.4,103.3,103.7,104.3,104.3,104.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,263,Urban,2020,May,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
263,264,Rural+Urban,2020,May,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
264,265,Rural,2020,June,148.2,190.3,149.4,153.3,138.2,143.2,...,0.0,144.9,151.7,158.2,141.4,153.2,161.8,151.2,151.7,152.7
265,266,Urban,2020,June,152.7,197.0,154.6,153.4,132.9,151.8,...,154.7,137.1,140.4,148.1,129.3,144.5,152.5,152.2,142.0,150.8


In [6]:
# 4. Normalizing the data using standard scaler or minmax scaler
# Rescale the single "Meat and fish" column with a min-max scaler. The
# double brackets keep the selection two-dimensional, as the scaler expects.
scaler = MinMaxScaler()
meat_and_fish = df[["Meat and fish"]]
scaler.fit(meat_and_fish).transform(meat_and_fish)

array([[0.        ],
       [0.030871  ],
       [0.01102536],
       [0.02646086],
       [0.07276736],
       [0.0429989 ],
       [0.0275634 ],
       [0.05622933],
       [0.03748622],
       [0.03528115],
       [0.07828004],
       [0.05071665],
       [0.03858875],
       [0.08710033],
       [0.05512679],
       [0.06394708],
       [0.15214994],
       [0.09481808],
       [0.09481808],
       [0.14222712],
       [0.11135612],
       [0.10033076],
       [0.15545755],
       [0.12017641],
       [0.10363837],
       [0.14112459],
       [0.1168688 ],
       [0.10033076],
       [0.13009923],
       [0.11025358],
       [0.09481808],
       [0.11025358],
       [0.10033076],
       [0.10584344],
       [0.13009923],
       [0.11466373],
       [0.11907387],
       [0.17309813],
       [0.13781698],
       [0.12568908],
       [0.17309813],
       [0.14222712],
       [0.13009923],
       [0.16648291],
       [0.14332966],
       [0.13891951],
       [0.16979052],
       [0.149

In [7]:
# Load the second dataset (transactions with a `fraud` column) and rebind
# `df` to it; the frame loaded earlier is no longer reachable under this name.
transactions_csv_path = "./dataset/bs140513_032310.csv/bs140513_032310.csv"
df = pd.read_csv(transactions_csv_path)
df

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0
...,...,...,...,...,...,...,...,...,...,...
594638,179,'C1753498738','3','F','28007','M1823072687','28007','es_transportation',20.53,0
594639,179,'C650108285','4','F','28007','M1823072687','28007','es_transportation',50.73,0
594640,179,'C123623130','2','F','28007','M349281107','28007','es_fashion',22.44,0
594641,179,'C1499363341','5','M','28007','M1823072687','28007','es_transportation',14.46,0


In [8]:
# Per-category mean of the transaction amount and of the fraud flag
# (the mean of a 0/1 flag is the fraction of flagged rows in that category).
by_category = df.groupby('category')
by_category[['amount', 'fraud']].mean()

Unnamed: 0_level_0,amount,fraud
category,Unnamed: 1_level_1,Unnamed: 2_level_1
'es_barsandrestaurants',43.461014,0.018829
'es_contents',44.547571,0.0
'es_fashion',65.666642,0.017973
'es_food',37.070405,0.0
'es_health',135.621367,0.105126
'es_home',165.670846,0.152064
'es_hotelservices',205.614249,0.31422
'es_hyper',45.970421,0.045917
'es_leisure',288.911303,0.9499
'es_otherservices',135.881524,0.25


In [9]:
# Encode the string-valued `category` column as integer codes.
# Bracket assignment is used instead of attribute assignment, and the encoded
# array is assigned directly (positionally) rather than wrapped in a new
# `pd.Series`, whose fresh 0..n-1 index would be aligned against `df.index`
# and could silently produce NaN if `df` ever carried a non-default index.
# NOTE(review): the integer codes impose an arbitrary ordering on categories;
# distance-based models will treat that ordering as meaningful.
df["category"] = LabelEncoder().fit_transform(df["category"])
df

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007',12,4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007',12,39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007',12,26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007',12,17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007',12,35.72,0
...,...,...,...,...,...,...,...,...,...,...
594638,179,'C1753498738','3','F','28007','M1823072687','28007',12,20.53,0
594639,179,'C650108285','4','F','28007','M1823072687','28007',12,50.73,0
594640,179,'C123623130','2','F','28007','M349281107','28007',2,22.44,0
594641,179,'C1499363341','5','M','28007','M1823072687','28007',12,14.46,0


In [10]:
# Split the frame into model inputs and target. Both selections use a list
# of column names, so `label` is a one-column DataFrame rather than a Series.
feature_columns = ['category', 'amount']
label_columns = ['fraud']
features = df[feature_columns]
label = df[label_columns]
print(features, label)

        category  amount
0             12    4.55
1             12   39.68
2             12   26.89
3             12   17.25
4             12   35.72
...          ...     ...
594638        12   20.53
594639        12   50.73
594640         2   22.44
594641        12   14.46
594642        12   26.93

[594643 rows x 2 columns]         fraud
0           0
1           0
2           0
3           0
4           0
...       ...
594638      0
594639      0
594640      0
594641      0
594642      0

[594643 rows x 1 columns]


In [11]:
# Oversample with SMOTE (fixed seed for repeatability), then show the
# per-class row counts of the resampled labels.
# NOTE(review): resampling is applied to the full data before the
# train/test split in the next cell, so synthetic samples can end up in the
# test set — consider splitting first and resampling only the training part.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(features, label)
x_res, y_res = resampled
y_res.value_counts()

fraud
0        587443
1        587443
dtype: int64

In [12]:
# Hold out 30% of the resampled rows for evaluation. The split is shuffled
# with a fixed seed and stratified on the label, so both parts keep the same
# class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x_res,
    y_res,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=y_res,
)

In [13]:
# Display the training features (`category`, `amount`) produced by the split.
x_train

Unnamed: 0,category,amount
903303,10,302.701771
721176,10,599.604694
680561,10,321.541503
1154749,13,2086.201087
752766,5,1135.689926
...,...,...
705232,10,98.665784
652698,4,99.233569
465780,12,4.260000
246158,12,32.800000


In [17]:
# Show the training labels as a NumPy array. It is two-dimensional (one
# column) because `label` was selected as a one-column DataFrame, which is
# why the model cells flatten it with `.ravel()` before fitting.
y_train.values

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [1]], dtype=int64)

In [16]:
# KNN with k=2 neighbours and Minkowski power p=1 (Manhattan / L1 distance).
knn = KNeighborsClassifier(n_neighbors=2, p=1)

# Labels are flattened to 1-D because `y_train` is a one-column DataFrame.
train_labels = y_train.to_numpy().ravel()
knn.fit(x_train, train_labels)
y_pred = knn.predict(x_test)

# Compute the three evaluation artefacts first, then print them.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Classification report:\n', report)
print('Confusion matrix:\n', matrix)
print('Accuracy score:', accuracy)

Classification report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95    176233
           1       0.97      0.93      0.95    176233

    accuracy                           0.95    352466
   macro avg       0.95      0.95      0.95    352466
weighted avg       0.95      0.95      0.95    352466

Confusion matrix:
 [[171442   4791]
 [ 12045 164188]]
Accuracy score: 0.9522336906254788


In [None]:
# KNN with k=3 neighbours and Minkowski power p=2 (Euclidean / L2 distance).
knn = KNeighborsClassifier(n_neighbors=3, p=2)

# Labels are flattened to 1-D because `y_train` is a one-column DataFrame.
train_labels = y_train.to_numpy().ravel()
knn.fit(x_train, train_labels)
y_pred = knn.predict(x_test)

# Compute the three evaluation artefacts first, then print them.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Classification report:\n', report)
print('Confusion matrix:\n', matrix)
print('Accuracy score:', accuracy)

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96    176233
           1       0.96      0.97      0.96    176233

    accuracy                           0.96    352466
   macro avg       0.96      0.96      0.96    352466
weighted avg       0.96      0.96      0.96    352466

Confusion matrix:
 [[168549   7684]
 [  5004 171229]]
Accuracy score: 0.9640022016307956


In [None]:
# KNN with k=4 neighbours and Minkowski power p=3.
knn = KNeighborsClassifier(n_neighbors=4, p=3)

# Labels are flattened to 1-D because `y_train` is a one-column DataFrame.
train_labels = y_train.to_numpy().ravel()
knn.fit(x_train, train_labels)
y_pred = knn.predict(x_test)

# Compute the three evaluation artefacts first, then print them.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Classification report:\n', report)
print('Confusion matrix:\n', matrix)
print('Accuracy score:', accuracy)

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96    176233
           1       0.96      0.96      0.96    176233

    accuracy                           0.96    352466
   macro avg       0.96      0.96      0.96    352466
weighted avg       0.96      0.96      0.96    352466

Confusion matrix:
 [[169886   6347]
 [  7244 168989]]
Accuracy score: 0.9614402523931387


In [None]:
# KNN with k=5 neighbours; no `p` is given, so the estimator's default
# distance is used (same measure as the p=2 run, per scikit-learn defaults).
knn = KNeighborsClassifier(n_neighbors=5)

# Labels are flattened to 1-D because `y_train` is a one-column DataFrame.
train_labels = y_train.to_numpy().ravel()
knn.fit(x_train, train_labels)
y_pred = knn.predict(x_test)

# Compute the three evaluation artefacts first, then print them.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Classification report:\n', report)
print('Confusion matrix:\n', matrix)
print('Accuracy score:', accuracy)

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.97    176233
           1       0.96      0.98      0.97    176233

    accuracy                           0.97    352466
   macro avg       0.97      0.97      0.97    352466
weighted avg       0.97      0.97      0.97    352466

Confusion matrix:
 [[168224   8009]
 [  4085 172148]]
Accuracy score: 0.9656874705645367


In [None]:
# KNN with k=10 neighbours; no `p` is given, so the estimator's default
# distance is used (same measure as the p=2 run, per scikit-learn defaults).
knn = KNeighborsClassifier(n_neighbors=10)

# Labels are flattened to 1-D because `y_train` is a one-column DataFrame.
train_labels = y_train.to_numpy().ravel()
knn.fit(x_train, train_labels)
y_pred = knn.predict(x_test)

# Compute the three evaluation artefacts first, then print them.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Classification report:\n', report)
print('Confusion matrix:\n', matrix)
print('Accuracy score:', accuracy)

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97    176233
           1       0.96      0.98      0.97    176233

    accuracy                           0.97    352466
   macro avg       0.97      0.97      0.97    352466
weighted avg       0.97      0.97      0.97    352466

Confusion matrix:
 [[168783   7450]
 [  4271 171962]]
Accuracy score: 0.9667457286660274
