In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
diab=pd.read_csv("diabetes.csv")
adu=pd.read_csv("adult.csv")
print("\nDIABETES DATASET CLEANING:\n")
print(diab.head())


DIABETES DATASET CLEANING:

    ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  


In [48]:
print("Missing values in diabetes dataset are:")
print(diab.isna())
print("Filling missing values in diabetes dataset:")
print(diab.fillna(diab.mean(numeric_only=True), inplace=True))

Missing values in diabetes dataset are:
        ID  No_Pation  Gender    AGE   Urea     Cr  HbA1c   Chol     TG  \
0    False      False   False  False  False  False  False  False  False   
1    False      False   False  False  False  False  False  False  False   
2    False      False   False  False  False  False  False  False  False   
3    False      False   False  False  False  False  False  False  False   
4    False      False   False  False  False  False  False  False  False   
..     ...        ...     ...    ...    ...    ...    ...    ...    ...   
995  False      False   False  False  False  False  False  False  False   
996  False      False   False  False  False  False  False  False  False   
997  False      False   False  False  False  False  False  False  False   
998  False      False   False  False  False  False  False  False  False   
999  False      False   False  False  False  False  False  False  False   

       HDL    LDL   VLDL    BMI  CLASS  
0    False  False 

In [49]:
le = LabelEncoder()
diab['Gender'] = le.fit_transform(diab['Gender'])
diab['CLASS'] = le.fit_transform(diab['CLASS'])
print("After handling categorical data:")
print(diab[['Gender', 'CLASS']].head())



After handling categorical data:
   Gender  CLASS
0       0      0
1       1      0
2       0      0
3       0      0
4       1      0


In [50]:
Q1 = diab.quantile(0.25)
Q3 = diab.quantile(0.75)
IQR = Q3 - Q1
diab = diab[~((diab < (Q1 - 1.5 * IQR)) |
              (diab > (Q3 + 1.5 * IQR))).any(axis=1)]
print("\nAfter handling outliers:")
print(diab.head())



After handling outliers:
      ID  No_Pation  Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
169  714      34268       1   45   3.6  80    5.0   6.1  3.7  0.7  3.9   1.7   
170   87      24000       0   50   4.0  56   13.7   4.4  2.0  1.0  2.5   0.9   
171  231      14389       0   50   4.0  56   13.7   4.4  2.0  1.0  2.5   0.9   
172  505      23044       0   48   4.0  38    6.5   4.4  2.3  1.3  2.2   1.0   
174    7      34278       0   46   3.0  59    5.1   5.7  3.8  1.3  2.8   1.7   

      BMI  CLASS  
169  22.0      3  
170  29.0      3  
171  29.0      3  
172  23.0      3  
174  24.0      3  


In [51]:
print("DATA TRANSFORMATIONS")
minmax = MinMaxScaler()
diab_minmax = minmax.fit_transform(diab)

diab_minmax = pd.DataFrame(diab_minmax, columns=diab.columns)
standard = StandardScaler()
diab_standard = standard.fit_transform(diab)

diab_standard = pd.DataFrame(diab_standard, columns=diab.columns)
print("Original Data (after cleaning):")
print(diab.head())

print("\nMin-Max Normalized Data:")
print(diab_minmax.head())

print("\nStandard Scaled Data:")
print(diab_standard.head())

DATA TRANSFORMATIONS
Original Data (after cleaning):
      ID  No_Pation  Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
169  714      34268       1   45   3.6  80    5.0   6.1  3.7  0.7  3.9   1.7   
170   87      24000       0   50   4.0  56   13.7   4.4  2.0  1.0  2.5   0.9   
171  231      14389       0   50   4.0  56   13.7   4.4  2.0  1.0  2.5   0.9   
172  505      23044       0   48   4.0  38    6.5   4.4  2.3  1.3  2.2   1.0   
174    7      34278       0   46   3.0  59    5.1   5.7  3.8  1.3  2.8   1.7   

      BMI  CLASS  
169  22.0      3  
170  29.0      3  
171  29.0      3  
172  23.0      3  
174  24.0      3  

Min-Max Normalized Data:
         ID  No_Pation  Gender       AGE      Urea        Cr     HbA1c  \
0  0.892231   0.452186     0.5  0.193548  0.328947  0.690476  0.230769   
1  0.106516   0.316206     0.0  0.354839  0.381579  0.404762  0.900000   
2  0.286967   0.188926     0.0  0.354839  0.381579  0.404762  0.900000   
3  0.630326   0.303545     0.0

In [52]:
print("\n\nADULT DATASET CLEANING:\n\n")
print(adult.head())




ADULT DATASET CLEANING:


   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-p

In [53]:
print("\n\nADULT DATASET CLEANING:\n\n")
print(adult.head())




ADULT DATASET CLEANING:


   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-p

In [54]:
adu.replace("?", np.nan, inplace=True)
adu.fillna(adu.mode().iloc[0], inplace=True)
print("After handling missing values:")
print(adu.head())


After handling missing values:
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18    Private  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4     Prof-specialty    Own-child  White  Female             0             0   

   hour

In [55]:
le = LabelEncoder()

categorical_cols = adu.select_dtypes(include='object').columns

for col in categorical_cols:
    adu[col] = le.fit_transform(adu[col])

print("\nAfter categorical data handling:")
print(adu.head())



After categorical data handling:
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0   25          3  226802          1                7               4   
1   38          3   89814         11                9               2   
2   28          1  336951          7               12               2   
3   44          3  160323         15               10               2   
4   18          3  103497         15               10               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1             0             0   
1           4             0     4       1             0             0   
2          10             0     4       1             0             0   
3           6             0     2       1          7688             0   
4           9             3     4       0             0             0   

   hours-per-week  native-country  income  
0              40              38       0  


In [56]:
Q1 = adu.quantile(0.25)
Q3 = adu.quantile(0.75)
IQR = Q3 - Q1

adu = adu[~((adu < (Q1 - 1.5 * IQR)) |
            (adu > (Q3 + 1.5 * IQR))).any(axis=1)]

print("\nAfter outlier handling:")
print(adu.head())
print("\nNumber of rows after outlier removal:", adu.shape[0])




After outlier handling:
    age  workclass  fnlwgt  education  educational-num  marital-status  \
1    38          3   89814         11                9               2   
8    24          3  369667         15               10               4   
12   26          3   82091         11                9               4   
13   58          3  299831         11                9               2   
24   25          3  205947          9               13               2   

    occupation  relationship  race  gender  capital-gain  capital-loss  \
1            4             0     4       1             0             0   
8            7             4     4       0             0             0   
12           0             1     4       0             0             0   
13           9             0     4       1             0             0   
24           9             0     4       1             0             0   

    hours-per-week  native-country  income  
1               50              38      

In [57]:
minmax = MinMaxScaler()
adu_minmax = minmax.fit_transform(adu)

adu_minmax = pd.DataFrame(adu_minmax, columns=adu.columns)

print("\nAfter Min-Max Scaling:")
print(adu_minmax.head())



After Min-Max Scaling:
        age  workclass    fnlwgt  education  educational-num  marital-status  \
0  0.344262        0.0  0.188277   0.555556         0.363636        0.333333   
1  0.114754        0.0  0.881156   1.000000         0.454545        0.666667   
2  0.147541        0.0  0.169156   0.555556         0.363636        0.666667   
3  0.672131        0.0  0.708251   0.555556         0.363636        0.333333   
4  0.131148        0.0  0.475807   0.333333         0.727273        0.333333   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0    0.307692           0.0   0.0     1.0           0.0           0.0   
1    0.538462           0.8   0.0     0.0           0.0           0.0   
2    0.000000           0.2   0.0     0.0           0.0           0.0   
3    0.692308           0.0   0.0     1.0           0.0           0.0   
4    0.692308           0.0   0.0     1.0           0.0           0.0   

   hours-per-week  native-country  income  
0        0.8

In [58]:
standard = StandardScaler()
adu_standard = standard.fit_transform(adu)

adu_standard = pd.DataFrame(adu_standard, columns=adu.columns)

print("\nAfter Standard Scaling:")
print(adu_standard.head())



After Standard Scaling:
        age  workclass    fnlwgt  education  educational-num  marital-status  \
0  0.220179        0.0 -1.022983  -0.151256        -0.654083       -0.398228   
1 -0.955630        0.0  2.234629   1.457372        -0.073261        0.828047   
2 -0.787657        0.0 -1.112882  -0.151256        -0.654083        0.828047   
3  1.899906        0.0  1.421707  -0.151256        -0.654083       -0.398228   
4 -0.871644        0.0  0.328856  -0.955571         1.669207       -0.398228   

   occupation  relationship  race    gender  capital-gain  capital-loss  \
0   -0.420679     -1.044582   0.0  0.770972           0.0           0.0   
1    0.305840      1.629927   0.0 -1.297064           0.0           0.0   
2   -1.389371     -0.375955   0.0 -1.297064           0.0           0.0   
3    0.790186     -1.044582   0.0  0.770972           0.0           0.0   
4    0.790186     -1.044582   0.0  0.770972           0.0           0.0   

   hours-per-week  native-country  income  