SAME FOR PROBLEM STATEMENT 14, 15, 16 & 26

##**PREPROCESSING**

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, classification_report

import io #
from google.colab import files #

In [None]:
# Upload the dataset
# Download 'heart_cleveland_upload.csv' from Kaggle and upload here
uploaded = files.upload()

# Load the dataset (assumes single CSV upload)
df = pd.read_csv(io.BytesIO(list(uploaded.values())[0]))

Saving adult_dataset_FINAL.csv to adult_dataset_FINAL (1).csv


In [None]:
print(df.shape)
df.head()

(48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


##**A: DATA CLEANING**

In [None]:
# 1. Replace '?' with NaN
df.replace('?', np.nan, inplace=True)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [None]:
df.shape

(48842, 15)

In [None]:
# 4. Drop rows with any NaNs remaining
df.dropna(inplace=True)

df.shape

(45222, 15)

In [None]:
le = LabelEncoder()
df[df.select_dtypes(include=['object','category']).columns] = df[df.select_dtypes(include=['object','category']).columns].apply(le.fit_transform)
#

In [None]:
# 3. Remove negative values (for columns that shouldn't be negative)
for col in df.columns:
    if df[col].min() < 0:
        df = df[df[col] >= 0]

df.shape

(45222, 15)

In [None]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,38,0
1,38,2,89814,11,9,2,4,0,4,1,0,0,50,38,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,38,1
3,44,2,160323,15,10,2,6,0,2,1,7688,0,40,38,1
5,34,2,198693,0,6,4,7,1,4,1,0,0,30,38,0


##**B: ERROR CORRECTING (Outlier Detection)**

In [None]:
cols = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']

Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows where all specified columns are within IQR bounds
df = df[~((df[cols] < lower_bound) | (df[cols] > upper_bound)).any(axis=1)]

df.shape


(28795, 15)

##**C: DATA TRANSFORMATION**

In [None]:
# 1. Split into features and target
X = df.drop(columns='income')  # Assuming 'target' column exists
y = df['income']

In [None]:
y.unique()

array([0, 1])

In [None]:
# 3. Skewness resolution (using PowerTransformer for centering and normality)
pt = PowerTransformer()
X = pd.DataFrame(pt.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,-1.170793,-0.075866,0.477356,-2.132594,-1.217276,0.968741,0.220462,1.13015,-2.372309,0.68778,0.0,0.0,-0.250536,0.310864
1,0.10522,-0.075866,-1.016943,0.098744,-0.45878,-0.320052,-0.264618,-1.086547,0.423228,0.68778,0.0,0.0,1.814819,0.310864
2,-0.837126,-1.32479,1.311927,-1.01946,0.802611,-0.320052,1.030234,-1.086547,0.423228,0.68778,0.0,0.0,-0.250536,0.310864
3,-1.28889,-0.075866,1.527766,1.454894,-0.053896,0.968741,0.439183,1.378845,0.423228,-1.453954,0.0,0.0,-0.250536,0.310864
4,-0.066278,-3.007258,0.35231,-0.4924,1.252772,-0.320052,-1.66818,-1.086547,0.423228,0.68778,0.0,0.0,-0.250536,0.310864


In [None]:
# 4. Scaling (mean = 0, std = 1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,-1.170793,-0.075866,0.477356,-2.132594,-1.217276,0.968741,0.220462,1.13015,-2.372309,0.68778,0.0,0.0,-0.250536,0.310864
1,0.10522,-0.075866,-1.016943,0.098744,-0.45878,-0.320052,-0.264618,-1.086547,0.423228,0.68778,0.0,0.0,1.814819,0.310864
2,-0.837126,-1.32479,1.311927,-1.01946,0.802611,-0.320052,1.030234,-1.086547,0.423228,0.68778,0.0,0.0,-0.250536,0.310864
3,-1.28889,-0.075866,1.527766,1.454894,-0.053896,0.968741,0.439183,1.378845,0.423228,-1.453954,0.0,0.0,-0.250536,0.310864
4,-0.066278,-3.007258,0.35231,-0.4924,1.252772,-0.320052,-1.66818,-1.086547,0.423228,0.68778,0.0,0.0,-0.250536,0.310864


##**D. DATA MODELING**

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 1. Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

acc_lr

0.8267060253516235

In [None]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      4559
           1       0.66      0.35      0.46      1200

    accuracy                           0.83      5759
   macro avg       0.75      0.65      0.68      5759
weighted avg       0.81      0.83      0.81      5759



In [None]:
# 2. k-NN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)

acc_knn

0.8171557562076749

In [None]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      4559
           1       0.58      0.47      0.52      1200

    accuracy                           0.82      5759
   macro avg       0.72      0.69      0.70      5759
weighted avg       0.81      0.82      0.81      5759



In [None]:
# 3. Naive-Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

acc_nb

0.7808647334606702

In [None]:
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       0.91      0.80      0.85      4559
           1       0.48      0.70      0.57      1200

    accuracy                           0.78      5759
   macro avg       0.70      0.75      0.71      5759
weighted avg       0.82      0.78      0.79      5759

