In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

### 1.Load Data

In [3]:
# Load dataset from the provided link
url = 'https://media.githubusercontent.com/media/Marsh16/smoking-drinking-ALP-ML/main/smoking_drinking_data.csv'
df = pd.read_csv(url)

# Display the first few rows of the dataset
df.head()
df.describe()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,...,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd
count,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,...,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0,991346.0
mean,47.614491,162.240625,63.28405,81.233358,0.980834,0.978429,1.031495,1.030476,122.432498,76.052627,...,56.9368,113.037692,132.141751,14.229824,1.094224,0.860467,25.989308,25.755051,37.136347,1.608122
std,14.181339,9.282957,12.514241,11.850323,0.605949,0.604774,0.17465,0.171892,14.543148,9.889365,...,17.238479,35.842812,102.196985,1.584929,0.437724,0.48053,23.493386,26.308599,50.424153,0.818507
min,20.0,130.0,25.0,8.0,0.1,0.1,1.0,1.0,67.0,32.0,...,1.0,1.0,1.0,1.0,1.0,0.1,1.0,1.0,1.0,1.0
25%,35.0,155.0,55.0,74.1,0.7,0.7,1.0,1.0,112.0,70.0,...,46.0,89.0,73.0,13.2,1.0,0.7,19.0,15.0,16.0,1.0
50%,45.0,160.0,60.0,81.0,1.0,1.0,1.0,1.0,120.0,76.0,...,55.0,111.0,106.0,14.3,1.0,0.8,23.0,20.0,23.0,1.0
75%,60.0,170.0,70.0,87.8,1.2,1.2,1.0,1.0,131.0,82.0,...,66.0,135.0,159.0,15.4,1.0,1.0,28.0,29.0,39.0,2.0
max,85.0,190.0,140.0,999.0,9.9,9.9,2.0,2.0,273.0,185.0,...,8110.0,5119.0,9490.0,25.0,6.0,98.0,9999.0,7210.0,999.0,3.0


### 2. Data Preprocessing

In [4]:
# Data Preprocessing
df = df.where((pd.notnull(df)), '')  # Handling missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991346 entries, 0 to 991345
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sex               991346 non-null  object 
 1   age               991346 non-null  int64  
 2   height            991346 non-null  int64  
 3   weight            991346 non-null  int64  
 4   waistline         991346 non-null  float64
 5   sight_left        991346 non-null  float64
 6   sight_right       991346 non-null  float64
 7   hear_left         991346 non-null  float64
 8   hear_right        991346 non-null  float64
 9   SBP               991346 non-null  float64
 10  DBP               991346 non-null  float64
 11  BLDS              991346 non-null  float64
 12  tot_chole         991346 non-null  float64
 13  HDL_chole         991346 non-null  float64
 14  LDL_chole         991346 non-null  float64
 15  triglyceride      991346 non-null  float64
 16  hemoglobin        99

In [5]:
# Check if there are missing values in the dataset
missing_values = df.isnull().sum()
print(missing_values)

duplicates = df.duplicated().sum()
print("")
print(df.index[df.duplicated(keep=False)])
print("duplicates : {:0.0f}".format(duplicates))

df.drop_duplicates(inplace=True)

duplicates = df.duplicated().sum()
print("duplicates after clearing: {:0.0f}".format(duplicates))


sex                 0
age                 0
height              0
weight              0
waistline           0
sight_left          0
sight_right         0
hear_left           0
hear_right          0
SBP                 0
DBP                 0
BLDS                0
tot_chole           0
HDL_chole           0
LDL_chole           0
triglyceride        0
hemoglobin          0
urine_protein       0
serum_creatinine    0
SGOT_AST            0
SGOT_ALT            0
gamma_GTP           0
SMK_stat_type_cd    0
DRK_YN              0
dtype: int64

Int64Index([ 12101,  36972,  39808,  61934,  82306,  82607,  99422, 115929,
            118930, 126538, 133412, 159911, 175152, 184489, 186560, 211709,
            231468, 246305, 271717, 280830, 284051, 284528, 290463, 323132,
            335747, 354088, 429596, 445608, 453451, 471596, 479756, 555137,
            558263, 568854, 606663, 626044, 629549, 668305, 671067, 686628,
            727207, 746077, 770036, 779854, 794384, 803956, 804343, 834790,
  

In [6]:
# Check which column needs fixing
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values in '{column}':")
    print(unique_values)
    print()

Unique values in 'sex':
['Male' 'Female']

Unique values in 'age':
[35 30 40 50 45 55 65 25 60 20 70 75 80 85]

Unique values in 'height':
[170 180 165 175 150 155 160 145 140 185 135 190 130]

Unique values in 'weight':
[ 75  80  60  55  65  50  85  70  45  40  95 120  90  35 105 100 110 115
 130  30 125 140  25 135]

Unique values in 'waistline':
[ 90.   89.   91.   80.   75.   69.   84.2  84.   82.   79.2  98.   72.3
  88.   76.   73.   78.   99.   85.   67.   62.   92.   79.   87.   70.
  67.5  87.3  71.   92.9  94.   79.3  77.   75.7  85.5  74.   60.   81.
  72.   65.   63.   81.6  83.   61.  110.   86.8  73.5  93.  109.   54.
  91.2  66.   79.5  86.   97.1  76.2  80.5  68.   64.   74.1  85.9  65.3
  95.   94.5 100.   85.4  77.6  73.3 103.   93.5  67.8  69.2 105.7 105.
  74.2  97.   75.4  83.2  88.5  85.3  87.4  71.5  64.1  76.6  93.1  84.8
  88.1  66.8  96.   81.5  80.1  87.2  86.5 104.  114.   56.   88.8  89.2
  66.2  90.8  88.2  82.5  65.4  72.2  81.3  75.6  87.8  77.2  98.5  8

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 991320 entries, 0 to 991345
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sex               991320 non-null  int64  
 1   age               991320 non-null  int64  
 2   height            991320 non-null  int64  
 3   weight            991320 non-null  int64  
 4   waistline         991320 non-null  float64
 5   sight_left        991320 non-null  float64
 6   sight_right       991320 non-null  float64
 7   hear_left         991320 non-null  float64
 8   hear_right        991320 non-null  float64
 9   SBP               991320 non-null  float64
 10  DBP               991320 non-null  float64
 11  BLDS              991320 non-null  float64
 12  tot_chole         991320 non-null  float64
 13  HDL_chole         991320 non-null  float64
 14  LDL_chole         991320 non-null  float64
 15  triglyceride      991320 non-null  float64
 16  hemoglobin        99

In [53]:
# Create a copy of the DataFrame
df = df.copy()

# Encode categorical features
label_encoder = LabelEncoder()
df['sex'] = label_encoder.fit_transform(df['sex'])
df['DRK_YN'] = label_encoder.fit_transform(df['DRK_YN'])

# Select features and target variable
x = df[['sex', 'age', 'height', 'weight','waistline', 'sight_left', 'sight_right',
        'hear_left', 'hear_right', 'SBP', 'DBP', 'BLDS', 'tot_chole', 'HDL_chole',
        'LDL_chole', 'triglyceride', 'hemoglobin', 'urine_protein',
        'serum_creatinine', 'SGOT_AST', 'SGOT_ALT', 'gamma_GTP']]
y = df[["SMK_stat_type_cd"]]
y2 = df[["DRK_YN"]]

# Display the selected features and target variable
print("Selected Features (x):")
print(x.head())  # Display the first few rows of x
print("\nTarget Variable (y):")
print(y.head())  # Display the first few rows of y

Selected Features (x):
   sex  age  height  weight  waistline  sight_left  sight_right  hear_left  \
0    1   35     170      75       90.0         1.0          1.0        1.0   
1    1   30     180      80       89.0         0.9          1.2        1.0   
2    1   40     165      75       91.0         1.2          1.5        1.0   
3    1   50     175      80       91.0         1.5          1.2        1.0   
4    1   50     165      60       80.0         1.0          1.2        1.0   

   hear_right    SBP  ...  tot_chole  HDL_chole  LDL_chole  triglyceride  \
0         1.0  120.0  ...      193.0       48.0      126.0          92.0   
1         1.0  130.0  ...      228.0       55.0      148.0         121.0   
2         1.0  120.0  ...      136.0       41.0       74.0         104.0   
3         1.0  145.0  ...      201.0       76.0      104.0         106.0   
4         1.0  138.0  ...      199.0       61.0      117.0         104.0   

   hemoglobin  urine_protein  serum_creatinine  SGO

In [9]:
# ReCheck which column needs fixing
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values in '{column}':")
    print(unique_values)
    print()


Unique values in 'sex':
[1 0]

Unique values in 'age':
[35 30 40 50 45 55 65 25 60 20 70 75 80 85]

Unique values in 'height':
[170 180 165 175 150 155 160 145 140 185 135 190 130]

Unique values in 'weight':
[ 75  80  60  55  65  50  85  70  45  40  95 120  90  35 105 100 110 115
 130  30 125 140  25 135]

Unique values in 'waistline':
[ 90.   89.   91.   80.   75.   69.   84.2  84.   82.   79.2  98.   72.3
  88.   76.   73.   78.   99.   85.   67.   62.   92.   79.   87.   70.
  67.5  87.3  71.   92.9  94.   79.3  77.   75.7  85.5  74.   60.   81.
  72.   65.   63.   81.6  83.   61.  110.   86.8  73.5  93.  109.   54.
  91.2  66.   79.5  86.   97.1  76.2  80.5  68.   64.   74.1  85.9  65.3
  95.   94.5 100.   85.4  77.6  73.3 103.   93.5  67.8  69.2 105.7 105.
  74.2  97.   75.4  83.2  88.5  85.3  87.4  71.5  64.1  76.6  93.1  84.8
  88.1  66.8  96.   81.5  80.1  87.2  86.5 104.  114.   56.   88.8  89.2
  66.2  90.8  88.2  82.5  65.4  72.2  81.3  75.6  87.8  77.2  98.5  85.2
  97.5  

### Untuk Model Smoker


In [73]:
# Select the first 1000 columns from x
x_smoker = x.iloc[:1000]

# Select the first 1000 columns from y
y_smoker = y.iloc[:1000]

# Split the dataset into training and testing sets
train_x_smoker, test_x_smoker, train_y_smoker, test_y_smoker = train_test_split(x_smoker, y_smoker, test_size=0.20, random_state=0)

print("Train set:", train_x_smoker.shape, train_y_smoker.shape)
print("Test set:", test_x_smoker.shape, test_y_smoker.shape)

Train set: (800, 22) (800, 1)
Test set: (200, 22) (200, 1)


In [74]:
# Model Generation Random Forest
train_y__smoker_flattened = train_y_smoker.values.ravel()
model = RandomForestClassifier(n_estimators=600,random_state=1)
model.fit(train_x_smoker, train_y__smoker_flattened)

In [75]:
# Model Generation Decision Tree
model_tree = DecisionTreeClassifier(max_depth=2)
model_tree.fit(train_x_smoker, train_y_smoker)

In [76]:
# Create a KNN model
knn_model = KNeighborsClassifier()

# Define the hyperparameter grid to search
param_grid = {'n_neighbors': np.arange(1, 11)}  # Adjust the range as needed

# Use GridSearchCV to perform the grid search with 'f1_weighted' scoring
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='f1_weighted')  # You can adjust the cross-validation folds (cv) as needed

# Fit the grid search to the data
grid_search.fit(train_x_smoker, train_y__smoker_flattened)

# Print the best parameter(s) found
print("Best parameters:", grid_search.best_params_)

# Print the best cross-validated score
print("Best cross-validated score:", grid_search.best_score_)

Best parameters: {'n_neighbors': 7}
Best cross-validated score: 0.5555767704698401


In [79]:
# Create a KNN model
knn_model = KNeighborsClassifier(n_neighbors=7)  # You can adjust the number of neighbors (k) as needed

# Train the KNN model
knn_model.fit(train_x_smoker, train_y_smoker)

  return self._fit(X, y)


In [78]:
# Model Generation SVM
model_svm = svm.SVC(kernel = 'rbf', probability =True)
model_svm.fit(train_x_smoker, train_y_smoker)

  y = column_or_1d(y, warn=True)


### Untuk Model Drinkers

In [81]:
# Select the first 1000 columns from x
x_drinker = x.iloc[:1000]

# Select the first 1000 columns from y
y2_drinker = y2.iloc[:1000]

# Split the dataset into training and testing sets
train_x_drinker, test_x_drinker, train_y_drinker, test_y_drinker = train_test_split(x_drinker, y2_drinker, test_size=0.20, random_state=0)

print("Train set:", train_x_drinker.shape, train_y_drinker.shape)
print("Test set:", test_x_drinker.shape, test_y_drinker.shape)

Train set: (800, 22) (800, 1)
Test set: (200, 22) (200, 1)


In [82]:
# Model Generation Random Forest
train_y__drinker_flattened = train_y_drinker.values.ravel()
model = RandomForestClassifier(n_estimators=600,random_state=1)
model.fit(train_x_drinker, train_y__drinker_flattened)

In [83]:
# Model Generation Decision Tree
model_tree = DecisionTreeClassifier(max_depth=2)
model_tree.fit(train_x_drinker, train_y_drinker)

In [84]:
# Model Generation SVM
model_svm = svm.SVC(kernel = 'rbf', probability =True)
model_svm.fit(train_x_drinker, train_y_drinker)

  y = column_or_1d(y, warn=True)


In [85]:
# Create a KNN model
knn_model = KNeighborsClassifier()

# Define the hyperparameter grid to search
param_grid = {'n_neighbors': np.arange(1, 11)}  # Adjust the range as needed

# Use GridSearchCV to perform the grid search with 'f1_weighted' scoring
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='f1_weighted')  # You can adjust the cross-validation folds (cv) as needed

# Fit the grid search to the data
grid_search.fit(train_x_drinker, train_y__drinker_flattened)

# Print the best parameter(s) found
print("Best parameters:", grid_search.best_params_)

# Print the best cross-validated score
print("Best cross-validated score:", grid_search.best_score_)

Best parameters: {'n_neighbors': 9}
Best cross-validated score: 0.6330360771750927


In [86]:
# Create a KNN model
knn_model = KNeighborsClassifier(n_neighbors=9)  # You can adjust the number of neighbors (k) as needed

# Train the KNN model
knn_model.fit(train_x_drinker, train_y_drinker)

  return self._fit(X, y)
