In [146]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,roc_auc_score,roc_curve

In [89]:
df=pd.read_csv("smoking.csv")
df.columns

Index(['ID', 'gender', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)',
       'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)',
       'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol',
       'triglyceride', 'HDL', 'LDL', 'hemoglobin', 'Urine protein',
       'serum creatinine', 'AST', 'ALT', 'Gtp', 'oral', 'dental caries',
       'tartar', 'smoking'],
      dtype='object')

In [90]:
df[["systolic","relaxation"]]

Unnamed: 0,systolic,relaxation
0,114.0,73.0
1,119.0,70.0
2,138.0,86.0
3,100.0,60.0
4,120.0,74.0
...,...,...
55687,110.0,68.0
55688,101.0,62.0
55689,117.0,72.0
55690,133.0,76.0


In [91]:
df.duplicated().sum()

0

In [92]:
df_g=df[(df["hemoglobin"]>16)&(df["relaxation"]>100)&(df["systolic"]>160)]

In [176]:
df_g[["systolic","relaxation","Gtp","triglyceride","serum creatinine"]]

Unnamed: 0,systolic,relaxation,Gtp,triglyceride,serum creatinine
2015,164.0,111.0,35.0,90.0,1.1
3036,168.0,114.0,76.0,183.0,1.2
3072,167.0,105.0,65.0,113.0,0.9
4355,172.0,108.0,61.0,184.0,1.1
4958,163.0,101.0,28.0,83.0,1.2
...,...,...,...,...,...
50903,172.0,108.0,61.0,184.0,1.1
50927,195.0,110.0,42.0,279.0,1.1
51137,199.0,122.0,64.0,213.0,1.0
54810,161.0,113.0,79.0,283.0,0.8


In [94]:
categorical_columns=df.select_dtypes(include='object').columns

In [95]:
df1=pd.get_dummies(df,columns=categorical_columns,drop_first=True)


In [96]:
df1.skew()

ID                    -8.642558e-17
age                    2.680535e-01
height(cm)            -1.422381e-01
weight(kg)             5.340423e-01
waist(cm)              2.423901e-01
eyesight(left)         9.987651e+00
eyesight(right)        1.005953e+01
hearing(left)          6.009185e+00
hearing(right)         5.939591e+00
systolic               4.697796e-01
relaxation             3.946024e-01
fasting blood sugar    4.509412e+00
Cholesterol            3.923554e-01
triglyceride           1.313403e+00
HDL                    1.955282e+00
LDL                    1.067351e+01
hemoglobin            -6.552370e-01
Urine protein          5.625088e+00
serum creatinine       9.401928e+00
AST                    2.514753e+01
ALT                    3.468680e+01
Gtp                    6.744798e+00
dental caries          1.399563e+00
smoking                5.506120e-01
gender_M              -5.637891e-01
tartar_Y              -2.236128e-01
dtype: float64

In [97]:
df1.isnull().sum()

ID                     0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
dental caries          0
smoking                0
gender_M               0
tartar_Y               0
dtype: int64

In [98]:
col = []
df1_s = df1.corr()["smoking"]

for i, j in enumerate(df1_s):
    if (j > 0.20 or j < -0.20) and df1_s.index[i] != 'smoking':
        col.append(df1_s.index[i])


In [99]:
col

['height(cm)',
 'weight(kg)',
 'waist(cm)',
 'triglyceride',
 'hemoglobin',
 'serum creatinine',
 'Gtp',
 'gender_M']

In [100]:
len(col)

8

In [101]:
from sklearn.preprocessing import StandardScaler
X=df1[col]
y=df1['smoking']
# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, perform feature scaling if needed
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Create and train the Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_scaled, y_train)

# Step 4: Train Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Make Predictions
# Predict using both models
logistic_predictions = logistic_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test)

# Step 6: Evaluate the Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the Logistic Regression model
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_precision = precision_score(y_test, logistic_predictions)
logistic_recall = recall_score(y_test, logistic_predictions)
logistic_f1 = f1_score(y_test, logistic_predictions)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)

# Print the evaluation metrics for both models
print("Logistic Regression:")
print("Accuracy:", logistic_accuracy)
print("Precision:", logistic_precision)
print("Recall:", logistic_recall)
print("F1-score:", logistic_f1)

print("\nRandom Forest:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-score:", rf_f1)


Logistic Regression:
Accuracy: 0.7309453272286561
Precision: 0.6234226256364844
Recall: 0.6848249027237354
F1-score: 0.6526828137675282

Random Forest:
Accuracy: 0.8123709489182153
Precision: 0.7343532684283728
Recall: 0.7704280155642024
F1-score: 0.751958224543081


In [102]:
X.skew()

height(cm)         -0.142238
weight(kg)          0.534042
waist(cm)           0.242390
triglyceride        1.313403
hemoglobin         -0.655237
serum creatinine    9.401928
Gtp                 6.744798
gender_M           -0.563789
dtype: float64

In [103]:
q1=df1['serum creatinine'].quantile(.25)
q3=df1['serum creatinine'].quantile(.75)
iqr=q3-q1
lb=q1-1.5*iqr
ub=q3+1.5*iqr
df2=df1[(df1["serum creatinine"]>=lb) & (df1["serum creatinine"]<=ub)]

In [104]:
df2[col].skew()

height(cm)         -0.158640
weight(kg)          0.533937
waist(cm)           0.248331
triglyceride        1.318464
hemoglobin         -0.650689
serum creatinine    0.049741
Gtp                 6.730466
gender_M           -0.609231
dtype: float64

In [105]:
q1=df2['triglyceride'].quantile(.25)
q3=df2['triglyceride'].quantile(.75)
iqr=q3-q1
lb=q1-1.5*iqr
ub=q3+1.5*iqr
df3=df2[(df2["triglyceride"]>=lb) & (df2["triglyceride"]<=ub)]

In [106]:
q1=df3['Gtp'].quantile(.25)
q3=df3['Gtp'].quantile(.75)
iqr=q3-q1
lb=q1-1.5*iqr
ub=q3+1.5*iqr
df4=df3[(df3["Gtp"]>=lb) & (df3["Gtp"]<=ub)]

In [107]:
df4

Unnamed: 0,ID,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking,gender_M,tartar_Y
0,0,40,155,60,81.3,1.2,1.0,1.0,1.0,114.0,...,12.9,1.0,0.7,18.0,19.0,27.0,0,0,0,1
1,1,40,160,60,81.0,0.8,0.6,1.0,1.0,119.0,...,12.7,1.0,0.6,22.0,19.0,18.0,0,0,0,1
2,2,55,170,60,80.0,0.8,0.8,1.0,1.0,138.0,...,15.8,1.0,1.0,21.0,16.0,22.0,0,1,1,0
3,3,40,165,70,88.0,1.5,1.5,1.0,1.0,100.0,...,14.7,1.0,1.0,19.0,26.0,18.0,0,0,1,1
4,4,40,155,60,86.0,1.0,1.0,1.0,1.0,120.0,...,12.5,1.0,0.6,16.0,14.0,22.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55686,55673,60,150,50,75.0,1.0,1.2,1.0,1.0,102.0,...,12.6,1.0,0.8,26.0,21.0,14.0,0,0,0,1
55687,55676,40,170,65,75.0,0.9,0.9,1.0,1.0,110.0,...,12.3,1.0,0.6,14.0,7.0,10.0,1,0,0,1
55688,55681,45,160,50,70.0,1.2,1.2,1.0,1.0,101.0,...,14.0,1.0,0.9,20.0,12.0,14.0,0,0,0,1
55690,55684,60,165,60,78.0,0.8,1.0,1.0,1.0,133.0,...,14.4,1.0,0.7,20.0,19.0,18.0,0,0,1,0


In [108]:
df4[col].skew()

height(cm)         -0.098786
weight(kg)          0.556465
waist(cm)           0.260604
triglyceride        0.947991
hemoglobin         -0.652759
serum creatinine    0.080439
Gtp                 1.188197
gender_M           -0.457762
dtype: float64

In [139]:
from sklearn.preprocessing import StandardScaler
X=df4[col]
y=df4['smoking']
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler(random_state=42)

# Resample the data to balance the class distribution
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=10)

# Optionally, perform feature scaling if needed
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Create and train the Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_scaled, y_train)

# Step 4: Train Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)

# Step 5: Make Predictions
# Predict using both models
logistic_predictions = logistic_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test)
rf_predictions1 = rf_model.predict(X_train)

# Step 6: Evaluate the Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the Logistic Regression model
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_precision = precision_score(y_test, logistic_predictions)
logistic_recall = recall_score(y_test, logistic_predictions)
logistic_f1 = f1_score(y_test, logistic_predictions)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_train, rf_predictions1)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)

# Print the evaluation metrics for both models
print("Logistic Regression:")
print("Accuracy:", logistic_accuracy)
print("Precision:", logistic_precision)
print("Recall:", logistic_recall)
print("F1-score:", logistic_f1)

print("\nRandom Forest:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-score:", rf_f1)


Logistic Regression:
Accuracy: 0.7666289043534552
Precision: 0.7004362578768784
Recall: 0.9334625322997416
F1-score: 0.8003323179174744

Random Forest:
Accuracy: 1.0
Precision: 0.7589261001937448
Recall: 0.8856589147286822
F1-score: 0.8174094499925474


In [110]:
y_resampled.value_counts()

0    15447
1    15447
Name: smoking, dtype: int64

In [111]:
from sklearn.decomposition import PCA
pca=PCA(n_components=.95)
X_transformed=pca.fit_transform(X)

In [112]:
X_transformed.shape

(45965, 3)

In [113]:
from sklearn.preprocessing import StandardScaler
X=df1.drop("smoking",axis=1)
y=df1['smoking']

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)


In [114]:
pca1=PCA(n_components=.95)
x_train=pca1.fit_transform(X_train)
x_test=pca1.transform(X_test)


In [115]:
# after applying pca we will perform DT
from sklearn.tree import DecisionTreeClassifier

In [116]:
dt=DecisionTreeClassifier()
dt.fit(x_train,y_train)
predicted=dt.predict(x_test)
confusion_matrix(predicted,y_test)
accuracy_score(predicted,y_test)

0.5414310081694945

In [117]:
x_train

array([[-13508.57830383],
       [ 17800.41995119],
       [ -4680.58099822],
       ...,
       [  8640.41941836],
       [ 22238.41968116],
       [ -5762.58069883]])

In [118]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=1)
X_train_lda = lda.fit_transform(X_train, y_train)  # Assuming you have y_train as the target variable
X_test_lda = lda.transform(X_test)

In [119]:
X_test_lda

array([[ 1.61114681],
       [ 0.83883922],
       [-1.57521285],
       ...,
       [ 0.34107678],
       [-1.59082752],
       [-1.13560356]])

In [120]:
coefficients = lda.coef_[0]  # Assuming there's only one LD for binary classification

# Sort the coefficients in descending order and get the indices of the top 10 coefficients
top_10_indices = coefficients.argsort()[::-1][:10]

# Get the top 10 coefficients and their corresponding feature names (if available)
top_10_coefficients = coefficients[top_10_indices]
feature_names=X.columns
top_10_feature_names = [feature_names[i] for i in top_10_indices]
f=[]
print("Top 10 Coefficients:")
for feature_name, coef in zip(top_10_feature_names, top_10_coefficients):
    print(f"{feature_name}: {coef}")
    f.append(feature_name)

Top 10 Coefficients:
gender_M: 2.539621524208546
tartar_Y: 0.3509523668003667
dental caries: 0.34026297365882474
hemoglobin: 0.12403649692142917
Urine protein: 0.04588936026628015
height(cm): 0.02520074126213541
hearing(right): 0.020539906253755034
relaxation: 0.01086635327048072
Gtp: 0.006131063461831262
triglyceride: 0.005457304051942648


In [191]:
f.append("gender_M")


In [192]:
f.

SyntaxError: invalid syntax (3717109820.py, line 1)

In [197]:
from sklearn.preprocessing import StandardScaler
X1=df4[f]
y1=df4['smoking']
# Split the data into training and testing sets (e.g., 80% train, 20% test)
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler()

# Resample the data to balance the class distribution
X_resampled1, y_resampled1 = undersampler.fit_resample(X1, y1)

X_train, X_test, y_train, y_test = train_test_split(X_resampled1, y_resampled1, test_size=0.2, random_state=10)

# Optionally, perform feature scaling if needed
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Create and train the Logistic Regression model
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)

# Step 4: Train Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)

# Step 5: Make Predictions
# Predict using both models
logistic_predictions = lr.predict(X_test_scaled)
rf_prediction1=rf_model.predict(X_train)
rf_predictions = rf_model.predict(X_test)

# Step 6: Evaluate the Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the Logistic Regression model
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
#logistic_precision = precision_score(y_test, logistic_predictions)
#logistic_recall = recall_score(y_test, logistic_predictions)
#logistic_f1 = f1_score(y_test, logistic_predictions)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_train, rf_prediction1)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)

# Print the evaluation metrics for both models
print("Logistic Regression:")
print("Accuracy:", logistic_accuracy)
print("Precision:", logistic_precision)
print("Recall:", logistic_recall)
print("F1-score:", logistic_f1)

print("\nRandom Forest:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-score:", rf_f1)


Logistic Regression:
Accuracy: 0.7685709661757566
Precision: 0.7004362578768784
Recall: 0.9334625322997416
F1-score: 0.8003323179174744

Random Forest:
Accuracy: 1.0
Precision: 0.7576413308087638
Recall: 0.9047157622739018
F1-score: 0.8246724569409687


In [196]:

X1

Unnamed: 0,tartar_Y,dental caries,hemoglobin,Urine protein,height(cm),hearing(right),relaxation,Gtp,triglyceride,serum creatinine,systolic,gender_M
0,1,0,12.9,1.0,155,1.0,73.0,27.0,82.0,0.7,114.0,0
1,1,0,12.7,1.0,160,1.0,70.0,18.0,115.0,0.6,119.0,0
2,0,0,15.8,1.0,170,1.0,86.0,22.0,182.0,1.0,138.0,1
3,1,0,14.7,1.0,165,1.0,60.0,18.0,254.0,1.0,100.0,1
4,0,0,12.5,1.0,155,1.0,74.0,22.0,74.0,0.6,120.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
55686,1,0,12.6,1.0,150,1.0,60.0,14.0,53.0,0.8,102.0,0
55687,1,1,12.3,1.0,170,1.0,68.0,10.0,99.0,0.6,110.0,0
55688,1,0,14.0,1.0,160,1.0,62.0,14.0,69.0,0.9,101.0,0
55690,0,0,14.4,1.0,165,1.0,76.0,18.0,79.0,0.7,133.0,1


In [164]:
import matplotlib.pyplot as plt


AttributeError: module 'matplotlib' has no attribute 'get_data_path'

In [198]:
import pickle
pickle.dump(lr,open('md.pkl','wb'))

In [None]:
X