# Importation des librairies

In [12]:
# WARNINGS
import warnings
warnings.filterwarnings('ignore')

# NUMPY
import numpy as np

# STATS
import scipy.stats as stats
from scipy.stats import norm, skew

# MATPLOTLIB
import matplotlib as mlp
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight') 

# PANDAS
import pandas as pd 
pd.set_option("display.max_rows", None, "display.max_columns", None) 

# SEABORN
import seaborn as sns

# SCIKIT-LEARN: SELECTION DE VARIABLES

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# SCIKIT-LEARN: PRE-PROCESSING
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder # Encodage des variables catégorielles ordinales
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder # Encodage des variables catégorielles nominales
from sklearn.preprocessing import StandardScaler # Normalisation des variables numériques
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer # Imputation
from sklearn.impute import KNNImputer 

# SCIKIT-LEARN: MODELES
from sklearn.linear_model import LogisticRegression

# SCIKIT-LEARN: VALIDATION CROISEE + OPTIMISATION
from sklearn.model_selection import train_test_split # Séparer en données train et test
from sklearn.model_selection import cross_val_score # Validation croisée pour comparison entre modèles
from sklearn.model_selection import validation_curve # Courbe de validation : visulaisr des scores lors du choix d'un hyper-paramètre
from sklearn.model_selection import GridSearchCV # Tester plusieurs hyper-paramètres
from sklearn.model_selection import learning_curve # Courbe d'apprentissage : visualisation des scores du train et du validation sets en fonction des quanitiés des données
 
## EVALUATION
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# SCHIKIT-LEARN: PIPELINE AND TRANSFORMATEURll
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer


# Analyse exploratoire

## Chargement des données

In [13]:
# Load the raw CSV into `data` and keep it as read; `df_smoking_drinking`
# is an independent deep copy, so changes to it never touch `data`.
DATASET_PATH = 'smoking_drinking_dataset_Ver01.csv'
data = pd.read_csv(DATASET_PATH)
df_smoking_drinking = data.copy(deep=True)

## Description du dataset

### Informations globales

In [14]:
# Preview the first rows of the working copy.
df_smoking_drinking.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,80.0,99.0,193.0,48.0,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,Y
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,82.0,106.0,228.0,55.0,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,N
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,70.0,98.0,136.0,41.0,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,N
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,87.0,95.0,201.0,76.0,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,N
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,82.0,101.0,199.0,61.0,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,N


In [15]:
# Preview the last rows of the working copy.
df_smoking_drinking.tail()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
991341,Male,45,175,80,92.1,1.5,1.5,1.0,1.0,114.0,80.0,88.0,198.0,46.0,125.0,132.0,15.0,1.0,1.0,26.0,36.0,27.0,1.0,N
991342,Male,35,170,75,86.0,1.0,1.5,1.0,1.0,119.0,83.0,83.0,133.0,40.0,84.0,45.0,15.8,1.0,1.1,14.0,17.0,15.0,1.0,N
991343,Female,40,155,50,68.0,1.0,0.7,1.0,1.0,110.0,70.0,90.0,205.0,96.0,77.0,157.0,14.3,1.0,0.8,30.0,27.0,17.0,3.0,Y
991344,Male,25,175,60,72.0,1.5,1.0,1.0,1.0,119.0,74.0,69.0,122.0,38.0,73.0,53.0,14.5,1.0,0.8,21.0,14.0,17.0,1.0,N
991345,Male,50,160,70,90.5,1.0,1.5,1.0,1.0,133.0,79.0,99.0,225.0,39.0,153.0,163.0,15.8,1.0,0.9,24.0,43.0,36.0,3.0,Y


In [16]:
# Dimensions of the working copy as (number of rows, number of columns).
df_smoking_drinking.shape

(991346, 24)

In [17]:
# Column names, non-null counts and dtypes of the working copy.
# Inspect `df_smoking_drinking` (not the raw `data` frame) so this summary
# describes the same object as the surrounding cells.
df_smoking_drinking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991346 entries, 0 to 991345
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sex               991346 non-null  object 
 1   age               991346 non-null  int64  
 2   height            991346 non-null  int64  
 3   weight            991346 non-null  int64  
 4   waistline         991346 non-null  float64
 5   sight_left        991346 non-null  float64
 6   sight_right       991346 non-null  float64
 7   hear_left         991346 non-null  float64
 8   hear_right        991346 non-null  float64
 9   SBP               991346 non-null  float64
 10  DBP               991346 non-null  float64
 11  BLDS              991346 non-null  float64
 12  tot_chole         991346 non-null  float64
 13  HDL_chole         991346 non-null  float64
 14  LDL_chole         991346 non-null  float64
 15  triglyceride      991346 non-null  float64
 16  hemoglobin        99

### Type des variables

In [18]:
# Data type of each column of the working copy.
df_smoking_drinking.dtypes

sex                  object
age                   int64
height                int64
weight                int64
waistline           float64
sight_left          float64
sight_right         float64
hear_left           float64
hear_right          float64
SBP                 float64
DBP                 float64
BLDS                float64
tot_chole           float64
HDL_chole           float64
LDL_chole           float64
triglyceride        float64
hemoglobin          float64
urine_protein       float64
serum_creatinine    float64
SGOT_AST            float64
SGOT_ALT            float64
gamma_GTP           float64
SMK_stat_type_cd    float64
DRK_YN               object
dtype: object

In [19]:
# Number of columns for each data type.
df_smoking_drinking.dtypes.value_counts()

float64    19
int64       3
object      2
Name: count, dtype: int64

Variables explicatives du dataset : 
- sex : sexe de l'individu (Male, Female) [catégorielle]
- age : âge de l'individu (arrondi à 5 ans près) [continue]
- height : taille de l'individu (arrondi à 5 cm près) [cm] [continue]
- weight : poids [kg] [continue]
- waistline : tour de taille [continue]
- sight_left : acuité visuelle de l'œil gauche [continue]
- sight_right : acuité visuelle de l'œil droit [continue]
- hear_left : ouïe gauche, 1(normale), 2(anormale) [catégorielle]
- hear_right : ouïe droite, 1(normale), 2(anormale) [catégorielle]
- SBP : pression artérielle systolique [mmHg] [continue]
- DBP :  pression artérielle diastolique [mmHg] [continue]
- BLDS : glycémie à jeun [mg/dL] [continue]
- tot_chole : cholestérol total [mg/dL] [continue]
- HDL_chole : taux de cholestérol HDL ("bon" cholestérol) [mg/dL] [continue]
- LDL_chole : taux de cholestérol LDL ("mauvais" cholestérol) [mg/dL] [continue]
- triglyceride : triglycérides [mg/dL] [continue]
- hemoglobin : hémoglobine [g/dL] [continue]
- urine_protein : protéines dans les urines, 1(-), 2(+/-), 3(+1), 4(+2), 5(+3), 6(+4) [catégorielle ordinale]
- serum_creatinine : créatinine sanguine [mg/dL] [continue]
- SGOT_AST : SGOT (transaminase glutamique-oxaloacétique sérique), aussi appelée AST (aspartate aminotransférase) [IU/L] [continue]
- SGOT_ALT : ALT (alanine aminotransférase) [IU/L] [continue]
- gamma_GTP : γ-glutamyl transpeptidase [IU/L] [continue]

Variables cibles : 
- SMK_stat_type_cd : statut tabagique, 1 (n'a jamais fumé), 2 (ancien fumeur), 3 (fumeur actuel) [catégorielle]
- DRK_YN : consommateur d'alcool ou non, Y (oui), N (non) [catégorielle]

Conversions de type à effectuer :
- de int à float : age, height, weight
- de float à int : hear_left, hear_right, urine_protein

In [26]:
# Check whether the dataset contains duplicated observations.
# `duplicated()` flags each row that repeats an earlier row (the first
# occurrence is not flagged); the flagged rows are then displayed.
# For the count alone, sum the boolean mask instead.
is_duplicate_row = df_smoking_drinking.duplicated()
duplicate_df_smoking_drinking = df_smoking_drinking[is_duplicate_row]
duplicate_df_smoking_drinking

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
159911,Female,40,170,85,88.0,0.9,0.9,1.0,1.0,120.0,70.0,110.0,191.0,47.0,121.0,115.0,10.4,1.0,0.9,17.0,14.0,33.0,1.0,N
175152,Male,65,170,75,101.1,0.6,0.7,1.0,1.0,130.0,70.0,114.0,175.0,38.0,109.0,140.0,17.1,1.0,1.0,24.0,31.0,26.0,3.0,N
246305,Female,50,155,70,90.8,1.0,1.0,1.0,1.0,150.0,96.0,101.0,230.0,43.0,150.0,183.0,14.9,1.0,0.8,24.0,22.0,42.0,1.0,N
280830,Male,45,170,75,86.4,1.2,0.7,1.0,1.0,150.0,78.0,195.0,222.0,55.0,122.0,499.0,14.9,1.0,0.8,23.0,11.0,342.0,3.0,Y
284528,Female,65,150,55,86.0,0.9,0.9,1.0,1.0,120.0,65.0,99.0,228.0,62.0,139.0,136.0,11.9,1.0,0.7,27.0,18.0,14.0,1.0,N
290463,Female,20,160,50,70.0,1.0,1.0,1.0,1.0,106.0,68.0,76.0,154.0,45.0,98.0,56.0,12.7,1.0,0.8,18.0,13.0,11.0,1.0,N
335747,Male,50,180,95,101.0,1.5,1.5,1.0,2.0,177.0,111.0,141.0,184.0,43.0,120.0,105.0,15.0,3.0,0.9,19.0,27.0,61.0,3.0,Y
429596,Male,75,160,60,83.0,1.2,0.7,1.0,1.0,105.0,70.0,114.0,218.0,52.0,150.0,82.0,16.0,2.0,1.1,27.0,22.0,26.0,1.0,N
453451,Male,35,170,65,85.0,0.9,1.2,1.0,1.0,130.0,78.0,99.0,149.0,44.0,80.0,126.0,14.7,1.0,0.9,14.0,13.0,10.0,2.0,Y
471596,Female,45,165,65,82.0,1.0,1.0,1.0,1.0,120.0,80.0,87.0,178.0,64.0,103.0,53.0,13.6,1.0,0.5,17.0,19.0,28.0,1.0,N
