In [438]:
import numpy as np
import pandas as pd

In [439]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

from sklearn.linear_model import  LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

In [440]:
df = pd.read_csv('https://raw.githubusercontent.com/MachineLearnia/Python-Machine-Learning/master/Dataset/dataset.csv',encoding='ISO-8859-1')

In [441]:
df.head()[:100]

Unnamed: 0,Patient ID,Patient age quantile,SARS-Cov-2 exam result,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,...,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),Arteiral Fio2,Phosphor,ctO2 (arterial blood gas analysis)
0,44477f75e8169d2,13,negative,0,0,0,,,,,...,,,,,,,,,,
1,126e9dd13932f68,17,negative,0,0,0,0.236515,-0.02234,-0.517413,0.010677,...,,,,,,,,,,
2,a46b4402a0e5696,8,negative,0,0,0,,,,,...,,,,,,,,,,
3,f7d619a94f97c45,5,negative,0,0,0,,,,,...,,,,,,,,,,
4,d9e41465789c2b5,15,negative,0,0,0,,,,,...,,,,,,,,,,


In [442]:
variables_infos = {}
for column in df.columns:
    variables_infos[str(df[column].dtype)] = []

In [443]:
for column in df.columns:
    if df[column].nunique() == 0 or df[column].nunique() == 1:
        df.drop(columns=column,axis=1,inplace=True)
        print('La colonne', column, 'a été supprimée.' + '\n')
    else:
        variables_infos[str(df[column].dtype)].append(column)

for elements in variables_infos:
    print(elements + '\n' + ':')
    for variables in variables_infos[elements]:
        print('La colonne ' + variables + ' possède ' + str(df[variables].nunique()) + ' modalités :' + str(df[variables].unique()[:20]) + '\n')


La colonne Mycoplasma pneumoniae a été supprimée.

La colonne Parainfluenza 2 a été supprimée.

La colonne Fio2 (venous blood gas analysis) a été supprimée.

La colonne Myeloblasts a été supprimée.

La colonne Urine - Nitrite a été supprimée.

La colonne Urine - Sugar a été supprimée.

La colonne Urine - Hyaline cylinders a été supprimée.

La colonne Urine - Granular cylinders a été supprimée.

La colonne Urine - Yeasts a été supprimée.

La colonne Partial thromboplastin time (PTT)  a été supprimée.

La colonne Prothrombin time (PT), Activity a été supprimée.

La colonne D-Dimer a été supprimée.

object
:
La colonne Patient ID possède 5644 modalités :['44477f75e8169d2' '126e9dd13932f68' 'a46b4402a0e5696' 'f7d619a94f97c45'
 'd9e41465789c2b5' '75f16746216c4d1' '2a2245e360808d7' '509197ec73f1400'
 '8bb9d64f0215244' '5f1ed301375586c' 'd720464cc322b6f' '02c84e61d5f9e18'
 'b68a6f32fd88a49' '0fbafd910aa8078' 'f9056545d1296ec' '6c9d3323975b082'
 'b62e45a8a3f683e' '99da3c055438133' 'd3ea751f3db

In [444]:
variables_infos['object']

['Patient ID',
 'SARS-Cov-2 exam result',
 'Respiratory Syncytial Virus',
 'Influenza A',
 'Influenza B',
 'Parainfluenza 1',
 'CoronavirusNL63',
 'Rhinovirus/Enterovirus',
 'Coronavirus HKU1',
 'Parainfluenza 3',
 'Chlamydophila pneumoniae',
 'Adenovirus',
 'Parainfluenza 4',
 'Coronavirus229E',
 'CoronavirusOC43',
 'Inf A H1N1 2009',
 'Bordetella pertussis',
 'Metapneumovirus',
 'Influenza B, rapid test',
 'Influenza A, rapid test',
 'Strepto A',
 'Urine - Esterase',
 'Urine - Aspect',
 'Urine - pH',
 'Urine - Hemoglobin',
 'Urine - Bile pigments',
 'Urine - Ketone Bodies',
 'Urine - Urobilinogen',
 'Urine - Protein',
 'Urine - Leukocytes',
 'Urine - Crystals',
 'Urine - Color']

In [445]:
df.drop(columns='Patient ID',axis=1,inplace=True)
variables_infos['object'].remove('SARS-Cov-2 exam result')

In [446]:
df['Urine - pH'].unique()


array([nan, '6.5', '6.0', 'Não Realizado', '5.0', '7.0', '5', '5.5',
       '7.5', '6', '8.0', '7'], dtype=object)

In [447]:
df['Urine - Leukocytes'].unique()

array([nan, '38000', '5942000', '32000', '22000', '<1000', '3000',
       '16000', '7000', '5300', '1000', '4000', '5000', '10600', '6000',
       '2500', '2600', '23000', '124000', '8000', '29000', '2000',
       '624000', '40000', '3310000', '229000', '19000', '28000', '10000',
       '4600', '77000', '43000'], dtype=object)

In [448]:
df['Urine - Leukocytes'] =  df['Urine - Leukocytes'].replace('<1000',float(500))
df['Urine - pH'] = df['Urine - pH'].replace('Não Realizado', np.nan)

In [449]:
df['Urine - Leukocytes'].unique()

array([nan, '38000', '5942000', '32000', '22000', 500.0, '3000', '16000',
       '7000', '5300', '1000', '4000', '5000', '10600', '6000', '2500',
       '2600', '23000', '124000', '8000', '29000', '2000', '624000',
       '40000', '3310000', '229000', '19000', '28000', '10000', '4600',
       '77000', '43000'], dtype=object)

In [450]:
df['Urine - pH'].unique()

array([nan, '6.5', '6.0', '5.0', '7.0', '5', '5.5', '7.5', '6', '8.0',
       '7'], dtype=object)

In [451]:
for column in df.columns:
    try:
        df[column] = df[column].astype(float)
    except:
        pass

In [452]:
df.dtypes

Patient age quantile                                     float64
SARS-Cov-2 exam result                                    object
Patient addmited to regular ward (1=yes, 0=no)           float64
Patient addmited to semi-intensive unit (1=yes, 0=no)    float64
Patient addmited to intensive care unit (1=yes, 0=no)    float64
                                                          ...   
HCO3 (arterial blood gas analysis)                       float64
pO2 (arterial blood gas analysis)                        float64
Arteiral Fio2                                            float64
Phosphor                                                 float64
ctO2 (arterial blood gas analysis)                       float64
Length: 98, dtype: object

In [453]:
df.head()

Unnamed: 0,Patient age quantile,SARS-Cov-2 exam result,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,...,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),Arteiral Fio2,Phosphor,ctO2 (arterial blood gas analysis)
0,13.0,negative,0.0,0.0,0.0,,,,,,...,,,,,,,,,,
1,17.0,negative,0.0,0.0,0.0,0.236515,-0.02234,-0.517413,0.010677,0.102004,...,,,,,,,,,,
2,8.0,negative,0.0,0.0,0.0,,,,,,...,,,,,,,,,,
3,5.0,negative,0.0,0.0,0.0,,,,,,...,,,,,,,,,,
4,15.0,negative,0.0,0.0,0.0,,,,,,...,,,,,,,,,,


for elements in variables_infos:
    for variables in variables_infos[elements]:
        if elements == 'object':
            df.fillna(df.groupby('SARS-Cov-2 exam result')[variables].transform(lambda x: x.mode()[0]))
        if elements == 'int64':
            df.fillna(df.groupby('SARS-Cov-2 exam result')[variables].transform('mean'))

        else:
            df.fillna(df.groupby('SARS-Cov-2 exam result')[variables].transform('mean'))

In [454]:
categorical_features = variables_infos['object']
categorical_features.remove('Patient ID')

In [455]:
numerical_features = variables_infos['float64']

In [456]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())

])

In [457]:
categorical_transformer = Pipeline(steps=[

    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))

])

In [458]:
preprocessor = ColumnTransformer(transformers=[

    ('numeric', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)

])

In [459]:
classifiers = [

    ('Logistic Regression', LogisticRegression(max_iter=10000)),
    ('Random Forest', RandomForestClassifier())

]

In [460]:
params_grid = {
    'Logistic Regression': {
        'preprocessor__numeric__imputer__strategy': ['mean', 'median'],
        'classifiers__C': [0.1, 1, 10],
    },

    'Random Forest': {
        'preprocessor__numeric__imputer__strategy': ['mean', 'median'],
        'classifiers__n_estimators': [50, 100, 200],
        'classifiers__max_depth': [None, 10, 20]
    }
}

In [461]:
X = df.drop(columns='SARS-Cov-2 exam result',axis=1)
y = df['SARS-Cov-2 exam result']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

results = {}
predictions = {}
for classifier_name, classifier in tqdm(classifiers):

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifiers', classifier)
    ])

    model = GridSearchCV(pipeline, params_grid[classifier_name])
    model = model.fit(X_train,y_train)

    y_preds = model.predict(X_test)
    predictions[classifier_name] = y_preds

    results[classifier_name] = {'Meilleurs paramètres' : model.best_params_,
                                'Accuracy sur train' : model.best_score_,
                                'Accuracy sur test' : accuracy_score(y_test,predictions[classifier_name]),
                                'Meilleur modèle' : model.best_estimator_}




100%|██████████| 2/2 [00:47<00:00, 23.63s/it]


In [463]:
results['Logistic Regression']['Accuracy sur test']

0.9069973427812223

In [464]:
results['Random Forest']['Accuracy sur test']

0.9025686448184234

In [465]:
from sklearn.metrics import classification_report

In [467]:
print(classification_report(y_test, predictions['Logistic Regression']))

              precision    recall  f1-score   support

    negative       0.91      1.00      0.95      1018
    positive       0.71      0.09      0.16       111

    accuracy                           0.91      1129
   macro avg       0.81      0.54      0.56      1129
weighted avg       0.89      0.91      0.87      1129



In [468]:
print(classification_report(y_test, predictions['Random Forest']))

              precision    recall  f1-score   support

    negative       0.90      1.00      0.95      1018
    positive       1.00      0.01      0.02       111

    accuracy                           0.90      1129
   macro avg       0.95      0.50      0.48      1129
weighted avg       0.91      0.90      0.86      1129

