In [3]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn: model selection
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    learning_curve,
    train_test_split,
    validation_curve,
)

# Scikit-learn: preprocessing
from sklearn.preprocessing import (
    Binarizer,
    KBinsDiscretizer,
    LabelEncoder,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    RobustScaler,
    StandardScaler,
)

# Scikit-learn: pipeline & composition
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_selector, make_column_transformer

# Scikit-learn: imputation
from sklearn.impute import SimpleImputer

# Scikit-learn: feature selection
from sklearn.feature_selection import (
    RFECV,
    RFE,
    SelectFromModel,
    SelectKBest,
    VarianceThreshold,
    chi2,
)

# Scikit-learn: models
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    IsolationForest,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)

# Scikit-learn: decomposition
from sklearn.decomposition import PCA

# Scikit-learn: metrics
from sklearn.metrics import (
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
)

In [None]:
# Load the COVID-19 dataset and show a first overview of its contents.
data = pd.read_csv("../data/covid-19.csv", encoding="latin1")
print(data.shape)
display(data.describe(), data)

# Distribution of the exam-result column (the counts of every value are
# printed, not only the positive one).
exam_result_counts = data["SARS-Cov-2 exam result"].value_counts()
print(f"positive {exam_result_counts}")

# A column lands in Nan_columns when it holds fewer than MIN_FILLED_ROWS
# non-missing values, i.e. when its NaN count exceeds n_rows - MIN_FILLED_ROWS.
# NOTE(review): 558 equals the number of positive results printed above --
# presumably intentional; confirm.
MIN_FILLED_ROWS = 558
missing_per_column = data.isna().sum()
too_many_missing = missing_per_column > len(data) - MIN_FILLED_ROWS
Nan_columns = data.columns[too_many_missing]

# Report those columns by 1-based position (get_loc is 0-based, hence the + 1).
nan_column_positions = [data.columns.get_loc(label) + 1 for label in Nan_columns]
print(nan_column_positions)

# Cell output: number of missing values in the column at 0-based position 7
# (note this index is 0-based, unlike the 1-based positions printed above).
missing_per_column.iloc[7]

(5644, 111)


Unnamed: 0,Patient age quantile,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,...,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),Arteiral Fio2,Phosphor,ctO2 (arterial blood gas analysis)
count,5644.0,5644.0,5644.0,5644.0,603.0,603.0,602.0,599.0,602.0,602.0,...,27.0,27.0,27.0,27.0,27.0,27.0,27.0,20.0,20.0,27.0
mean,9.318391,0.013997,0.008859,0.007264,-2.187396e-09,-1.598342e-08,-3.820598e-10,7.373957e-09,8.416944e-09,-7.863787e-09,...,-1.851852e-10,8.37037e-09,-1.703704e-09,3.333333e-10,-7.444444e-09,6e-09,-2.481481e-08,4.65e-09,6.25e-09,5.185185e-09
std,5.777903,0.117489,0.093713,0.084929,1.00083,1.00083,1.000832,1.000836,1.000832,1.000832,...,1.019049,1.019049,1.019049,1.019049,1.019049,1.019049,1.019049,1.025978,1.025978,1.019049
min,0.0,0.0,0.0,0.0,-4.50142,-4.345603,-2.552426,-2.457575,-3.970608,-1.86507,...,-1.99956,-1.244817,-3.082674,-3.568877,-2.925618,-2.985592,-1.175907,-1.532932,-1.480526,-2.900254
25%,4.0,0.0,0.0,0.0,-0.5188074,-0.5862439,-0.6053457,-0.6624832,-0.5679496,-0.7307069,...,-1.122574,-0.5348102,-0.3308668,-0.09210583,-0.511772,-0.5397211,-0.8169898,-0.1214975,-0.5527296,-0.4852787
50%,9.0,0.0,0.0,0.0,0.05340703,0.04031596,-0.121716,-0.1015171,0.01385207,-0.01426696,...,0.2677689,-0.2120799,-0.01181667,0.2942021,0.07743482,0.05633191,-0.1599549,-0.01174366,-0.1381825,0.1826928
75%,14.0,0.0,0.0,0.0,0.7171751,0.729532,0.5314981,0.6838353,0.6661759,0.5976919,...,0.7383496,0.02305214,0.6661649,0.5115003,0.4385609,0.50851,0.450009,-0.01174366,0.2763648,0.5937525
max,19.0,1.0,1.0,1.0,2.662704,2.671868,9.532034,3.713052,3.645706,3.7641,...,1.337265,3.236524,1.703078,1.042674,1.940087,2.029471,2.205371,2.841856,2.86235,1.826932


Unnamed: 0,Patient ID,Patient age quantile,SARS-Cov-2 exam result,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,...,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),Arteiral Fio2,Phosphor,ctO2 (arterial blood gas analysis)
0,44477f75e8169d2,13,negative,0,0,0,,,,,...,,,,,,,,,,
1,126e9dd13932f68,17,negative,0,0,0,0.236515,-0.022340,-0.517413,0.010677,...,,,,,,,,,,
2,a46b4402a0e5696,8,negative,0,0,0,,,,,...,,,,,,,,,,
3,f7d619a94f97c45,5,negative,0,0,0,,,,,...,,,,,,,,,,
4,d9e41465789c2b5,15,negative,0,0,0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5639,ae66feb9e4dc3a0,3,positive,0,0,0,,,,,...,,,,,,,,,,
5640,517c2834024f3ea,17,negative,0,0,0,,,,,...,,,,,,,,,,
5641,5c57d6037fe266d,4,negative,0,0,0,,,,,...,,,,,,,,,,
5642,c20c44766f28291,10,negative,0,0,0,,,,,...,,,,,,,,,,


positive SARS-Cov-2 exam result
negative    5086
positive     558
Name: count, dtype: int64
[21, 28, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]


5041