# Data cleaning and preparation

In [None]:
###### Pre-treatment
# Import Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
# Loading the drive
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
#Importing the data file
file_path = '/content/drive/MyDrive/sampled_data21.csv'
data = pd.read_csv(file_path, sep=';')

In [None]:
#Check if the data have been imported correctly
print(data.head())

   Unnamed: 0  hyperthyroidism  High_Blood_Pression  High_Cholesterol  \
0           1              0.0                  1.0               1.0   
1           2              0.0                  0.0               0.0   
2           3              0.0                  0.0               1.0   
3           4              0.0                  0.0               0.0   
4           5              0.0                  1.0               1.0   

   Cholesterol_Check  Body_Mass_Index  Smoking  Stroke  Heart_Disease  \
0                1.0             34.0      0.0     0.0            0.0   
1                1.0             46.0      0.0     0.0            0.0   
2                1.0             28.0      0.0     0.0            0.0   
3                1.0             24.0      1.0     0.0            0.0   
4                1.0             31.0      1.0     0.0            1.0   

   Physical_Activity  ...  Alcohol  Health_care  General_Health  \
0                1.0  ...      0.0          1.0        

In [None]:
print(data.describe())


         Unnamed: 0  hyperthyroidism  High_Blood_Pression  High_Cholesterol  \
count  20000.000000     20000.000000         20000.000000      20000.000000   
mean   10000.500000         0.135550             0.425600          0.422550   
std     5773.647028         0.342319             0.494446          0.493977   
min        1.000000         0.000000             0.000000          0.000000   
25%     5000.750000         0.000000             0.000000          0.000000   
50%    10000.500000         0.000000             0.000000          0.000000   
75%    15000.250000         0.000000             1.000000          1.000000   
max    20000.000000         1.000000             1.000000          1.000000   

       Cholesterol_Check  Body_Mass_Index       Smoking        Stroke  \
count       20000.000000     20000.000000  20000.000000  20000.000000   
mean            0.962550        28.353250      0.443900      0.041350   
std             0.189867         6.538004      0.496855      0.199103

In [None]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           20000 non-null  int64  
 1   hyperthyroidism      20000 non-null  float64
 2   High_Blood_Pression  20000 non-null  float64
 3   High_Cholesterol     20000 non-null  float64
 4   Cholesterol_Check    20000 non-null  float64
 5   Body_Mass_Index      20000 non-null  float64
 6   Smoking              20000 non-null  float64
 7   Stroke               20000 non-null  float64
 8   Heart_Disease        20000 non-null  float64
 9   Physical_Activity    20000 non-null  float64
 10  Fruits               20000 non-null  float64
 11  Vegetables           20000 non-null  float64
 12  Alcohol              20000 non-null  float64
 13  Health_care          20000 non-null  float64
 14  General_Health       20000 non-null  float64
 15  Mental_Health        20000 non-null 

In [None]:
# Check of missing values
data.isna().sum()


Unnamed: 0,0
Unnamed: 0,0
hyperthyroidism,0
High_Blood_Pression,0
High_Cholesterol,0
Cholesterol_Check,0
Body_Mass_Index,0
Smoking,0
Stroke,0
Heart_Disease,0
Physical_Activity,0


In [None]:
# There is no missing values

In [None]:
# Count of the number of hyperthyroidism case in the dataset
data["hyperthyroidism"].value_counts()

Unnamed: 0_level_0,count
hyperthyroidism,Unnamed: 1_level_1
0.0,17289
1.0,2711


In [None]:
# There is a low proportion of sick invididuals in the dataset

In [None]:
print(data.dtypes)

Unnamed: 0               int64
hyperthyroidism        float64
High_Blood_Pression    float64
High_Cholesterol       float64
Cholesterol_Check      float64
Body_Mass_Index        float64
Smoking                float64
Stroke                 float64
Heart_Disease          float64
Physical_Activity      float64
Fruits                 float64
Vegetables             float64
Alcohol                float64
Health_care            float64
General_Health         float64
Mental_Health          float64
Physical_Health        float64
Diff_Walk              float64
Sex                    float64
Age                    float64
Education              float64
Income                 float64
dtype: object


# Data modelling and evaluations

In [None]:
# Separation of the output variable from the features
X = data.drop('hyperthyroidism', axis=1)
y = data['hyperthyroidism']


In [None]:
# Divide data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.87525
Confusion Matrix:
[[3432   57]
 [ 442   69]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.98      0.93      3489
         1.0       0.55      0.14      0.22       511

    accuracy                           0.88      4000
   macro avg       0.72      0.56      0.57      4000
weighted avg       0.84      0.88      0.84      4000



In [None]:
# The low proportion of sick individuals limits the predictive capacity of the model, which predicts a large number of false negatives.
# We will therefore resample the dataset to obtain an equivalent number of healthy and sick individuals.

In [None]:
# resampling of the data using SMOTE
oversampler = SMOTE(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

print(pd.Series(y_resampled).value_counts())


hyperthyroidism
0.0    17289
1.0    17289
Name: count, dtype: int64


In [None]:
# By resampling the dataset using SMOTE, which generates examples based on existing ones, we obtain an equal number of sick and healthy people.
# We will now test different algorithms to determine which one best predicts that someone is positive for hyperthyroidism on the basis of the characteristics observed.

In [None]:
# Divide the data into training and test sets with the data resampled
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Random forest Initialisation and training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:
# Predict on the test set
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.7524580682475419
Confusion Matrix:
[[3452   17]
 [1695 1752]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.67      1.00      0.80      3469
         1.0       0.99      0.51      0.67      3447

    accuracy                           0.75      6916
   macro avg       0.83      0.75      0.74      6916
weighted avg       0.83      0.75      0.74      6916



In [None]:
# Logistic Regression

# Initialise and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.6925968768074031
Confusion Matrix:
[[2364 1105]
 [1021 2426]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.68      0.69      3469
         1.0       0.69      0.70      0.70      3447

    accuracy                           0.69      6916
   macro avg       0.69      0.69      0.69      6916
weighted avg       0.69      0.69      0.69      6916



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Classification K-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
# Initialise and train the model
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.8445633314054367
Confusion Matrix:
[[2831  638]
 [ 437 3010]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.82      0.84      3469
         1.0       0.83      0.87      0.85      3447

    accuracy                           0.84      6916
   macro avg       0.85      0.84      0.84      6916
weighted avg       0.85      0.84      0.84      6916



In [None]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Initialise and train the model
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.9264025448235974
Confusion Matrix:
[[3351  118]
 [ 391 3056]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93      3469
         1.0       0.96      0.89      0.92      3447

    accuracy                           0.93      6916
   macro avg       0.93      0.93      0.93      6916
weighted avg       0.93      0.93      0.93      6916



We observed that the Gradient Boosting and Random Forest models performed well and similarly, but it was the Gradient Boosting model that performed best, with a correct prediction rate of almost 93%. It also has the highest f1-score when combining recall and accuracy for both sick and healthy individuals.

**I therefore recommend that the Pitié Salpêtrière Hospital use this Gradient Boosting model** to be able to predict that someone will be positive for hyperthyroidism based on observed attributes.