# Heart Attack Prediction

Pour ce qui est du pre-traitement des donnees, je remarque deja que la plupart des colonnes sont des colonnes sont deja normalisees ie se retrouvent entre 0 et 1, ce qui est recommandé lorsqu'il faut entrainer un modele de ML ou de DL je vais donc normaliser les autres colonnes dans un premier temps, encoder certaines variables qualitatives, selectionner les colonnes les plus importantes pour entrainer notre modele (ce qui ne se fera pas lors de ce travail), et peut etre bine verifier s'il ya des outliers.

## 2. Pre-processing

In [1]:
import pandas as pd


data = pd.read_csv("./Heart_attack_clean.csv")
df = data.copy()

In [2]:
df = df.drop('Unnamed: 0', axis = True)

In [3]:
df.head()

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Heart Attack Risk,Systolic Pressure,Diastolic Pressure
0,67,Male,208,1.795455,72,0,0,1,0,0,...,261404,31.251233,286,0,6,Argentina,South America,0,158,88
1,21,Male,389,1.774194,98,1,1,1,1,1,...,285768,27.194973,235,1,7,Canada,North America,0,165,93
2,21,Female,324,1.757576,72,1,0,0,0,0,...,235282,28.176571,587,4,4,France,Europe,0,174,99
3,84,Male,383,1.63,73,1,1,1,0,1,...,125640,36.464704,378,3,4,Canada,North America,0,163,100
4,66,Male,318,1.034091,93,1,1,1,1,0,...,160555,21.809144,231,1,5,Thailand,Asia,0,91,88


In [4]:
df['Diet'].value_counts()

Diet
Healthy      2960
Average      2912
Unhealthy    2891
Name: count, dtype: int64

## TrainTest - Nettoyage - Encodage

In [5]:
# df['Sex'] = df['Sex'].map({'Male': 1, 'Female' : 0})
# df['Diet'] = df['Diet'].map({"Average" : 2, "Unhealthy" : 0, "Healthy" : 1})
# df['Continent'] = df['Continent'].map({'South America':1, 'North America':2, 'Europe':3, 'Asia':4, 'Africa':5, 'Australia':6})

In [6]:
df.shape

(8763, 26)

In [7]:
from sklearn.model_selection import train_test_split


trainset, testset = train_test_split(df, test_size = 0.2, random_state = 0)

In [8]:
trainset['Heart Attack Risk'].value_counts()

Heart Attack Risk
0    4510
1    2500
Name: count, dtype: int64

In [9]:
testset['Heart Attack Risk'].value_counts()

Heart Attack Risk
0    1114
1     639
Name: count, dtype: int64

In [10]:
for col in df.select_dtypes('object'):
    print(f'{col :-<50} {df[col].unique()}')

Sex----------------------------------------------- ['Male' 'Female']
Diet---------------------------------------------- ['Average' 'Unhealthy' 'Healthy']
Country------------------------------------------- ['Argentina' 'Canada' 'France' 'Thailand' 'Germany' 'Japan' 'Brazil'
 'South Africa' 'United States' 'Vietnam' 'China' 'Italy' 'Spain' 'India'
 'Nigeria' 'New Zealand' 'South Korea' 'Australia' 'Colombia'
 'United Kingdom']
Continent----------------------------------------- ['South America' 'North America' 'Europe' 'Asia' 'Africa' 'Australia']


In [11]:
df.head()

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Heart Attack Risk,Systolic Pressure,Diastolic Pressure
0,67,Male,208,1.795455,72,0,0,1,0,0,...,261404,31.251233,286,0,6,Argentina,South America,0,158,88
1,21,Male,389,1.774194,98,1,1,1,1,1,...,285768,27.194973,235,1,7,Canada,North America,0,165,93
2,21,Female,324,1.757576,72,1,0,0,0,0,...,235282,28.176571,587,4,4,France,Europe,0,174,99
3,84,Male,383,1.63,73,1,1,1,0,1,...,125640,36.464704,378,3,4,Canada,North America,0,163,100
4,66,Male,318,1.034091,93,1,1,1,1,0,...,160555,21.809144,231,1,5,Thailand,Asia,0,91,88


In [12]:
df.dtypes.value_counts()

int64      18
object      4
float64     4
Name: count, dtype: int64

In [13]:
def preprocessing(df):
    X = df.drop('Heart Attack Risk', axis = 1)
    y = df['Heart Attack Risk']
    print(y.value_counts())
    
    return X, y

In [14]:
X_train, y_train = preprocessing(trainset)

Heart Attack Risk
0    4510
1    2500
Name: count, dtype: int64


In [15]:
X_test, y_test = preprocessing(testset)

Heart Attack Risk
0    1114
1     639
Name: count, dtype: int64


### Encodage

In [34]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [17]:
df.select_dtypes(object).columns

Index(['Sex', 'Diet', 'Country', 'Continent'], dtype='object')

In [18]:
ohe = OneHotEncoder()
ohe.fit(df[['Sex', 'Diet', 'Continent', 'Country']])

In [19]:
ohe.categories_

[array(['Female', 'Male'], dtype=object),
 array(['Average', 'Healthy', 'Unhealthy'], dtype=object),
 array(['Africa', 'Asia', 'Australia', 'Europe', 'North America',
        'South America'], dtype=object),
 array(['Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'Colombia',
        'France', 'Germany', 'India', 'Italy', 'Japan', 'New Zealand',
        'Nigeria', 'South Africa', 'South Korea', 'Spain', 'Thailand',
        'United Kingdom', 'United States', 'Vietnam'], dtype=object)]

In [20]:
columns_trans = make_column_transformer((OneHotEncoder(categories = ohe.categories_), ['Sex', 'Diet', 'Continent', 'Country']),
                                        remainder = 'passthrough')

In [21]:
lr = LinearRegression()
pipe = make_pipeline(columns_trans, lr)

In [28]:
pipe.fit(X_train, y_train)

In [31]:
y_pred = pipe.predict(X_test)

#### Modelisation

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
model = DecisionTreeClassifier(random_state = 0)

#### Procedure d'evaluation

In [25]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [26]:
def evaluation(model):
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)
    
    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))

In [27]:
evaluation(model)

ValueError: could not convert string to float: 'Male'

In [None]:
# df.loc[df['Diabetes'] == 'Male']

In [None]:
model.feature_importances_

AttributeError: 'DecisionTreeClassifier' object has no attribute 'tree_'