In [37]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Model
from sklearn.linear_model import LogisticRegression

from helpers import get_score

### Ładujemy z pliku dataset

In [38]:
data_train = pd.read_csv("data/train_data.csv")
data_test = pd.read_csv("data/test_data.csv")
cardio_data.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,glucose,lifestyle,is_cardio_ill
0,19546,158.0,76.0,150,90,female,normal,normal,,1
1,17650,169.0,72.0,120,80,male,normal,way above normal,active,0
2,18960,157.0,53.0,120,70,female,normal,normal,active,0
3,21213,154.0,62.0,150,90,male,way above normal,way above normal,active,0
4,18491,165.0,65.0,120,80,female,normal,normal,active,0


### Co przewidujemy?

In [39]:
data_train.is_cardio_ill.value_counts()

1    10507
0    10493
Name: is_cardio_ill, dtype: int64

Widzimy, że klasy są raczej zbalansowane, więc możemy tu użyć **accuracy_score**.

### TODO: Drop missing values there

### Co będzie, jeśli od razu coś wytrenujemy?

Skoro nasza regresja logistyczna nie działa z wartościami tekstowymi, na razie nie bierzmy ich pod uwagę.

In [40]:
numeric_cardio_data = cardio_data.drop(
    ['cholesterol', 'glucose', 'lifestyle', "gender"],
    axis=1
)

In [41]:
numeric_cardio_data.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,is_cardio_ill
0,19546,158.0,76.0,150,90,1
1,17650,169.0,72.0,120,80,0
2,18960,157.0,53.0,120,70,0
3,21213,154.0,62.0,150,90,0
4,18491,165.0,65.0,120,80,0


In [44]:
train_X, val_X, train_y, val_y = train_test_split(
    numeric_cardio_data.drop("is_cardio_ill", axis=1),
    numeric_cardio_data['is_cardio_ill'],
    test_size=0.2
)

In [45]:
baseline_model = LogisticRegression()

baseline_model.fit(train_X, train_y)
baseline_preds = baseline_model.predict(test_X)

acc = accuracy_score(test_y, baseline_preds)
print(f"Accuracy score: {round(100 * acc, 2)} %")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Not bad, but can we do better?

<img src="https://i.pinimg.com/736x/a8/ef/58/a8ef58e480da3676dc81f0ffc191807b--swimmer-girl-problems-jeep-humor.jpg" width=400 height=400 />

# Exploratory Data Analysis (EDA)

In [None]:
# TODO: What is it and how it is used?

### Jakie widzimy tutaj typy atrybutów?

In [2]:
cardio_data.dtypes

NameError: name 'cardio_data' is not defined

In [None]:
numeric_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical_features = ["cholesterol", "glucose", "lifestyle"]

In [None]:
cardio_data.info()

In [None]:
cardio_data.describe()

### # TODO: handle missing values

### Handling errors in data

In [3]:
cardio_data['age_in_years'] = cardio_data.age // 365

NameError: name 'cardio_data' is not defined

In [None]:
cardio_data.age_in_years.hist()
print(f"Max age is {cardio_data.age_in_years.max()}")
print(f"Minimum age is {cardio_data.age_in_years.min()}")

In [None]:
def info_num_feature(series: pd.Series):
    
    print(series.name)
    print(f"Max: {series.max()}")
    print(f"Min: {series.min()}")
    print()

In [None]:
for feature_name in numeric_features:
    info_num_feature(cardio_data[feature_name])

Widzimy, że w przypadku atrybutów z ciśnieniem mamy pewne błędy. Przyjrzyjmy się im bardziej szczegółowo:

In [4]:
print(f"Minimal systolic pressure is : {cardio_data.ap_hi.min()}")
print(f"Minimal diastolic pressure is : {cardio_data.ap_lo.min()}")

cardio_data.ap_hi = cardio_data.ap_hi.apply(np.abs)
cardio_data.ap_lo = cardio_data.ap_lo.apply(np.abs)

print(f"Minimal systolic pressure is: {cardio_data.ap_hi.min()}")
print(f"Minimal diastolic pressure is : {cardio_data.ap_lo.min()}")

NameError: name 'cardio_data' is not defined

In [None]:
### Removing outliers

In [None]:
SYSTOLIC_THR_MAX = 230
DIASTOLIC_THR_MAX = 150

SYSTOLIC_THR_MIN = 80
DIASTOLIC_THR_MIN = 40

In [None]:
bad_systolic_data = cardio_data[(cardio_data.ap_hi > SYSTOLIC_THR_MAX) | (cardio_data.ap_hi < SYSTOLIC_THR_MIN)]
cardio_data.drop(bad_systolic_data.index, inplace=True)

bad_diastolic_data = cardio_data[(cardio_data.ap_lo > DIASTOLIC_THR_MAX) | (cardio_data.ap_lo < DIASTOLIC_THR_MIN)]
cardio_data.drop(bad_diastolic_data.index, inplace=True)

### Removing pressure data errors

In [None]:
invalid_pressure = cardio_data[cardio_data.ap_lo >= cardio_data.ap_hi]
cardio_data.drop(invalid_pressure.index, inplace=True)

In [None]:
numeric_data = cardio_data[['age_in_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'cardio']]
numeric_data

In [None]:
train_X, test_X, train_y, test_y = train_test_split(
    numeric_data.drop('cardio', axis=1),
    numeric_data['cardio']
)

In [None]:
get_score(train_X, test_X, train_y, test_y)

In [None]:
### Categorical variables

In [None]:
cat_data = cardio_data[categorical_features]
cat_data.info()

#### Ordinal encoding

In [None]:
ord_encoder = OrdinalEncoder()

cat_data1 = cat_data.copy()
ordinal_cols = ['cholesterol', 'glucose']

# applying
cat_data1[ordinal_cols] = ord_encoder.fit_transform(cat_data[ordinal_cols])

#### One Hot Encoding

In [None]:
# TODO: Make it better!!!

oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

oh_columns = ['lifestyle']

oh_data = pd.DataFrame(oh_encoder.fit_transform(cat_data1[oh_columns]))

# bring back the index
oh_data.index = cat_data1.index
oh_data

In [None]:
# TODO: Wróć do podziału na train i test, w czym jest problem? (Walidacja krzyżowa)