In [125]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [126]:
cardio_data = pd.read_csv('data/cardio_train.csv', delimiter=";")

In [127]:
messy_data = cardio_data[['age', 'height', 'weight', 'ap_hi', 'ap_lo']]

In [128]:
cardio_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [129]:
print(cardio_data[cardio_data.gender == 1].height.mean()) # female
print(cardio_data[cardio_data.gender == 1].weight.mean())

161.35561168460356
72.5656050955414


In [130]:
print(cardio_data[cardio_data.gender == 2].height.mean()) # male
print(cardio_data[cardio_data.gender == 2].weight.mean())

169.94789538210054
77.257306906416


In [131]:
gender_data = ["male" if gender == 2 else "female" for gender in cardio_data.gender]
messy_data['gender'] = gender_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [132]:
messy_data

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender
0,18393,168,62.0,110,80,male
1,20228,156,85.0,140,90,female
2,18857,165,64.0,130,70,female
3,17623,169,82.0,150,100,male
4,17474,156,56.0,100,60,female
...,...,...,...,...,...,...
69995,19240,168,76.0,120,80,male
69996,22601,158,126.0,140,90,female
69997,19066,183,105.0,180,90,male
69998,22431,163,72.0,135,80,female


In [133]:
def make_cat(num: int):
    if num == 1:
        return "normal"
    elif num == 2:
        return "above normal"
    elif num == 3:
        return "way above normal"

In [134]:
messy_data['cholesterol'] = cardio_data['cholesterol'].apply(make_cat)
messy_data['glucose'] = cardio_data['gluc'].apply(make_cat)

messy_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,glucose
0,18393,168,62.0,110,80,male,normal,normal
1,20228,156,85.0,140,90,female,way above normal,normal
2,18857,165,64.0,130,70,female,way above normal,normal
3,17623,169,82.0,150,100,male,normal,normal
4,17474,156,56.0,100,60,female,normal,normal
...,...,...,...,...,...,...,...,...
69995,19240,168,76.0,120,80,male,normal,normal
69996,22601,158,126.0,140,90,female,above normal,above normal
69997,19066,183,105.0,180,90,male,way above normal,normal
69998,22431,163,72.0,135,80,female,normal,above normal


In [135]:
def make_oh_cat(values: list):
    string_components = []
    if values[0] != 0:
        string_components.append("smoker")
    
    if values[1] != 0:
        string_components.append("alcoholic")
    
    if values[2] != 0:
        string_components.append("active")
    
    if string_components != []:
        final_string = ", ".join(string_components)
    else:
        final_string = np.nan
    
    return final_string

In [136]:
messy_data['is_cardio_ill'] = cardio_data['cardio']
messy_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,glucose,is_cardio_ill
0,18393,168,62.0,110,80,male,normal,normal,0
1,20228,156,85.0,140,90,female,way above normal,normal,1
2,18857,165,64.0,130,70,female,way above normal,normal,1
3,17623,169,82.0,150,100,male,normal,normal,1
4,17474,156,56.0,100,60,female,normal,normal,0
...,...,...,...,...,...,...,...,...,...
69995,19240,168,76.0,120,80,male,normal,normal,0
69996,22601,158,126.0,140,90,female,above normal,above normal,1
69997,19066,183,105.0,180,90,male,way above normal,normal,1
69998,22431,163,72.0,135,80,female,normal,above normal,1


In [137]:
messy_data['lifestyle'] = cardio_data[['smoke', 'alco', 'active']].apply(make_oh_cat, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [138]:
messy_data

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,glucose,is_cardio_ill,lifestyle
0,18393,168,62.0,110,80,male,normal,normal,0,active
1,20228,156,85.0,140,90,female,way above normal,normal,1,active
2,18857,165,64.0,130,70,female,way above normal,normal,1,
3,17623,169,82.0,150,100,male,normal,normal,1,active
4,17474,156,56.0,100,60,female,normal,normal,0,
...,...,...,...,...,...,...,...,...,...,...
69995,19240,168,76.0,120,80,male,normal,normal,0,"smoker, active"
69996,22601,158,126.0,140,90,female,above normal,above normal,1,active
69997,19066,183,105.0,180,90,male,way above normal,normal,1,alcoholic
69998,22431,163,72.0,135,80,female,normal,above normal,1,


In [139]:
# TODO: make mising values in height / weight

In [140]:
train_X, test_X, train_y, test_y = train_test_split(messy_data.drop('is_cardio_ill', axis=1), messy_data['is_cardio_ill'], test_size=0.7)

In [141]:
train_data = pd.concat([train_X, train_y], axis=1)
test_data = pd.concat([test_X, test_y], axis=1)
test_data.to_csv('data/test_data.csv', index=False)

In [142]:
weight_missing = np.random.choice(train_data.index, int(2.89 * len(train_data) / 100))
height_missing = np.random.choice(train_data.index, int(2.31 * len(train_data) / 50))

In [143]:
train_data.loc[weight_missing, "weight"] = np.nan
train_data.loc[height_missing, "height"] = np.nan

In [144]:
# TODO: save a dataset train, test 30 x 70

In [145]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21000 entries, 45909 to 43881
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            21000 non-null  int64  
 1   height         20048 non-null  float64
 2   weight         20401 non-null  float64
 3   ap_hi          21000 non-null  int64  
 4   ap_lo          21000 non-null  int64  
 5   gender         21000 non-null  object 
 6   cholesterol    21000 non-null  object 
 7   glucose        21000 non-null  object 
 8   lifestyle      17278 non-null  object 
 9   is_cardio_ill  21000 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 2.3+ MB


In [146]:
train_data.to_csv("data/train_data.csv", index=False)