In [152]:
import numpy as np
import pandas as pd
import pandas_profiling
import math
import re

In [153]:
def is_number(s):
    """ Returns True is string is a number. """
    try:
        float(s)
        return True
    except ValueError:
        return False

In [154]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [155]:
ntrain = train.shape[0]
ntest = train.shape[0]
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['stroke_in_2018'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))

all_data size is : (43590, 13)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


### Data Cleaning

In [156]:
def is_job_status_and_living_area_reversed(x):
    return ((x["job_status"] != None and x["job_status"] in ("r", "c", "city", "remote", "remotee"))
             or (x["living_area"] != None and x["living_area"] in ("private_sector", "business_owner")))
def process_job_status(x):
    if x == None or x in ("nan", 'null', "", 'n.a'):
        return None
    elif x in ("private sector", "privattte", "private", "private_sector"):
        return "private_sector"
    elif x in ("government", "govt."):
        return "government"
    elif x in ("business_owner", "business owner", "biz"):
        return "business_owner"
    elif x in ("parental_leave", "parental leave"):
        return "parental_leave"
    else:
        return x
    
def process_living_area(x):
    if x == None or x in ("nan", 'null', "", 'n.a'):
        return None
    elif x == 'c':
        return 'city'
    elif x in ('r', 'remotee'):
        return 'remote'
    else:
        return x

def split_job_status_and_living_area(x):
    pair = x.lower().split("?") if x != None else [x, x]
    if len(pair) < 2:
        pair = [pair[0], None]
    return pair

def process_job_status_and_living_area(df):
    df["job_status"] = df["job_status and living_area"].astype(str).apply(split_job_status_and_living_area).apply(lambda x: x[0])
    df["living_area"] = df["job_status and living_area"].astype(str).apply(split_job_status_and_living_area).apply(lambda x: x[1])
    job_status = df.apply(lambda x: x["living_area"] if is_job_status_and_living_area_reversed(x) else x["job_status"], 1)
    living_area = df.apply(lambda x: x["job_status"] if is_job_status_and_living_area_reversed(x) else x["living_area"], 1)
    df["job_status"] = job_status.apply(lambda x: process_job_status(x))
    df["living_area"] = living_area.apply(lambda x: process_living_area(x))
    df.drop(columns='job_status and living_area',inplace=True)

In [157]:
process_job_status_and_living_area(all_data)
#process_job_status_and_living_area(test)

In [158]:
def process_smoker_status(x):
    if x == None:
        return None
    elif x.startswith("non"):
        return "non-smoker"
    elif x.startswith("quit"):
        return "quit"
    elif x.startswith("active"):
        return "active_smoker"
    else:
        return None
all_data["smoker_status"] = all_data["smoker_status"].astype(str).apply(process_smoker_status)
#test["smoker_status"] = test["smoker_status"].astype(str).apply(process_smoker_status)

Convert BMI to numeric

In [159]:
all_data["BMI"] = pd.to_numeric(all_data["BMI"],errors="coerce")

Treat '.,' as Missing Values

In [161]:
def replace_invalid_binary_values(df,columns):
    for col in columns:
        df[~df[col].isin(["0","1"])] = None
#replace_invalid_binary_values(all_data,["high_BP","stroke_in_2018","married"])
replace_invalid_binary_values(all_data,["high_BP","heart_condition_detected_2017","married"])

Clean Sex and Age

In [162]:
def clean_sex_age(sex_age_list):
    if type(sex_age_list) is not list:
        return [None,None]
    # Strip and Upper case both sex and age
    sex_age_list[0],sex_age_list[1] = sex_age_list[0].strip().upper(), sex_age_list[1].strip().upper()
    
    # 2nd : first one is empty and second one is not numeric
    if (is_number(sex_age_list[0]) or (not sex_age_list[0] and not is_number(sex_age_list[1]))): 
        sex_age_list = sex_age_list[::-1]
    sex = sex_age_list[0].strip().upper()
    
    # Take
    if sex in ('FEMALE','FEMALLE'):
        sex = 'F'
    if sex in ('MALE','MMALE','MM'):
        sex = 'M'

    sex_age_list[0] = sex
    sex_age_list[1] = sex_age_list[1]
    return sex_age_list
def process_sex_age(df):
    df["sex_age_list"] = df["sex and age"].str.split(",").apply(clean_sex_age)
    df[['sex','age']] = pd.DataFrame(df["sex_age_list"].values.tolist(), index= df.index)
    df["age"] = pd.to_numeric(df["age"],errors='coerce').round()
    return df.drop(columns=["sex_age_list","sex and age"])
all_data = process_sex_age(all_data)

In [169]:
def merge(grp):
    df = pd.DataFrame()
    if(grp.shape[0] > 1):
        for c in grp.columns:
            value_counts = grp[c].value_counts().index.astype(grp[c].dtypes)
            if value_counts.size > 1:
                print(grp, value_counts) #Error
            elif value_counts.size == 1:
                df[c] = value_counts[0]
            else:
                df[c] = None
    else:
        df = grp.head(1)
    return df

all_data = all_data.groupby("id").apply(merge).set_index("id")

In [181]:
def convert_to_categorical(df, columns):
    for col in columns:
        df[col] = df[col].astype('category')
convert_to_categorical(all_data,["heart_condition_detected_2017","married","high_BP","job_status",
                                 "sex","smoker_status","living_area"])

In [149]:
#all_data.drop_duplicates(inplace=True)

In [171]:
def split_train_test(df):
    return df[:ntrain], df[ntrain:]

In [182]:
preprocessed_train, preprocessed_test = split_train_test(all_data)

### Missing Values

In [202]:
def impute_by_mode(df,columns):
    for col in columns:
        df[col] = df[col].fillna(all_data[col].mode().iloc[0])

In [203]:
impute_by_mode(all_data,["heart_condition_detected_2017","high_BP","married"])

In [206]:
all_data["sex"] = all_data["sex"].fillna("OTHER")

In [207]:
profile = all_data.profile_report(title='Medical Record Profiling Report')
profile.to_file(output_file="train_data_summary.html")

(using `df.profile_report(correlations={"cramers": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/pandas-profiling/pandas-profiling/issues
(include the error message: 'The internally computed table of expected frequencies has a zero element at (0, 0).')
  correlation_name=correlation_name, error=error
