In [1531]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import json
import re
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from IPython.core.interactiveshell import InteractiveShell
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
InteractiveShell.ast_node_interactivity = "all"

In [1456]:
# Show up to 30 columns whenever a frame is displayed.
pd.set_option('display.max_columns', 30)

# Load both source tables; the 'Unnamed: 0' column read from each CSV is
# renamed to 'ID'.
dfp = pd.read_csv('data-set/personal_valid.csv').rename(columns={'Unnamed: 0': 'ID'})
dfo = pd.read_csv('data-set/other_valid.csv').rename(columns={'Unnamed: 0': 'ID'})

## Integrácia dát a deduplikácia záznamov

Táto časť je skopírovaná z predchádzajúceho notebooku 'Prieskumná_analýza.ipynb'. Spojí datasety other a personal do jedného a následne odstráni duplicitné záznamy.

In [1457]:
# merges duplicates
def merge(name):
    df = dfo_duplicates.loc[dfo_duplicates['name'] == name]
    return df.groupby(['name'], as_index=False).first()

def merge_other_personal(df):
    dfo_duplicates = dfo[dfo.duplicated(['name'], keep=False)]
    dfo_unique = dfo.drop_duplicates(subset=["name"], keep=False)
    merged = []

    for name in dfo_duplicates['name'].unique():
        merged.append(merge(name))

    dfo_unique = dfo_unique.append(merged)

    x = pd.merge(dfp, dfo_unique, on='name')
    x = x.drop(columns=['address_y', 'ID_y'])
    x = x.rename(columns={"ID_x": "ID", "address_x": "address"})
    return x
    
# merge_other_personal never uses its argument (it reads the module-level dfp
# and dfo), so pass None rather than a `df` that does not exist yet when the
# notebook is run top to bottom on a fresh kernel.
df = merge_other_personal(None)

Dalsia vec, ktoru musime spravit, je rozbalenie medical info. Nachadza sa tu: kurtosis_oxygen, mean_oxygen, skewness_oxygen a std_oxygen. Tieto hodnoty su ulozene ako object (string), preto ich budeme musiet previest na cislo.

In [1458]:
def unpack_medical(df):
    """Expand the ``medical_info`` column into one column per key.

    Each non-null cell is expected to hold a dict written with single quotes;
    the quotes are swapped for double quotes and the text is parsed as JSON.
    Rows with a missing ``medical_info`` get NaN in every expanded column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``medical_info`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``medical_info``, followed by the expanded columns.
    """
    def _parse(raw):
        # A missing cell becomes an empty record, i.e. an all-NaN row.
        if pd.isnull(raw):
            return {}
        return json.loads(raw.replace("'", '"'))

    records = [_parse(raw) for raw in df["medical_info"]]

    # Building the frame from records avoids the spurious column `0` that
    # `apply(pd.Series)` produced for missing rows (and the positional
    # `drop(0, 1)` needed to remove it, which pandas 2.0 no longer accepts).
    df_med_info = pd.DataFrame(records, index=df.index)

    return pd.concat([df.drop(columns="medical_info"), df_med_info], axis=1)

df = unpack_medical(df)

Zmena kurtosis_oxygen, mean_oxygen, skewness_oxygen, std_oxygen z object na float

In [1459]:
def obj_to_float(df):
    """Cast the four oxygen statistics columns to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``kurtosis_oxygen``, ``mean_oxygen``, ``skewness_oxygen``
        and ``std_oxygen`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with those four columns converted to float.
    """
    oxygen_columns = ['kurtosis_oxygen', 'mean_oxygen', 'skewness_oxygen', 'std_oxygen']
    # The builtin `float` replaces the `np.float` alias, which NumPy removed.
    return df.astype({column: float for column in oxygen_columns})
df = obj_to_float(df)

Rozhodli sme sa dropnut niektore stlpce. Meno, ID, fnlwgt. Dovodom je, ze kazda z tychto hodnot je rozna a na vyskyt cukrovky nema vplyv.  
Mame 2 udaje ktore nam ukazuju vek cloveka - age a date_of_birth. Kedze nemame ziadne nullove hodnoty pri age, nemusime podla datumu narodenia vek urcovat. Datum narodenia teda mozeme vyhodit

In [1460]:
def remove_unimportant_columns(df):
    """Return a copy of ``df`` without the ID, name, fnlwgt and date_of_birth columns."""
    unimportant = ['ID', 'name', 'fnlwgt', 'date_of_birth']
    return df.drop(columns=unimportant)
df = remove_unimportant_columns(df)

Odstranenie medzier na zaciatku a na konci textovych hodnot atributov

In [1461]:
def remove_space(df):
    """Strip leading and trailing whitespace from the text values of ``df``.

    Only string cells are touched. Numbers, missing values and any other
    objects stored in a text column are passed through unchanged (the previous
    ``.str.strip()`` call replaced such cells with NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index.
    """
    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    def _clean(column):
        # Covers both classic object columns and dedicated string dtypes.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            return column.map(_strip)
        return column

    return df.apply(_clean)

df = remove_space(df)

Nahradenie hodnot, ktore mozu nadobudat len 2 stavy

In [1462]:
def put_0_1_values(df):
    """Encode the two-state columns ``sex``, ``pregnant`` and ``income`` as 0/1.

    * ``sex``: 'Male' -> 1, 'Female' -> 0
    * ``pregnant``: text matching the case-insensitive pattern ``f.*`` -> 0,
      text matching ``t.*`` -> 1; males flagged as pregnant are reset to 0
    * ``income``: '<=50K' -> 0, '>50K' -> 1

    Values that match none of the rules are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``sex``, ``pregnant`` and ``income`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy with the encoded columns; ``pregnant`` is renamed to
        ``is_pregnant`` and ``income`` to ``income_>50K``.
    """
    x = df.copy()

    # sex: Male -> 1; Female -> 0
    x['sex'] = x['sex'].replace({'Male': 1, 'Female': 0})

    # pregnancy: F... -> 0 first, then T... -> 1 (order kept from the original)
    x['pregnant'] = x['pregnant'].replace(regex='(?i)f.*', value=0)
    x['pregnant'] = x['pregnant'].replace(regex='(?i)t.*', value=1)

    # men marked as pregnant are overwritten with 0
    x.loc[(x['pregnant'] == 1) & (x['sex'] == 1), 'pregnant'] = 0

    # income: <=50K -> 0; >50K -> 1
    x['income'] = x['income'].replace({'<=50K': 0, '>50K': 1})

    # `replace` is not guaranteed to turn an object column into a numeric one
    # (pandas has deprecated that implicit downcast), so request the numeric
    # dtype explicitly. Columns that still hold text stay as object.
    for column in ('sex', 'pregnant', 'income'):
        x[column] = x[column].infer_objects()

    # more precise column names
    x = x.rename(columns={"pregnant": "is_pregnant", "income": "income_>50K"})

    return x

df = put_0_1_values(df)

**Education a education-num**  
Zistujeme, co znamena education a education num. Predpoklad je, ze education-num je numericka reprezentacia education

In [1463]:
def education_analysis(df):
    """Print how ``education`` labels relate to ``education-num`` values.

    Prints the distinct ``education`` labels as stored, the distinct labels
    after every underscore is replaced by a hyphen, and then, for each unified
    label, the distinct ``education-num`` values found on its rows.
    ``df`` is not modified and nothing is returned.
    """
    education = df['education']
    print("Pred zjednotenim:\n", pd.unique(education))

    # Unify the two spellings of each label
    unified = education.replace(regex='(?i)_', value='-')
    levels = pd.unique(unified)
    print("\nPo zjednoteni:\n", levels)

    # education-num values observed for each unified label
    print("\nHodnoty v jednotlivych education:")
    numbers = df["education-num"]
    for level in levels:
        print(level, numbers[unified == level].unique())

education_analysis(df)

Pred zjednotenim:
 ['Assoc-voc' 'Some-college' 'Assoc_voc' 'HS_grad' 'HS-grad' 'Bachelors'
 'Assoc-acdm' 'Doctorate' 'Some_college' '10th' '7th-8th' 'Masters' '11th'
 '12th' 'Prof-school' '1st-4th' '5th-6th' '9th' '1st_4th' '7th_8th'
 'Assoc_acdm' 'Prof_school' '5th_6th' 'Preschool']

Po zjednoteni:
 ['Assoc-voc' 'Some-college' 'HS-grad' 'Bachelors' 'Assoc-acdm' 'Doctorate'
 '10th' '7th-8th' 'Masters' '11th' '12th' 'Prof-school' '1st-4th'
 '5th-6th' '9th' 'Preschool']

Hodnoty v jednotlivych education:
Assoc-voc [ 1100.    11.    nan -1100.]
Some-college [   10.    nan  1000. -1000.]
HS-grad [   9.   nan  900. -900.]
Bachelors [   13. -1300.  1300.    nan]
Assoc-acdm [   12. -1200.    nan  1200.]
Doctorate [   16. -1600.    nan]
10th [   6.  600.   nan -600.]
7th-8th [   4.   nan -400.  400.]
Masters [   14.    nan -1400.  1400.]
11th [   7.  700. -700.   nan]
12th [800.  nan   8.]
Prof-school [15.]
1st-4th [ 2. nan]
5th-6th [  3. 300.]
9th [5.]
Preschool [1.]


Vidime ze education-num je ciselna reprezentacia education. Vytvorime si dictionary, ktory bude priradovat education ku education-num, s tym ze nechame celociselne reprezentacie od 1 po 16.

In [1464]:
def get_unique_edu(df):
    """Return the distinct ``education`` labels with underscores replaced by hyphens.

    ``df`` itself is left untouched.
    """
    unified = df['education'].replace(regex='(?i)_', value='-')
    return pd.unique(unified)

def get_edu_num(edu_num):
    """Reduce the ``education-num`` variants of one education level to a single code.

    The observed variants of a level are the plain code, the code multiplied
    by 100, and the negated multiple (e.g. ``[3., 300.]`` -> ``3``).

    Parameters
    ----------
    edu_num : iterable of numbers
        Values observed for one education level; may contain NaN or None.

    Returns
    -------
    int or None
        The plain code if one is present; otherwise the code recovered from a
        scaled value; ``None`` when no usable value exists.
    """
    # NaN never satisfies `is None`, so missing values are filtered explicitly.
    observed = [num for num in edu_num if not pd.isnull(num)]

    # Prefer a plain code. The lower bound keeps negated scaled values such as
    # -1100 from being mistaken for a code just because they are below 100.
    for num in observed:
        if 0 < num < 100:
            return int(num)

    # Otherwise recover the code from a scaled value (x100, possibly negated).
    for num in observed:
        if abs(num) >= 100:
            return int(round(abs(num) / 100))

    return None

def transform_education(df):
    """Replace the ``education`` labels with their numeric code.

    The label spellings are unified first (underscore -> hyphen), then a
    label -> code lookup is built from the ``education-num`` values found on
    the rows of each label, and finally ``education`` is mapped through it.
    The now redundant ``education-num`` column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``education`` and ``education-num`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy with a numeric ``education`` column and no ``education-num``.
    """
    x = df.copy()

    # Unify the spellings BEFORE building the lookup. Previously the lookup was
    # built against the raw labels, so rows spelled with an underscore were
    # ignored and a level that only occurred in that spelling got no code.
    x['education'] = x['education'].replace(regex='(?i)_', value='-')

    # Lookup from each education label to its single numeric code
    edu_to_num = {}
    for item in get_unique_edu(x):
        edu_num = x.loc[x['education'] == item, 'education-num'].unique()
        edu_to_num[item] = get_edu_num(edu_num)

    # Map the education labels to the codes from the lookup
    x["education"] = x["education"].map(edu_to_num)

    # education-num has been replaced by the mapped column
    return x.drop(columns=['education-num'])
    
df = transform_education(df)

Cele adresy nam netreba, pretoze z nich vyuzijeme iba skratku statu. Stlpec address preto nahradime stlpcom state.

In [1465]:
def find_state(address):
    """Extract the two-character state code from a multi-line address.

    The pattern matches from a line break up to the last non-digit character
    it can reach. For an address whose second line looks like
    ``City, ST 12345`` the match ends on the space before the ZIP code, so the
    two characters in front of that space are the state code.

    Parameters
    ----------
    address : str
        Address text containing a line break.

    Returns
    -------
    str or float
        The two characters preceding the final matched character, or NaN when
        ``address`` is not a string or contains no match.
    """
    # Missing addresses arrive as NaN (a float); there is nothing to search.
    if not isinstance(address, str):
        return np.nan

    # Raw string: same pattern as before, without the invalid "\D" escape.
    match = re.search(r'\n.+\D', address)
    if match is None:
        return np.nan

    return match.group()[-3:-1]

def address_to_state(df):
    """Return a copy of ``df`` whose ``address`` column is replaced by ``state``.

    Every address is passed through ``find_state`` and the column is renamed;
    its position among the columns is unchanged.
    """
    states = df['address'].apply(find_state)
    return df.assign(address=states).rename(columns={"address": "state"})

df = address_to_state(df)

Otazniky zmenime na NaN 

In [1466]:
def replace_with_nan(df):
    """Return a copy of ``df`` in which the cells '??' and '?' are NaN."""
    placeholders = ['??', '?']
    return df.replace(to_replace=placeholders, value=np.nan)
df = replace_with_nan(df)

Zjednotime hodnoty vo workclass (local-gov a Local-gov)

In [1507]:
def transform_workclass(df):
    """Return a copy of ``df`` with the ``workclass`` values lower-cased."""
    lowered = df['workclass'].str.lower()
    return df.assign(workclass=lowered)
df = transform_workclass(df)

array(['private', 'local-gov', nan, 'self-emp-not-inc', 'state-gov',
       'federal-gov', 'self-emp-inc'], dtype=object)

Vytvorenie funkcie spajajucej integraciu dat do jednej

### Zmena na ciselne hodnoty

## Chybajuce hodnoty a outliers

### Chybajuce hodnoty

Rozdelime si hodnoty na strings a numericke

In [1573]:
def separate_by_dtype(df):
    """Split ``df`` into its numeric columns and all remaining columns.

    Both results keep the index of ``df`` and the original column order, also
    when one of the two groups turns out to have no columns. Numeric columns
    of any width (not only ``int64``/``float64``) count as numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_num, df_str)``: the numeric columns and the other columns.
    """
    df_num = df.select_dtypes(include='number')
    df_str = df.select_dtypes(exclude='number')
    return df_num, df_str

In [1571]:
def replace_missing_strings(df):
    """Fill missing cells of ``df`` with the most frequent value of their column.

    Uses ``SimpleImputer(strategy="most_frequent")`` and rebuilds a frame with
    the column labels and index of ``df``; ``df`` itself is not modified.
    """
    imputer = SimpleImputer(strategy="most_frequent")
    filled = imputer.fit_transform(df.copy())
    return pd.DataFrame(filled, columns=df.columns, index=df.index)

def replace_missing_numbers(df, strat='median', round_cols=('class', 'income_>50K')):
    """Impute the missing values of an all-numeric frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to impute. It is not modified.
    strat : str, default 'median'
        'mean' or 'median' selects ``SimpleImputer`` with that strategy; any
        other value selects ``KNNImputer`` with its default settings.
    round_cols : iterable of str, default ('class', 'income_>50K')
        Columns whose imputed values are rounded to the nearest whole number
        (they hold 0/1 codes, so a fractional fill would not be a valid code).
        Names that are not present in ``df`` are skipped instead of raising.

    Returns
    -------
    pandas.DataFrame
        Imputed frame with the column labels and index of ``df``.
    """
    if strat in ['mean', 'median']:
        imp = SimpleImputer(strategy=strat)
    else:
        imp = KNNImputer()

    filled = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

    for column in round_cols:
        if column in filled.columns:
            filled[column] = filled[column].round()

    return filled

In [None]:
def replace_missing_values(df, strat='median'):
    """Impute every missing value of ``df``.

    The frame is split with ``separate_by_dtype``; the non-numeric part is
    filled by ``replace_missing_strings`` and the numeric part by
    ``replace_missing_numbers`` using ``strat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    strat : str, default 'median'
        Strategy forwarded to ``replace_missing_numbers``.

    Returns
    -------
    pandas.DataFrame
        The imputed non-numeric columns followed by the imputed numeric columns.
    """
    df_num, df_str = separate_by_dtype(df)

    df_str = replace_missing_strings(df_str)
    df_num = replace_missing_numbers(df_num, strat)

    # The combined frame used to be built and then discarded; return it.
    return pd.concat([df_str, df_num], axis=1, sort=False)

In [1559]:
# df_num is only ever created inside replace_missing_values, so build it here
# before inspecting it; otherwise this cell fails on a fresh kernel.
df_num, df_str = separate_by_dtype(df)
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1311 entries, 0 to 1310
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1311 non-null   int64  
 1   sex               1311 non-null   int64  
 2   is_pregnant       1311 non-null   int64  
 3   skewness_glucose  1310 non-null   float64
 4   mean_glucose      1311 non-null   float64
 5   capital-gain      1308 non-null   float64
 6   kurtosis_glucose  1311 non-null   float64
 7   education         1311 non-null   int64  
 8   class             1310 non-null   float64
 9   std_glucose       1310 non-null   float64
 10  income_>50K       1310 non-null   float64
 11  hours-per-week    1311 non-null   float64
 12  capital-loss      1311 non-null   float64
 13  kurtosis_oxygen   1310 non-null   float64
 14  mean_oxygen       1310 non-null   float64
 15  skewness_oxygen   1310 non-null   float64
 16  std_oxygen        1310 non-null   float64


### Outliers

### Pipeline

In [1470]:
# pipeline = Pipeline([
#     ('test', FunctionTransformer(
#         func=put_0_1_values                        # function to be used
#         )) # parameters to the function
# ])
# df = pipeline.fit_transform(df)
# df