In [1421]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import json
import re
from sklearn.preprocessing import StandardScaler
from IPython.core.interactiveshell import InteractiveShell
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
InteractiveShell.ast_node_interactivity = "all"
standard_X = StandardScaler()

In [1422]:
dfp = pd.read_csv('data-set/personal_valid.csv')
dfo = pd.read_csv('data-set/other_valid.csv')
dfp = dfp.rename(columns={'Unnamed: 0': 'ID'})
dfo = dfo.rename(columns={'Unnamed: 0': 'ID'})
pd.set_option('display.max_columns', 30)

## Integrácia dát a deduplikácia záznamov

Táto časť je skopírovaná z predchadzajúceho notebooku 'Prieskumná_analýza.ipynb'. Spojí dataset other a personal do jedného a následne odstráni duplikatne záznamy. 

In [1423]:
# merges duplicates
def merge(name):
    df = dfo_duplicates.loc[dfo_duplicates['name'] == name]
    return df.groupby(['name'], as_index=False).first()

def merge_other_personal(df):
    x = df.copy()

    dfo_duplicates = dfo[dfo.duplicated(['name'], keep=False)]
    dfo_unique = dfo.drop_duplicates(subset=["name"], keep=False)
    merged = []

    for name in dfo_duplicates['name'].unique():
        merged.append(merge(name))

    dfo_unique = dfo_unique.append(merged)

    x = pd.merge(dfp, dfo_unique, on='name')
    x = x.drop(columns=['address_y', 'ID_y'])
    x = x.rename(columns={"ID_x": "ID", "address_x": "address"})
    return x
    
df = merge_other_personal(df)

AttributeError: 'NoneType' object has no attribute 'copy'

Dalsia vec ktoru musime spravit, je rozbalenie medical info. Nachadza sa tu: kurtosis_oxygen, mean_oxygen, skewness_oxygen a std_oxygen. Tieto hodnoty su ulozene ako object (string), pre to ich budeme musiet previest na cislo.

In [None]:
def unpack_medical(df):
    x = df.copy()
    for i, row in x.iterrows():
        if not pd.isnull(x.at[i, 'medical_info']):
            x.at[i, 'medical_info'] = json.loads(x["medical_info"][i].replace("\'", "\""))
    # vytvorenie stlpcov z medical_info a ich spojenie so zvyskom dataframe
    df_med_info = x["medical_info"].apply(pd.Series)
    df_med_info = df_med_info.drop(0, 1)
    x = pd.concat([x, df_med_info], axis = 1).drop("medical_info", axis = 1)
    return x

df = unpack_medical(df)

Zmena kurtosis_oxygen, mean_oxygen, skewness_oxygen, std_oxygen z object na float

In [None]:
def obj_to_float(df):
    x = df.copy()
    # kurtosis_oxygen
    x['kurtosis_oxygen'] = x['kurtosis_oxygen'].astype(np.float)
    # mean_oxygen
    x['mean_oxygen'] = x['mean_oxygen'].astype(np.float)
    # skewness_oxygen
    x['skewness_oxygen'] = x['skewness_oxygen'].astype(np.float)
    # std_oxygen
    x['std_oxygen'] = x['std_oxygen'].astype(np.float)
    return x
df = obj_to_float(df)

Rozhodli sme sa dropnut niektore stlpce. Meno, ID, fnlwgt. Dovodom je, ze kazda z tychto hodnot je rozna a na vyskyt cukrovky nema vplyv.  
Mame 2 udaje ktore nam ukazuju vek cloveka - age a date_of_birth. Kedze nemame ziadne nullove hodnoty pri age, nemusime podla datumu narodenia vek urcovat. Datum narodenia teda mozeme vyhodit

In [None]:
def remove_unimportant_columns(df):
    x = df.copy()
    x = x.drop(['ID', 'name', 'fnlwgt', 'date_of_birth'], axis=1)
    return x
df = remove_unimportant_columns(df)

Odstranenie space-ov z nazvov atributov

In [None]:
def remove_space (df):
    x = df.copy()
    x = df.apply(lambda y: y.str.strip() if y.dtype == "object" else y)
    return x

df = remove_space(df)

Nahradenie hodnot, ktore mozu nadobudat len 2 stavy

In [None]:
def put_0_1_values(df):
    x = df.copy()
    
    # pohlavia: Male -> 1; Female -> 0
    x['sex'] = x['sex'].replace('Male', 1)
    x['sex'] = x['sex'].replace('Female', 0)
    
    # tehotnost: T -> 1; F -> 0
    x['pregnant'] = x['pregnant'].replace(regex='(?i)f.*', value=0)
    x['pregnant'] = x['pregnant'].replace(regex='(?i)t.*', value=1)
    
    # muzi oznaceni ako tehotny su prepisani na 0
    x.loc[(x['pregnant'] == 1) & (x['sex'] == 1), 'pregnant'] = 0
    
    # zmena income hodnot: <=50K -> 0; >50K -> 1
    x['income'] = x['income'].replace('<=50K', 0)
    x['income'] = x['income'].replace('>50K', 1)
    
    # zmena nazvov stlpcov na presnejsie
    x = x.rename(columns={"pregnant": "is_pregnant", "income": "income_>50K"})
    
    return x

df = put_0_1_values(df)

**Education a education-num**  
Zistujeme, co znamena education a education num. Predpoklad je, ze education-num je numericka reprezentacia education

In [None]:
def education_analysis(df):
    # prints unique values in education
    x = df.copy()
    unique_edu = pd.unique(x['education'])
    print("Pred zjednotenim:\n", unique_edu)
    
    # Zjednotenie reprezentacii
    x['education'] = x['education'].replace(regex='(?i)_', value='-')
    unique_edu = pd.unique(x['education'])
    print("\nPo zjednoteni:\n", unique_edu)
    
    # hodnoty v education-num a v education
    print("\nHodnoty v jednotlivych education:")
    for item in unique_edu:
        edu_num = x.query("education == @item")["education-num"].unique()
        print(item, edu_num)

education_analysis(df)

Vidime ze education-num je ciselna reprezentacia education. Vytvorime si dictionary, ktory bude priradovat education ku education-num, s tym ze nechame celociselne reprezentacie od 1 po 16.

In [None]:
#vrati unikatne hodnoty v stlpci education
def get_unique_edu(df):
    x = df.copy()
    x['education'] = x['education'].replace(regex='(?i)_', value='-')
    unique_edu = pd.unique(x['education'])
    return unique_edu

# rozne hodnoty education-num pre unikatny education zmeni na jedno (napr.: 5th-6th [  3. 300.] -> 3)
def get_edu_num(edu_num):
    for num in edu_num:
        if num is None:
            continue
        elif num < 100:
            return int(num)

def transform_education(df):
    x = df.copy()
    edu_to_num = {}
    #vytvorenie dictionary s moznymi hodnotami v jendotlivych education values
    for item in get_unique_edu(x):
        edu_num = x.query("education == @item")["education-num"].unique()
        edu_to_num[item] = get_edu_num(edu_num)
    
    # zmena moznych hodnot v education na rovnake
    x['education'] = x['education'].replace(regex='(?i)_', value='-')

    # namapuje nazvy education na cisla z dictionary
    x["education"] = x["education"].map(edu_to_num)
    
    # Dropne nepotrebny column education-num (bol nahradeny)
    x = x.drop(['education-num'], axis=1)
    return x
    
df = transform_education(df)

Cele adresy nam netreba pretoze 

In [None]:
def find_state(address):
    i = re.search('\x5cn.+\D', address)
    return address[i.start():i.end()][-3:-1]
    #return address[-8:][:2]

def address_to_state(df):
    x = df.copy()
    x['address'] = x['address'].apply(find_state)
    x = x.rename(columns={"address": "state"})
    return x

df = address_to_state(df)

Otazniky zmenime na NaN 

In [None]:
def replace_with_nan(df):
    df.replace(['??', '?'], np.nan)
    
df = replace_with_nan(df)

In [None]:
df

Income vieme zmenit na 2 hodnoty: <=50K = 0; >50K = 1. Mame jednu hodnotu NaN a nevieme ju predikovat.

In [None]:
# df['income'].unique()
# df_null = df[df['income'].isnull()]
# df_null
# df_tetst = df.loc[df['occupation'] == 'Sales']
# df_tetst = df_tetst.loc[df_tetst['hours-per-week'] > 45.0]
# df_tetst[['age', 'sex', 'race', 'marital-status', 'occupation', 'income', 'hours-per-week', 'education', 'workclass', 'relationship']]

In [None]:
# df['workclass'].unique()

In [None]:
# df.loc[df['native-country'] == '?'][['address', 'native-country']]

### Zmena na ciselne hodnoty

## Chybajuce hodnoty a outliers

### Chybajuce hodnoty

### Outliers

### Pipeline

In [None]:
# pipeline = Pipeline([
#     ('test', FunctionTransformer(
#         func=put_0_1_values                        # function to be used
#         )) # parameters to the function
# ])
# df = pipeline.fit_transform(df)
# df