# Dataset other_train.csv
#### Anton Rusňák, František Gič

In [65]:
import pandas as pd
import numpy as np
import matplotlib as mat
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
%matplotlib inline

# Load the raw "other" training table from the comma-separated CSV.
data_other = pd.read_csv(
    "data/other_train.csv",
    sep=",",
)

In [66]:
#data_other = data_other.drop(data_other.columns[0],axis=1)


In [67]:
def sanitize_boolean(boolean):
    """Translate a textual true/false flag into a Python bool.

    Surrounding whitespace is stripped before matching. Only the exact
    spellings listed below are recognised.

    Returns:
        False or True for a recognised spelling; None for any other string
        and for values without a ``strip`` method (e.g. NaN, None, bool).
    """
    false_spellings = ('f', 'F', 'FALSE', 'false', 'False')
    true_spellings = ('t', 'T', 'TRUE', 'true', 'True')

    try:
        token = boolean.strip()
    except AttributeError:
        # Non-string input cannot be stripped; treat it as missing.
        return None

    if token in false_spellings:
        return False
    if token in true_spellings:
        return True
    return None
    
# Normalise the textual pregnant flags to True / False / None.
data_other['pregnant'] = data_other['pregnant'].apply(sanitize_boolean)


In [68]:
def parse_personal_info(personal_info):
    """Split a raw personal_info string into its individual fields.

    The separators ' -- ', '|' and '\\r\\r\\n' are first unified to a comma,
    then the text is split on commas.

    Returns:
        A NumPy array of the field strings, or None when the input has no
        ``replace`` method (e.g. NaN or None for a missing cell).
    """
    try:
        unified = personal_info
        for separator in (' -- ', '|', '\r\r\n'):
            unified = unified.replace(separator, ',')
        fields = unified.split(',')
    except AttributeError:
        return None
    return np.array(fields)
    
def remove_empty(arr):
    """Replace the placeholder entries '?' and '??' with None.

    Args:
        arr: A sequence of field values (list or NumPy array), or None.

    Returns:
        None when ``arr`` is None. A list (or any non-unicode array) is
        modified in place and returned. A fixed-width unicode NumPy array is
        returned as a new object-dtype array, because the original dtype
        cannot store None.
    """
    if arr is None:
        return arr

    # Assigning None into a fixed-width unicode array stores the *text*
    # 'None' (cut to the item width) instead of a missing value, so switch
    # to object dtype before writing real None entries.
    if isinstance(arr, np.ndarray) and arr.dtype.kind == 'U':
        arr = arr.astype(object)

    empty = ('?', '??')
    for i, value in enumerate(arr):
        if value is not None and value in empty:
            arr[i] = None
    return arr
    
def fill_value(number, arr):
    """Return the element of ``arr`` at position ``number``.

    Returns None when ``arr`` itself is None. An out-of-range position is not
    handled here and propagates the indexing error.
    """
    if arr is None:
        return None
    return arr[number]

In [69]:
# Split each personal_info string into fields, then run remove_empty over
# every parsed record.
parsed_info = (
    data_other['personal_info']
    .apply(parse_personal_info)
    .apply(remove_empty)
)

# The parsed records are positional: field number `index` becomes column `name`.
for index, name in enumerate([
    'employment',
    'country',
    'relationship_info',
    'employment_info',
    'race',
]):
    data_other[name] = parsed_info.apply(
        lambda fields, position=index: fill_value(position, fields)
    )

In [70]:
# Rows whose name already appeared earlier in the table (first occurrence excluded).
data_duplicates = data_other.loc[
    data_other.duplicated(subset='name', keep='first')
]
#print(data_duplicates)

      Unnamed: 0                  name  \
611          611        Thomas Sanchez   
1011        1011          Scott Devine   
1056        1056           Brian Sayle   
1579        1579     Timothy Morrissey   
1612        1612            Jeff Clark   
1664        1664          Charles Ryan   
1782        1782             Van Mason   
1806        1806       Clarence Rivers   
1844        1844        Edward Peschel   
1916        1916        Timothy Sexton   
2032        2032           Robert Coen   
2146        2146         Jeffrey Silva   
2300        2300          Joseph Davis   
2328        2328        William Snyder   
2433        2433            Carl Gross   
2452        2452          Leslie Marks   
2641        2641        Michael Parker   
2744        2744          Randal Louis   
3031        3031         Jesse Garrett   
3039        3039       George Hairston   
3087        3087         Marcus Martin   
3124        3124            Ken Howard   
3142        3142         Matthew B

In [71]:
# Print the column summary (dtypes and non-null counts), then preview the
# first rows as the cell's displayed value.
data_other.info()
data_other.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3983 entries, 0 to 3982
Data columns (total 27 columns):
Unnamed: 0           3983 non-null int64
name                 3983 non-null object
address              3983 non-null object
kurtosis_oxygen      3978 non-null float64
pregnant             3971 non-null object
education-num        3965 non-null float64
relationship         3970 non-null object
skewness_glucose     3960 non-null float64
mean_glucose         3973 non-null float64
std_oxygen           3969 non-null float64
capital-gain         3967 non-null float64
skewness_oxygen      3970 non-null float64
kurtosis_glucose     3965 non-null float64
personal_info        3971 non-null object
education            3971 non-null object
fnlwgt               3970 non-null float64
class                3964 non-null float64
std_glucose          3971 non-null float64
income               3969 non-null object
mean_oxygen          3967 non-null float64
hours-per-week       3970 non-null float64

Unnamed: 0.1,Unnamed: 0,name,address,kurtosis_oxygen,pregnant,education-num,relationship,skewness_glucose,mean_glucose,std_oxygen,...,std_glucose,income,mean_oxygen,hours-per-week,capital-loss,employment,country,relationship_info,employment_info,race
0,0,Alex Chinzi,"590 Samantha Bridge Apt. 572\r\nNorth Allison,...",8.286599,False,16.0,Husband,0.234418,107.484375,15.072284,...,44.487706,>50K,2.448161,50.0,0.0,Prof-specialty,United-States,Married-civ-spouse,Private,White
1,1,Scott Bass,"5384 Hurst Groves Apt. 092\r\nFreemanview, SD ...",6.45586,False,13.0,Own-child,0.15113,140.203125,21.925411,...,44.586531,<=50K,4.445652,15.0,0.0,Prof-specialty,United-States,Never-married,Private,White
2,2,Chuck Lao,USS Scott\r\nFPO AE 78885,11.513491,False,9.0,Unmarried,-0.024384,124.054688,11.822427,...,45.904165,<=50K,1.890468,40.0,0.0,Other-service,Mexico,Separated,Private,White
3,3,Floyd Squires,"122 Ryan Ranch Suite 621\r\nPort Douglas, NM 0...",7.641796,False,13.0,Not-in-family,-0.564287,115.046875,22.965651,...,58.140302,<=50K,3.639632,40.0,0.0,Prof-specialty,Canada,Never-married,Private,White
4,4,Kenneth Cadet,"30930 Ryan Groves\r\nKatelynside, MS 75220",8.826058,False,9.0,Husband,0.495896,93.335938,13.777158,...,45.605865,<=50K,2.51087,40.0,0.0,Sales,United-States,Married-civ-spouse,Private,White


In [72]:
# Every listed column is aggregated with 'first'. The key order below is the
# column order of the result; 'Unnamed: 0' is not listed and so is dropped.
aggregation_functions = dict.fromkeys(
    [
        'name', 'address', 'kurtosis_oxygen', 'pregnant', 'education-num',
        'relationship', 'skewness_glucose', 'mean_glucose', 'std_oxygen',
        'capital-gain', 'skewness_oxygen', 'kurtosis_glucose', 'personal_info',
        'education', 'fnlwgt', 'class', 'std_glucose',
        'income', 'mean_oxygen', 'hours-per-week', 'capital-loss', 'employment',
        'country', 'relationship_info', 'employment_info', 'race',
    ],
    'first',
)

# Collapse rows that share a name into a single row. Grouping by the Series
# (not the column label) keeps 'name' available as an aggregated column.
data_other = data_other.groupby(data_other['name']).agg(aggregation_functions)

In [73]:
# Repeat the duplicate-name check on the current data_other and print the result.
data_duplicates = data_other.loc[
    data_other.duplicated(subset='name', keep='first')
]
print(data_duplicates)

Empty DataFrame
Columns: [name, address, kurtosis_oxygen, pregnant, education-num, relationship, skewness_glucose, mean_glucose, std_oxygen, capital-gain, skewness_oxygen, kurtosis_glucose, personal_info, education, fnlwgt, class, std_glucose, income, mean_oxygen, hours-per-week, capital-loss, employment, country, relationship_info, employment_info, race]
Index: []

[0 rows x 26 columns]


In [74]:
# Load the raw "personal" training table from the comma-separated CSV.
data_personal = pd.read_csv(
    "data/personal_train.csv",
    sep=",",
)

In [75]:
# Preview the first rows of the personal table.
data_personal.head()

Unnamed: 0.1,Unnamed: 0,name,address,age,sex,date_of_birth
0,0,Roscoe Bohannon,"7183 Osborne Ways Apt. 651\r\nEast Andrew, OH ...",59.0,Male,1960-07-04
1,1,Ernest Kline,"391 Ball Road Suite 961\r\nFlowersborough, IN ...",47.0,Female,1972-07-20
2,2,Harold Hendriks,"8702 Vincent Square\r\nNew Jerryfurt, CO 30614",59.0,Male,1960-02-28
3,3,Randy Baptiste,"2751 Harris Crossroad\r\nWest Ashley, CA 30311",51.0,Female,1967-12-04
4,4,Anthony Colucci,"904 Robert Cliffs Suite 186\r\nWest Kyle, CO 7...",,Female,1938-04-22


In [76]:
# Drop the leading column, selected by position rather than by label.
# NOTE: not idempotent - re-running this cell removes one more column.
leading_column = data_personal.columns[0]
data_personal = data_personal.drop(columns=leading_column)

In [77]:
# Print the column summary (dtypes and non-null counts) of the personal table.
data_personal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3933 entries, 0 to 3932
Data columns (total 5 columns):
name             3933 non-null object
address          3933 non-null object
age              3343 non-null object
sex              3933 non-null object
date_of_birth    3933 non-null object
dtypes: object(5)
memory usage: 153.8+ KB


In [78]:
# Rows that are exact copies (all columns equal) of an earlier row.
duplicateRowsDF = data_personal.loc[data_personal.duplicated()]
print(duplicateRowsDF)

Empty DataFrame
Columns: [name, address, age, sex, date_of_birth]
Index: []


In [79]:
def reload_data():
    """Read ``data/personal_train.csv`` again and return it as a DataFrame.

    The first CSV column is used as the row index (``index_col=0``).
    """
    personal_frame = pd.read_csv(
        "data/personal_train.csv",
        sep=",",
        index_col=0,
    )
    return personal_frame
    
# Replace data_personal with a fresh copy read from disk; any earlier
# in-memory changes to it are discarded.
data_personal = reload_data()


In [84]:
# Clear the index labels on both frames before joining.
# NOTE(review): presumably needed because data_other's index is labelled
# 'name' after the groupby and would clash with the 'name' column - confirm.
data_other.index.name = None
data_personal.index.name = None

# Outer join on the (name, address) pair, which is spelled the same in both frames.
data = data_personal.merge(
    data_other,
    on=['name', 'address'],
    how='outer',
)

In [87]:
# Print the column summary (dtypes and non-null counts) of the merged table.
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3933 entries, 0 to 3932
Data columns (total 29 columns):
name                 3933 non-null object
address              3933 non-null object
age                  3343 non-null object
sex                  3933 non-null object
date_of_birth        3933 non-null object
kurtosis_oxygen      3933 non-null float64
pregnant             3929 non-null object
education-num        3932 non-null float64
relationship         3932 non-null object
skewness_glucose     3931 non-null float64
mean_glucose         3932 non-null float64
std_oxygen           3932 non-null float64
capital-gain         3933 non-null float64
skewness_oxygen      3932 non-null float64
kurtosis_glucose     3932 non-null float64
personal_info        3933 non-null object
education            3932 non-null object
fnlwgt               3933 non-null float64
class                3932 non-null float64
std_glucose          3933 non-null float64
income               3932 non-null object
