# Dataset other_train.csv
#### Anton Rusňák, František Gič

In [408]:
import pandas as pd
import numpy as np
import matplotlib as mat
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
import statistics 
from statistics import mode 
%matplotlib inline

data_other = pd.read_csv("data/other_train.csv",sep=",")

In [409]:
#data_other = data_other.drop(data_other.columns[0],axis=1)


In [410]:
def sanitize_boolean(boolean):
    try:
        if boolean.strip() in ['f','F','FALSE','false','False']:
            return False
        elif boolean.strip() in ['t','T','TRUE','true','True']:
            return True
    except AttributeError:
        return None
    
data_other.pregnant = data_other.pregnant.apply(sanitize_boolean)


In [411]:
def parse_personal_info(personal_info):
    try:    
        return np.array(personal_info.replace(' -- ', ',').replace('|', ',').replace('\r\r\n', ',').split(','))
    except AttributeError:
        return None
    
def remove_empty(arr):
        if arr is not None:
            empty = ['?','??']
            for i in range(0,len(arr)):
                if (arr[i] != None):
                    if arr[i] in empty:
                        arr[i] = None
        return arr
    
def fill_value(number,arr):
    if arr is not None:
        return arr[number]

In [412]:
parsed_info = data_other.personal_info.apply(parse_personal_info).apply(remove_empty)

for index,name in enumerate(['employment','country','relationship_info','employment_info','race']):
    data_other[name] = parsed_info.apply(lambda x: fill_value(index,x))

In [413]:
data_duplicates = data_other[data_other.duplicated(subset='name', keep='first')]
#print(data_duplicates)

In [414]:
 data_other.info()
# data_other.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3983 entries, 0 to 3982
Data columns (total 27 columns):
Unnamed: 0           3983 non-null int64
name                 3983 non-null object
address              3983 non-null object
kurtosis_oxygen      3978 non-null float64
pregnant             3971 non-null object
education-num        3965 non-null float64
relationship         3970 non-null object
skewness_glucose     3960 non-null float64
mean_glucose         3973 non-null float64
std_oxygen           3969 non-null float64
capital-gain         3967 non-null float64
skewness_oxygen      3970 non-null float64
kurtosis_glucose     3965 non-null float64
personal_info        3971 non-null object
education            3971 non-null object
fnlwgt               3970 non-null float64
class                3964 non-null float64
std_glucose          3971 non-null float64
income               3969 non-null object
mean_oxygen          3967 non-null float64
hours-per-week       3970 non-null float64

In [415]:
aggregation_functions = {'name': 'first','address':'first', 'kurtosis_oxygen': 'first','pregnant': 'first','education-num': 'first',
                         'relationship': 'first','skewness_glucose': 'first','mean_glucose': 'first','std_oxygen': 'first',
                         'capital-gain': 'first','skewness_oxygen': 'first','kurtosis_glucose': 'first','personal_info': 'first',
                         'education': 'first','fnlwgt': 'first','class': 'first','std_glucose': 'first',
                         'income': 'first','mean_oxygen': 'first','hours-per-week': 'first','capital-loss': 'first' ,'employment': 'first' 
                         ,'country': 'first' ,'relationship_info': 'first' ,'employment_info': 'first' ,'race': 'first' }

data_other = data_other.groupby(data_other['name']).aggregate(aggregation_functions)

In [416]:
data_duplicates = data_other[data_other.duplicated(subset='name', keep='first')]
print(data_duplicates)

Empty DataFrame
Columns: [name, address, kurtosis_oxygen, pregnant, education-num, relationship, skewness_glucose, mean_glucose, std_oxygen, capital-gain, skewness_oxygen, kurtosis_glucose, personal_info, education, fnlwgt, class, std_glucose, income, mean_oxygen, hours-per-week, capital-loss, employment, country, relationship_info, employment_info, race]
Index: []

[0 rows x 26 columns]


In [417]:
data_personal = pd.read_csv("data/personal_train.csv",sep=",")

In [418]:
data_personal.head()

Unnamed: 0.1,Unnamed: 0,name,address,age,sex,date_of_birth
0,0,Roscoe Bohannon,"7183 Osborne Ways Apt. 651\r\nEast Andrew, OH ...",59.0,Male,1960-07-04
1,1,Ernest Kline,"391 Ball Road Suite 961\r\nFlowersborough, IN ...",47.0,Female,1972-07-20
2,2,Harold Hendriks,"8702 Vincent Square\r\nNew Jerryfurt, CO 30614",59.0,Male,1960-02-28
3,3,Randy Baptiste,"2751 Harris Crossroad\r\nWest Ashley, CA 30311",51.0,Female,1967-12-04
4,4,Anthony Colucci,"904 Robert Cliffs Suite 186\r\nWest Kyle, CO 7...",,Female,1938-04-22


In [419]:
data_personal = data_personal.drop(data_personal.columns[0],axis=1)

In [420]:
data_personal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3933 entries, 0 to 3932
Data columns (total 5 columns):
name             3933 non-null object
address          3933 non-null object
age              3343 non-null object
sex              3933 non-null object
date_of_birth    3933 non-null object
dtypes: object(5)
memory usage: 153.8+ KB


In [421]:
duplicateRowsDF = data_personal[data_personal.duplicated()]
print(duplicateRowsDF)

Empty DataFrame
Columns: [name, address, age, sex, date_of_birth]
Index: []


In [422]:
def reload_data():
    """Reloads the data from CSV file"""
    return pd.read_csv("data/personal_train.csv",sep=",",index_col=0)
    
data_personal = reload_data()


In [423]:
data_personal.index.name = None
data_other.index.name = None

data = pd.merge(data_personal,data_other ,left_on=['name','address'], right_on=['name','address'], how = 'outer')


Upravenie  dátumu na jednotný formát  
Vidíme, podla výpisu nižšie že dátumy majú rôzne formáty

In [424]:
data.date_of_birth.head(50)

0              1960-07-04
1              1972-07-20
2              1960-02-28
3              1967-12-04
4              1938-04-22
5              02/01/1969
6              1967-04-30
7     1957-03-08 00:00:00
8              1980-04-06
9              1986-08-26
10               65-09-15
11    1962-09-17 00:00:00
12             1926-01-19
13             1961-03-23
14               63-05-18
15    1967-03-31 00 00 00
16             1971-12-23
17             1998-05-23
18             1974-04-08
19             1978-10-24
20             1971-03-30
21             1975-09-28
22             1998-01-14
23    1973-11-10 00:00:00
24             1960-08-06
25             1973-04-01
26             1982-08-12
27             1960-07-29
28             1994-11-08
29             1966-06-04
30             1953-07-18
31             1955-12-22
32             1969-05-10
33    1958-04-18 00 00 00
34             1953-07-26
35             1962/12/01
36             1976-11-20
37             1955-12-26
38          

In [425]:
def uprav_datum(data,stlpec,):
    data_date = data.copy()
  
    data_date[stlpec] = data_date[stlpec].apply(str)
    
  #  data_date.loc[~data_date[stlpec].isnull(),stlpec] = data_date.loc[~data_date[stlpec].isnull(),stlpec].str[0:10]
    new_date = []
  
    for i in data_date[stlpec]:
        new_i = ""
        if (str(i).find('n')!=-1):
            new_date.append(i)
            continue
        else:
            if (i[2] == '/' ):
                i = i.replace('/', '-') 
                new_date.append(i)
                continue
            if (i[4] =='/'):
                i = i.replace('/', '-')    
            if (i[2] == '-'):
                new_i = i[6]+i[7]+i[2]+i[3]+i[4]+i[2]+'19'+i[0]+i[1] 
            if (i[4] == '-'):
                new_i = i[8]+i[9]+i[4]+i[5]+i[6]+i[4]+i[0]+i[1]+i[2]+i[3]
            new_date.append(new_i)
    
    data_date[stlpec] = new_date
 
    return new_date

In [426]:
data['date_of_birth']= uprav_datum(data, 'date_of_birth')
data['date_of_birth'].isnull().sum()

0

Vidíme že dátum má už jednotný formát 

In [427]:
data.date_of_birth.head(50)

0     04-07-1960
1     20-07-1972
2     28-02-1960
3     04-12-1967
4     22-04-1938
5     02-01-1969
6     30-04-1967
7     08-03-1957
8     06-04-1980
9     26-08-1986
10    15-09-1965
11    17-09-1962
12    19-01-1926
13    23-03-1961
14    18-05-1963
15    31-03-1967
16    23-12-1971
17    23-05-1998
18    08-04-1974
19    24-10-1978
20    30-03-1971
21    28-09-1975
22    14-01-1998
23    10-11-1973
24    06-08-1960
25    01-04-1973
26    12-08-1982
27    29-07-1960
28    08-11-1994
29    04-06-1966
30    18-07-1953
31    22-12-1955
32    10-05-1969
33    18-04-1958
34    26-07-1953
35    01-12-1962
36    20-11-1976
37    26-12-1955
38    20-08-1945
39    14-11-1961
40    02-02-1964
41    13-02-1961
42    15-08-1974
43    27-06-1978
44    10-08-1959
45    19-01-1971
46    01-01-1972
47    12-08-1983
48    13-11-1984
49    10-01-1971
Name: date_of_birth, dtype: object

# 2 Časť

Dplnenie chýbajúcich hodnôt do stĺpca s vekom.
Na prázdne hodnoty s

In [428]:
data.age

0        59
1        47
2        59
3        51
4       NaN
       ... 
3928     50
3929     52
3930     58
3931    NaN
3932     60
Name: age, Length: 3933, dtype: object

In [429]:
def change(age):
    empty = ['?','??','nan\t','\r\r\n??','nan\r\r\n','??\t',' nan','\r\r\nnan', '\t??','nan ','??\r\r\n','\tnan',' ??',None,'?? ', -1 ]
    if age in empty:
           age = None   
    return age

In [430]:
df = data.age.apply(change)

In [431]:
df.unique()

array(['59', '47', '51', nan, '50', '52', '62', '39', '33', '57', '58',
       None, '45', '40', '48', '44', '21', '46', '37', '24', '66', '63',
       '42', '74', '55', '41', '60', '36', '34', '53', '77', '38', '69',
       '43', '54', '29', '61', '73', '64', '31', '71', '65', '49', '32',
       '70', '27', '68', '9', '67', '56', '81', '35', '78', '25', '30',
       '26', '75', '-1', '23', '82', '79', '72', '16', '18', '76', '99',
       '22', '28', '17', '20', '80', '7', '86', '19', '83', '84', '85',
       '90', '3', '12', '87', '97', '14'], dtype=object)

In [432]:
# df = pd.to_numeric(df)
df = df.fillna(0)

In [433]:
df = df.astype(int)
df

0       59
1       47
2       59
3       51
4        0
        ..
3928    50
3929    52
3930    58
3931     0
3932    60
Name: age, Length: 3933, dtype: int32

In [434]:
data['age'].describe()

count     3343
unique      94
top         53
freq       120
Name: age, dtype: object

Doplnenie veku podla mean - m;6u nasta5 nezhody s d8tumom narodenia

In [435]:
data.age = df
# 
# mean_age = data['age'].mean()
# mean_age = int(mean_age)
# mean_age

In [436]:
# data.info()
# data.loc[data.age <= 0, 'age'] = mean_age

Najdeme najfrekventovanejsi rok po scitani veku a roku narodenia a ten vyuzijeme na dopocitanie veku osob ktore ho nemaju 

In [437]:
def most_frequent(List): 
    num = List[0] 
    for i in List: 
        num = i 
    return num 
  

In [438]:
def najdi_rok(data):
    valid_date_data = []
    person_date = data['date_of_birth'].str[6:10]
    person_age = data['age']
    person_date  =  person_date .astype(int)
    temp = person_age+person_date
    valid_date_data.append(temp)
    return most_frequent(valid_date_data)




In [439]:
valid = najdi_rok(data)

In [440]:
temp = pd.DataFrame(valid)
temp.mode()

Unnamed: 0,0
0,2019


In [441]:
validny_rok=temp.iloc[0][0]
validny_rok

2019

In [442]:
def dopocitanie_vek(data):
        #person_age = data['age']
        person_date = data['date_of_birth'].str[6:10]
        person_date  =  person_date.astype(int)
       # if person_age <=0:
        print('bla')
        temp = validny_rok - person_date
        print(temp)
        data.loc[data.age <= 0, 'age'] = temp
        return data


In [443]:
data = dopocitanie_vek(data)

bla
0       59
1       47
2       59
3       52
4       81
        ..
3928    50
3929    53
3930    59
3931    68
3932    60
Name: date_of_birth, Length: 3933, dtype: int32


In [444]:
data.age.head()

0    59
1    47
2    59
3    51
4    81
Name: age, dtype: int32

In [445]:
data



Unnamed: 0,name,address,age,sex,date_of_birth,kurtosis_oxygen,pregnant,education-num,relationship,skewness_glucose,...,std_glucose,income,mean_oxygen,hours-per-week,capital-loss,employment,country,relationship_info,employment_info,race
0,Roscoe Bohannon,"7183 Osborne Ways Apt. 651\r\nEast Andrew, OH ...",59,Male,04-07-1960,5.190414,False,9.0,Not-in-family,-0.043867,...,48.037897,<=50K,5.914716,40.0,0.0,Transport-moving,United-States,Divorced,Private,White
1,Ernest Kline,"391 Ball Road Suite 961\r\nFlowersborough, IN ...",47,Female,20-07-1972,10.079446,False,9.0,Own-child,0.923553,...,42.581355,<=50K,2.192308,45.0,0.0,Adm-clerical,United-States,Divorced,Private,White
2,Harold Hendriks,"8702 Vincent Square\r\nNew Jerryfurt, CO 30614",59,Male,28-02-1960,9.967118,False,13.0,Husband,0.060398,...,46.838379,>50K,2.127090,40.0,0.0,Prof-specialty,United-States,Married-civ-spouse,Private,White
3,Randy Baptiste,"2751 Harris Crossroad\r\nWest Ashley, CA 30311",51,Female,04-12-1967,12.175754,False,9.0,Wife,-0.583193,...,5297.316538,>50K,1.627090,40.0,0.0,Prof-specialty,United-States,Married-civ-spouse,State-gov,White
4,Anthony Colucci,"904 Robert Cliffs Suite 186\r\nWest Kyle, CO 7...",81,Female,22-04-1938,0.942381,False,10.0,Not-in-family,3.656040,...,44.538595,<=50K,64.813545,16.0,0.0,Tech-support,United-States,Never-married,Private,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3928,Clair Pinkleton,"8582 Lee Roads\r\nCostaburgh, SC 21897",50,Male,27-08-1969,14.217960,False,13.0,Not-in-family,2.375737,...,36.946458,<=50K,1.307692,40.0,0.0,Craft_repair,Dominican-Republic,Married-spouse-absent,Private,Black
3929,William Pope,68430 Erik Terrace Suite 279\r\nPort Adamborou...,52,Female,21-11-1966,14.629993,False,9.0,Wife,0.158678,...,46.378884,<=50K,1.239130,20.0,0.0,,United-States,Married-civ-spouse,,White
3930,Jared Kinsey,Unit 8416 Box 3801\r\nDPO AE 31166,58,Female,29-10-1960,8.715394,False,7.0,Unmarried,-0.475804,...,50.472706,<=50K,2.816890,40.0,0.0,Machine-op-inspct,United-States,Divorced,Private,White
3931,Frederick Lawley,"34420 Brian Stream\r\nLake Michaeltown, MS 01832",68,Female,14-02-1951,-1.283502,False,12.0,Not-in-family,7.695857,...,43.649443,<=50K,178.700669,45.0,0.0,Adm-clerical,United-States,Never-married,Private,White
