## General imports

In [77]:
import pandas as pd
import numpy as np

## Load Adult data reconstructed

In [78]:
adultData = pd.read_csv('Datasets/adult_reconstruction.csv')

In [79]:
adultData

Unnamed: 0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation
0,20,40,0,0,Private,Bachelors,13,Married-civ-spouse,Wife,White,Female,United-States,49100,Tech-support
1,40,21,0,0,Private,Some-college,10,Divorced,Own-child,White,Male,United-States,11500,Craft-repair
2,10,17,0,0,Private,11th,7,Never-married,Own-child,White,Male,United-States,2600,Other-service
3,50,51,0,0,Private,HS-grad,9,Married-civ-spouse,Husband,Asian-Pac-Islander,Male,Cambodia,38997,Sales
4,38,28,0,0,Private,Bachelors,13,Never-married,Not-in-family,White,Male,?,41400,Exec-managerial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49526,65,35,0,0,Private,Bachelors,13,Married-civ-spouse,Husband,White,Male,Yugoslavia,85080,Farming-fishing
49527,77,37,3137,0,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Husband,Asian-Pac-Islander,Male,Vietnam,34137,Sales
49528,55,24,0,0,Private,Assoc-voc,11,Never-married,Not-in-family,White,Male,United-States,13016,Sales
49529,40,24,0,0,Private,Some-college,10,Never-married,Not-in-family,White,Female,United-States,15000,Adm-clerical


## Detecting Missing Values

In [80]:
adultData.isin(['?']).sum(axis=0)

hours-per-week       0
age                  0
capital-gain         0
capital-loss         0
workclass         2859
education            0
education-num        0
marital-status       0
relationship         0
race                 0
gender               0
native-country     859
income               0
occupation        2869
dtype: int64

## Cleaning missing data

In [81]:
adultData['native-country'].replace('?', np.nan, inplace=True)
adultData['workclass'].replace('?', np.nan, inplace=True )
adultData['occupation'].replace('?', np.nan, inplace=True )

#dropping the NaN rows now 
adultData.dropna(how='any', inplace=True)
adultData.reset_index(inplace=True, drop=True)
adultData

Unnamed: 0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation
0,20,40,0,0,Private,Bachelors,13,Married-civ-spouse,Wife,White,Female,United-States,49100,Tech-support
1,40,21,0,0,Private,Some-college,10,Divorced,Own-child,White,Male,United-States,11500,Craft-repair
2,10,17,0,0,Private,11th,7,Never-married,Own-child,White,Male,United-States,2600,Other-service
3,50,51,0,0,Private,HS-grad,9,Married-civ-spouse,Husband,Asian-Pac-Islander,Male,Cambodia,38997,Sales
4,38,26,0,1876,Private,Bachelors,13,Never-married,Not-in-family,White,Male,United-States,38524,Exec-managerial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45844,65,35,0,0,Private,Bachelors,13,Married-civ-spouse,Husband,White,Male,Yugoslavia,85080,Farming-fishing
45845,77,37,3137,0,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Husband,Asian-Pac-Islander,Male,Vietnam,34137,Sales
45846,55,24,0,0,Private,Assoc-voc,11,Never-married,Not-in-family,White,Male,United-States,13016,Sales
45847,40,24,0,0,Private,Some-college,10,Never-married,Not-in-family,White,Female,United-States,15000,Adm-clerical


In [82]:
#Renaming some attributes including (hours.per.week, education.num, marital.status and native.country)
adultData.rename(columns={"education-num": "education_num"},inplace=True)
adultData.rename(columns={"hours-per-week": "hoursperweek"},inplace=True)
adultData.rename(columns={"marital-status": "maritalstatus"},inplace=True)
adultData.rename(columns={"native-country": "nativecountry"},inplace=True)

In [83]:
columns_to_keep = ['age', 'workclass', 'education_num', 'maritalstatus', 'nativecountry', 'hoursperweek', 'gender', 'race', 'income']
adultData = adultData[columns_to_keep]

In [84]:
# Combine into 'Married' ('Married-civ-spouse', 'Married-spouse-absent','Married-AF-spouse')
adultData.replace(to_replace =["Married-civ-spouse","Married-spouse-absent","Married-AF-spouse"], value ="Married", inplace=True)
# Combine into 'Not_Married' ('Never-marrie', 'Singl','Divorced','Widowed','Separated')
adultData.replace(to_replace =["Never-married","Single","Divorced","Widowed","Separated"], value ="Not_Married", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [86]:
# binarizing the attributes  
adultData['maritalstatus' ] = adultData['maritalstatus' ].apply( lambda x: 1 if x=='Married' else 0 )
adultData[ 'nativecountry' ] = adultData[ 'nativecountry' ].apply( lambda x: 1 if x=='United-States' else 0 )
adultData ['race'] = adultData ['race'].apply( lambda x: 1 if x == 'White' else 0)
adultData ['gender'] = adultData ['gender'].apply( lambda x: 1 if x == 'Male' else 0)
adultData ['age'] = adultData ['age'].apply( lambda x: 1 if x >= adultData['age'].median() else 0)
adultData ['hoursperweek'] = adultData ['hoursperweek'].apply( lambda x: 1 if x > adultData ['hoursperweek'].median() else 0)
adultData ['education_num'] = adultData ['education_num'].apply( lambda x: 1 if x >= 9  else 0)
adultData['workclass' ] = adultData['workclass' ].apply( lambda x: 1 if x=='Private' else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

## Binarizing the target and generating the different datasets

In [89]:
med = adultData ['income'].median()
adultCopy = adultData.copy()
for th in [10000,int(med),50000]:
    adultData['income'] = adultData['income'].apply(lambda x: 1 if x > th else 0) #float(th) , axis = 1) #label_cutoffs(x,i), axis=1)    
    adultData.to_csv('Datasets/adult_'+ str(th) +'.csv', index=False) 
    adultData ['income'] = adultCopy['income']    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
