## Problem
Predict whether income exceeds $50K/yr based on census data.

http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names

In [1]:
%config Completer.use_jedi = False
from pandas import read_csv
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np

def convertLabel(x):
    """
    x could be ">50K", "<=50K" or "<=50K." (WTF?)
    """
    return -1 if x.replace(".","").strip() == '<=50K' else 1

columns = [
    "age", "workclass", "fnlwgt", "education", 
    "education_num", "marital_status", "occupation", "relationship",
    "race", "sex", "capital_gain", "capital_loss", "hours_per_week",
    "native_country", "label"
]

train = read_csv("dataset/adult/adult.data.csv", header=None, names=columns, na_values=[" ?"])
test = read_csv("dataset/adult/adult.test.csv", header=None, names=columns, na_values=[" ?"])

# For simplicity, drop any unknown (?) values
train = train.dropna()
test = test.dropna()

# Normalize the label and move it from the dataframe to its own variable
# For some reason, the test set has an extra trailing . in the label
convertLabel = lambda x : 0 if x.strip() == '<=50K' else 1
train['label'] = train['label'].apply(convertLabel)
test['label'] = test['label'].apply(convertLabel)
Y_train = train.pop('label').values
Y_test = test.pop('label').values

In [2]:
print(train.shape)
print(test.shape)

kinds = np.array([dt.kind for dt in train.dtypes])
is_num = kinds != 'O'
all_columns = train.columns.values
num_cols = all_columns[is_num]
cat_cols = all_columns[~is_num]


(30162, 14)
(15060, 14)


## Exploratory Data Analysis

In [3]:
#plt.figure()
#train.age.plot(kind='hist')

In [4]:
print(train.describe())

                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  30162.000000  3.016200e+04   30162.000000  30162.000000  30162.000000   
mean      38.437902  1.897938e+05      10.121312   1092.007858     88.372489   
std       13.134665  1.056530e+05       2.549995   7406.346497    404.298370   
min       17.000000  1.376900e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.176272e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.784250e+05      10.000000      0.000000      0.000000   
75%       47.000000  2.376285e+05      13.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    30162.000000  
mean        40.931238  
std         11.979984  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [5]:
#edData = data.loc[:, ['education', 'education_num']]
#index = edData.groupby(['education']).first()
#print(index)

#edData['education'].value_counts().plot(kind='bar')

#data.plot(subplots=True, kind='bar', layout=(2,-1), figsize=(12,12), sharex=False)
#train['occupation'].value_counts().plot(kind='bar')

## Preprocessing

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Convert numeric columns to float type to avoid warning:
# "DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler."
train = train.astype({c: np.float64 for c in num_cols})
test = test.astype({c: np.float64 for c in num_cols})



############ One hot encoding of categorical columns
cat_ohe_step = ('ohe', OneHotEncoder(sparse=False,
                    handle_unknown='ignore'))
cat_steps = [cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

############ Scaling numeric columns


#num_si_step = ('si', SimpleImputer(strategy='median'))

# Standardization of a dataset is a common requirement for many machine learning estimators: 
# they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance).
num_ss_step = ('ss', StandardScaler())
num_steps = [num_ss_step]
num_pipe = Pipeline(num_steps)

transformers = [
    ('cat', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols)
]
ct = ColumnTransformer(transformers=transformers)

#ohe_features = ct.named_transformers_['cat'].named_steps['ohe'].get_feature_names()
#print(ohe_features)
#print(len(ohe_features))

from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=100, tol=0.001)
ml_pipe = Pipeline([('transform', ct), ('sgd', clf)])
ml_pipe.fit(train, Y_train)
score = ml_pipe.score(test, Y_test)


"""
X_train = ct.fit_transform(train)
X_test = ct.transform(test)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

print(Y_test)
print(knn.predict(X_test))

"""


'\nX_train = ct.fit_transform(train)\nX_test = ct.transform(test)\n\nfrom sklearn.neighbors import KNeighborsClassifier\nknn = KNeighborsClassifier()\nknn.fit(X_train, Y_train)\n\nprint(Y_test)\nprint(knn.predict(X_test))\n\n'

In [9]:
#cat_feature_names = ml_pipe.named_steps['transform'].named_transformers_['cat'].named_steps['ohe'].get_feature_names()
#print(ml_pipe.named_steps['transform'].named_transformers_['cat'].transform(train))


#for c in all_columns:
#    print(c)
#    print(train[c].value_counts().to_dict())
#    print()

"""
x1 = X[0]
ohe_features = ct.named_transformers_['cat'].named_steps['ohe'].get_feature_names()

for i, f in enumerate(ohe_features):
    if x1[i] != 0:
        print("{}={}".format(f.replace(" ", ""), x1[i]))

print(train.iloc[0])
print(Y_train[0])
"""

print(train)

        age          workclass    fnlwgt      education  education_num  \
0      39.0          State-gov   77516.0      Bachelors           13.0   
1      50.0   Self-emp-not-inc   83311.0      Bachelors           13.0   
2      38.0            Private  215646.0        HS-grad            9.0   
3      53.0            Private  234721.0           11th            7.0   
4      28.0            Private  338409.0      Bachelors           13.0   
5      37.0            Private  284582.0        Masters           14.0   
6      49.0            Private  160187.0            9th            5.0   
7      52.0   Self-emp-not-inc  209642.0        HS-grad            9.0   
8      31.0            Private   45781.0        Masters           14.0   
9      42.0            Private  159449.0      Bachelors           13.0   
10     37.0            Private  280464.0   Some-college           10.0   
11     30.0          State-gov  141297.0      Bachelors           13.0   
12     23.0            Private  122272

## TODO
* train-test-validation split
* model selection
* Graph value_counts() in bar charts for all columns

## References

https://medium.com/dunder-data/from-pandas-to-scikit-learn-a-new-exciting-workflow-e88e2271ef62
https://pandas.pydata.org/pandas-docs/stable/dsintro.html
https://scikit-learn.org/stable/modules/sgd.html#sgd