In [23]:
# load dataset
import pandas as pd
import numpy as np
def make_dataset():
    df = pd.read_csv("../data/adult.csv")
    df[df == "?"] = np.nan
    for col in ["workclass", "occupation", "native.country"]:
        df.fillna(df[col].mode(0)[0], inplace=True)
    X = df.drop(["income"], axis=1)
    y = df["income"]
    return X, y

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
def my_train_test_split():
    X, y = make_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
    for feature in categorical:
        le = preprocessing.LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_test[feature] = le.fit_transform(X_test[feature])

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X.columns)

    return X_train, X_test, y_train, y_test

def label_encoder():
    X, y = make_dataset()
    categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
    for feature in categorical:
        le = preprocessing.LabelEncoder()
        X[feature] = le.fit_transform(X[feature])

    # scaler = StandardScaler()
    # X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    return X, y

In [24]:
X, y = label_encoder()
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,3,77053,11,9,6,9,1,4,0,0,4356,40,39
1,82,3,132870,11,9,6,3,1,4,0,0,4356,18,39
2,66,3,186061,15,10,6,9,4,2,0,0,4356,40,39
3,54,3,140359,5,4,0,6,4,4,0,0,3900,40,39
4,41,3,264663,15,10,5,10,3,4,0,0,3900,40,39


In [25]:
X, y = make_dataset()
X.sex.value_counts()

sex
Male      21790
Female    10771
Name: count, dtype: int64

In [26]:
print("Original features:\n", list(X.columns), "\n")
data_dummies = pd.get_dummies(X)
print("Features after get_dummies:\n", list(data_dummies.columns))

Original features:
 ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'] 

Features after get_dummies:
 ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Private', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'education_10th', 'education_11th', 'education_12th', 'education_1st-4th', 'education_5th-6th', 'education_7th-8th', 'education_9th', 'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors', 'education_Doctorate', 'education_HS-grad', 'education_Masters', 'education_Preschool', 'education_Prof-school', 'education_Some-college', 'marital.status_Divorced', 'marital.status_Married-AF-spouse', 'marital.status_Married-civ-spouse', 'marital.status_Married-spouse

In [27]:
X = data_dummies.values
X, X.shape

(array([[90, 77053, 9, ..., True, False, False],
        [82, 132870, 9, ..., True, False, False],
        [66, 186061, 10, ..., True, False, False],
        ...,
        [40, 154374, 9, ..., True, False, False],
        [58, 151910, 9, ..., True, False, False],
        [22, 201490, 9, ..., True, False, False]], dtype=object),
 (32561, 107))

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Test score: {:.2f}".format(logreg.score(X_test, y_test)))

Test score: 0.80
