In [20]:
import numpy as np
import pandas as pd

# Location of the raw "adult.data" file on the UCI archive server
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Labels applied to the columns, in file order; 'income' is the prediction target
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# header=None is what pandas already assumes when names= is given; it is spelled out
# here for the reader. skipinitialspace=True drops the blank that follows each comma.
dataset = pd.read_csv(url, header=None, names=column_names, skipinitialspace=True)

dataset.head()

URLError: <urlopen error [Errno 111] Connection refused>

In [None]:
# Class balance of the target: how many rows carry each income label
print(dataset.loc[:, 'income'].value_counts())

In [None]:
# Count the missing values in each column, most affected column first.
# NaN alone is not a sufficient test here: per the UCI dataset description the Adult
# files mark unknown entries with '?', which isnull() reports as "not missing".
# Both markers are counted; the dataset itself is left unchanged.
(dataset.isna() | dataset.isin(['?'])).sum().sort_values(ascending=False)

In [None]:
# Check the data type of each column; the next cell inspects the columns whose
# dtype is 'object' to see how many categories they hold.
dataset.dtypes

In [None]:
# Many columns are 'object' dtype and have to be converted to numbers later.
# As a first step, report how many distinct values each object column contains.

for col in dataset.columns:
    # Skip anything that is not an object column
    if dataset[col].dtype != 'object':
        continue
    # dropna=False counts NaN as its own value, matching len(Series.unique())
    num_of_categories = dataset[col].nunique(dropna=False)
    print("{col} has {num_of_categories} categories".format(col=col, num_of_categories=num_of_categories))

# Depending on the dataset, a line or two more could fold every category whose
# frequency is below, say, 35% of that column's maximum frequency into 'other'.

In [None]:
# Break the data set up into features (X) and target (y).
# drop(columns=...) replaces the positional form drop('income', 1): passing the axis
# positionally was deprecated in pandas 1.3 and is rejected by pandas 2.0.
X = dataset.drop(columns='income')
y = dataset['income']

# Convert the income labels to binary: 0 for '<=50K', 1 for '>50K'.
# Alternatives: pd.get_dummies, or the one-liner
#   y = [0 if x == '<=50K' else 1 for x in y]
y = y.map({'<=50K': 0, '>50K': 1})

# Series.map turns every label that is absent from the dict into NaN. Stop here with
# a readable message rather than passing NaN targets on to the models.
unmapped = dataset.loc[y.isna(), 'income'].unique()
assert len(unmapped) == 0, "income labels without a 0/1 mapping: {}".format(list(unmapped))

In [None]:
# Frequency of each education level, most common first
print(
    X['education']
    .value_counts()
    .sort_values(ascending=False)
)

In [None]:
# Now, to work on X.
# The number of categories per feature is known; next look at how the rows are
# distributed across the values of one feature. Shown here: the ten most frequent
# native-country values.

print(
    X['native-country']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)


In [None]:
'''
#This will loop through the columns and set categories with frequencies below a threshold to 'Other'
#But this is not what I want...I want only native_country changed
threshold = 50  # Remove items less than or equal to threshold
for col in X.columns:
    freq = X[col].value_counts()
    vals_to_remove = freq[freq <= threshold].index.values
    X[col].loc[X[col].isin(vals_to_remove)] = 'Other'
    

#TODO: Try to set threshold to x% of max frequency instead of a rigid number. Line below is not working
#threshold = 0.35*(freq.iloc[1].max)

for col in X.columns:
    freq = X[col].value_counts()
    threshold = 100
    to_change = freq[freq <= threshold].index
    X[col].replace(to_change, np.nan, inplace=True)
'''

In [None]:
# For now, keep 'United-States' and fold every other native-country value into 'Other'.
# The kept label is written exactly as it appears in the data (no trailing space).
# A label with a trailing space would leak into the dummy column name later on, and
# re-running this cell would then turn every row into 'Other' because the padded
# label no longer equals 'United-States'.
X['native-country'] = X['native-country'].where(X['native-country'] == 'United-States', 'Other')

print(X['native-country'].value_counts().sort_values(ascending=False))

In [None]:
# Preview the first rows of the feature frame
X.head()

In [None]:
#To convert all features with categories into numeric features, create a function using get_dummies

# Names of the feature columns that hold categories rather than numbers
cat_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Convert categorical columns to numeric indicator (dummy) columns
def cat_to_num(dataset, cat_faetures):
    """Return a copy of ``dataset`` with the given columns one-hot encoded.

    Each listed column is removed and replaced by one indicator column per
    distinct value, named ``<column>_<value>``. Untouched columns keep their
    original order and come first; the indicator columns follow, grouped in
    the order the columns were listed. The input frame is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns to encode.
    cat_faetures : iterable of str
        Names of the columns to encode. (The misspelling is kept so existing
        keyword calls keep working.)

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns replaced by indicator columns.

    Raises
    ------
    KeyError
        If a listed column is not present in ``dataset``.
    """
    # Use the argument, not the notebook-level ``cat_features`` list: the
    # function must encode exactly what the caller asked for.
    columns = list(cat_faetures)

    # dummy_na=False: missing values get no indicator column of their own
    indicator_frames = [
        pd.get_dummies(dataset[f], prefix=f, dummy_na=False) for f in columns
    ]

    # columns= keyword instead of the positional axis (drop(f, 1)), which
    # pandas 2.0 no longer accepts
    remaining = dataset.drop(columns=columns)
    return pd.concat([remaining] + indicator_frames, axis=1)


# Replace the categorical columns of X with their indicator columns.
# Note: X is rebound to the encoded frame, so running this cell a second time
# raises KeyError (the raw categorical columns are no longer in X).
X = cat_to_num(X, cat_features)
X.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Overlay per-outcome histograms of a single feature
def plot_histogram(x,y):
    """Draw two semi-transparent histograms of ``x``: rows where ``y == 0`` and rows where ``y == 1``.

    ``x`` must expose a ``name`` attribute, which is used in the plot title.
    The figure is shown immediately; nothing is returned.
    """
    for outcome, legend_text in ((0, 'Outcome=0'), (1, 'Outcome=1')):
        # Boolean mask picks the feature values belonging to this outcome
        values = list(x[y == outcome])
        plt.hist(values, alpha=0.5, label=legend_text)

    heading = "Histogram of '{var_name}' by Outcome Category".format(var_name=x.name)
    plt.title(heading)
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.legend(loc='upper right')
    plt.show()
    
# Compare the age distribution of the two outcome classes (y is the 0/1 income label)
plot_histogram(X['age'], y)

In [None]:

# Spot-check a handful of classifiers with k-fold cross-validation
import pandas
import matplotlib.pyplot as plt
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold and
# cross_val_score now live in sklearn.model_selection.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# Plain arrays for scikit-learn. np.asarray accepts a DataFrame/Series as well as an
# array, so (unlike .values) this cell can be re-run after X and y were converted.
X = np.asarray(X, dtype=float)
y = np.asarray(y)

# prepare configuration for cross validation test harness
num_folds = 10
num_instances = len(X)  # KFold no longer takes the sample count; kept for reference
seed = 1

# One splitter shared by every model, so all scores come from the same folds.
# random_state only has an effect (and is only accepted by current scikit-learn)
# together with shuffle=True.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# prepare models; the estimators with internal randomness are seeded so that a
# re-run reproduces the same scores
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('GBM', GradientBoostingClassifier(random_state=seed)),
]

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    scores = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(scores)

In [None]:
'''
from sklearn.cross_validation import train_test_split

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#Use the model with the best score from above
#Instantiate the model
model = XGBClassifier()

#Parameter Tuning
learning_rate = [0.0001, 0.001, 0.0015, 0.01, 0.015, 0.1]
n_estimators = [100, 200, 300, 400, 500, 600, 700]
subsample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample)
kfold = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=7)
rand_search = RandomizedSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold)
result = rand_search.fit(X, y)

print("Model Report")
print("Best Score: %f based on %s" % (result.best_score_, result.best_params_))

param_grid = [
 {'C': [0, 0.1, 1, 10, 100], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf', 'linear']},
 {'C': [0, 0.1, 1, 10, 100], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['poly'], 'degree': [1, 2, 3, 4, 5]}
]

'''

In [None]:
import time
start_time = time.time()

# sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20;
# their contents now live in sklearn.model_selection.
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Standardize the data. The scaler learns mean/variance from the training set only and
# the same transformation is then applied to the test set. Refitting on the test set
# would scale the two sets differently and let test statistics leak into evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Use the model with the best score from above
# Instantiate the model
estimator = SVC()

# Parameter tuning.
# The linear kernel ignores gamma, so it gets its own grid; crossing it with the gamma
# values would refit identical models three times over.
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.001, 0.0001]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['poly'], 'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.001, 0.0001],
     'degree': [1, 2, 3, 4, 5]},
]

# The splitter is configured without the labels; GridSearchCV passes y_train to it
# when fit() is called.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(estimator, param_grid, scoring="accuracy", cv=kfold, n_jobs=-1)
result = grid_search.fit(X_train, y_train)

print("Model Report")
print("Best Score: %f based on %s" % (result.best_score_, result.best_params_))
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
#Now, use test data to check the accuracy of model with the best parameters from above

