In [284]:
import warnings
warnings.filterwarnings('ignore')
%run "KNN_own_implementation.ipynb" 
import pandas as pd
import numpy as np
import numdifftools as nd

#### Get Train and Test data

In [33]:
# The UCI Adult files carry no header row, so the column names are supplied here once
# and shared by both splits.
adult_columns = ["age", "workclass", "fnlwgt", "education", "education-num", "martial-status", "occupation", "relationship",
                 "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "class"]
adult_base_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'

df_train = pd.read_csv(adult_base_url + 'adult.data', sep=',', header=None)
# skiprows=1 skips the first line of adult.test before parsing
df_test = pd.read_csv(adult_base_url + 'adult.test', sep=',', header=None, skiprows=1)

df_train.columns = list(adult_columns)
df_test.columns = list(adult_columns)

Convert workclass into these categories:
1. Private
2. self-employed = {“Self-emp-not-inc”, “Self-emp-inc”}
3. government-employed = {“Federal-gov”, “Local-gov”, “State-gov”}
4. unemployed = {“Without-pay”, “Never-worked”}

Convert education into these categories:
1. higher-education = {“Doctorate”, “Masters”, “Bachelors”}
2. high-school = {“HS-grad”, “Some-college”, “Prof-school”, “Assoc-acdm”, “Assoc-voc”}
3. lower-education = {all the Rest}

Add column 'sex'

Add all continuous columns.

Drop all the Rest

In [35]:
def categorize_workclass(x):
    """Map a raw ``workclass`` string to its coarse category.

    Parameters
    ----------
    x : str
        Raw workclass value; surrounding whitespace is stripped first.

    Returns
    -------
    str
        'self-employed', 'government-employed' or 'unemployed' when the
        stripped value belongs to the corresponding group; otherwise the
        stripped value itself (e.g. 'Private' or '?').
    """
    groups = {
        'self-employed': ("Self-emp-not-inc", "Self-emp-inc"),
        'government-employed': ("Federal-gov", "Local-gov", "State-gov"),
        'unemployed': ("Without-pay", "Never-worked"),
    }
    x = x.strip()

    for category, members in groups.items():
        if x in members:
            return category

    # Not part of any group ('Private', '?', ...): pass the stripped value through.
    return x

In [36]:
def categorize_education(x):
    """Collapse a raw ``education`` string into one of three levels.

    The value is stripped of surrounding whitespace and then mapped to
    'higher_education', 'high-school' or, for everything else,
    'lower_education'.
    """
    level = x.strip()

    if level in ("Doctorate", "Masters", "Bachelors"):
        return 'higher_education'
    if level in ("HS-grad", "Some-college", "Prof-school", "Assoc-acdm", "Assoc-voc"):
        return 'high-school'
    # Anything not listed above counts as lower education.
    return 'lower_education'

In [37]:
def encode_class(x):
    """Encode the income label as an integer.

    Returns 0 when the stripped label is '<=50K' (with or without a trailing
    dot) and 1 for any other value.
    """
    label = x.strip()
    return 0 if label in ('<=50K', '<=50K.') else 1

In [38]:
# Replace the raw workclass / education values by their coarse categories in both splits.
df_train['workclass'] = df_train['workclass'].apply(categorize_workclass)
df_train['education'] = df_train['education'].apply(categorize_education)
df_test['workclass'] = df_test['workclass'].apply(categorize_workclass)
df_test['education'] = df_test['education'].apply(categorize_education)

In [39]:
# Discard the rows whose workclass is the '?' placeholder.
df_train = df_train.loc[df_train['workclass'] != '?']
df_test = df_test.loc[df_test['workclass'] != '?']

In [40]:
# Remove every row that still contains a missing (NaN) value.
df_train, df_test = df_train.dropna(), df_test.dropna()

In [41]:
# Split off the target column first, then keep only the model's input features:
# the six continuous columns plus workclass, education and sex.
feature_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
                   'hours-per-week', 'workclass', 'education', 'sex']

y_train, y_test = df_train['class'], df_test['class']

df_train = df_train[feature_columns]
df_test = df_test[feature_columns]

#### Encode categorical input with dummies

In [42]:
# Make use of pd.get_dummies --> it one-hot encodes every categorical column
# (workclass, education, sex). With the categories defined above, the 9 selected
# columns become 15 (6 continuous + 4 workclass + 3 education + 2 sex).
df_train = pd.get_dummies(df_train)
# The test split is encoded on its own, so a category that is absent from it would
# yield fewer (or differently ordered) columns. Reindex onto the training columns:
# missing dummies are filled with 0, columns unknown to the training set are dropped.
df_test = pd.get_dummies(df_test).reindex(columns=df_train.columns, fill_value=0)

In [43]:
# Turn the income strings into 0/1 labels.
y_train = y_train.apply(encode_class)
y_test = y_test.apply(encode_class)

In [44]:
# Pull the underlying NumPy arrays out of the pandas objects.
input_space, input_labels = df_train.values, y_train.values
X_test, test_labels = df_test.values, y_test.values

In [283]:
# standardize: the scaler is fitted on the training features only and then
# applied to both the training and the test features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(input_space)
input_space, X_test = sc.transform(input_space), sc.transform(X_test)

##### Test with knn

In [46]:
 # knn(input_space=input_space, input_labels=input_labels, X_test=X_test, k=3) --> that is some lazy boy!

### Logistic regression

In [47]:
# Sigmoid function to map results between 0-1
def sigmoid(result):
    """Numerically stable logistic function 1 / (1 + exp(-result)).

    Parameters
    ----------
    result : float or array-like
        Linear predictor value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid; a scalar for scalar input, otherwise an array
        of the same shape as the input.
    """
    z = np.asarray(result, dtype=float)
    # exp(-|z|) lies in (0, 1], so it can never overflow. The naive form
    # exp(z) / (1 + exp(z)) evaluates to inf / inf = NaN for large positive z.
    decay = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where yields a 0-d array for scalar input; unwrap it to a NumPy scalar.
    return out if out.ndim else out[()]

In [48]:
def get_log_likelihood(y_target, beta, input_space):
    """Log-likelihood of a logistic regression model.

    Formula: sum_i ( y_i * (x_i . beta) - ln(1 + exp(x_i . beta)) )

    Parameters
    ----------
    y_target : array-like
        Labels, one per row of ``input_space``.
    beta : array-like
        Coefficient vector, one entry per column of ``input_space``.
    input_space : array-like
        Design matrix. No column of ones is added here, so it must already
        contain one if ``beta`` includes an intercept.

    Returns
    -------
    float
        The summed log-likelihood.
    """
    linear_predictor = np.dot(input_space, beta)
    # logaddexp(0, z) == ln(1 + exp(z)) but stays finite for large z,
    # where exp(z) alone would overflow to inf.
    ll = np.sum(y_target * linear_predictor - np.logaddexp(0, linear_predictor))
    return ll

In [279]:
def hessian(input_space, beta):
    """Hessian of the logistic log-likelihood with respect to beta.

    With p_i = sigmoid(x_i . beta) the second derivative of the log-likelihood
    l(beta) is

        H(beta) = d^2 l / (dbeta dbeta^T) = - X^T W X,   W = diag(p_i * (1 - p_i))

    Parameters
    ----------
    input_space : array-like, shape (n, d)
        Design matrix X (already padded with a column of ones if an intercept
        is part of ``beta``).
    beta : array-like, shape (d,)
        Current coefficient vector.

    Returns
    -------
    numpy.ndarray, shape (d, d)
        The (negative semi-definite) Hessian matrix.
    """
    X = np.asarray(input_space, dtype=float)
    linear_predictor = np.dot(X, beta)
    # p * (1 - p) == (1 - tanh(z / 2)^2) / 4; tanh saturates at +-1 instead of
    # overflowing the way exp(z) does for large |z|.
    weights = 0.25 * (1.0 - np.tanh(0.5 * linear_predictor) ** 2)
    # X.T * weights scales column i of X.T (i.e. sample i) by its weight,
    # which equals X^T W without building the n x n diagonal matrix.
    return -np.dot(X.T * weights, X)

In [277]:
def get_coefficients(input_space, y_target, steps, gamma,  beta_init_multiplier=0, method='gradient'):
    """Fit logistic-regression coefficients by maximising the log-likelihood.

    Parameters
    ----------
    input_space : numpy.ndarray, shape (n, d)
        Feature matrix WITHOUT intercept column; a column of ones is
        prepended internally.
    y_target : numpy.ndarray, shape (n,)
        Labels.
    steps : int
        Number of update iterations.
    gamma : float
        Step size for the 'gradient' method (not used by 'newton-raphson').
    beta_init_multiplier : float, default 0
        Every entry of the initial beta vector is set to this value.
    method : {'gradient', 'newton-raphson'}
        Update rule. 'newton-raphson' relies on ``hessian()`` returning the
        (d+1) x (d+1) second-derivative matrix of the log-likelihood.

    Returns
    -------
    numpy.ndarray, shape (d + 1,)
        Fitted coefficients; the first entry is the intercept.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported update rules.
    """
    # Validate before doing any work. The previous `raise('...')` raised a
    # TypeError instead, because a str is not an exception.
    if method not in ('gradient', 'newton-raphson'):
        raise ValueError('Please specify a correct update rule for beta')

    # pad input space with 1
    input_space = np.hstack((np.ones((input_space.shape[0], 1)), input_space))

    # initialize beta-vector with values
    beta = np.ones(input_space.shape[1]) * beta_init_multiplier

    for _ in range(steps):
        continouus_prediction = np.dot(input_space, beta)  # because Y = XB (B = beta)
        scaled_prediction = sigmoid(continouus_prediction)

        error = y_target - scaled_prediction

        # gradient of the log-likelihood: X^T (y - p)
        gradient = np.dot(input_space.T, error)

        if method == 'gradient':
            # gradient ascent on the log-likelihood
            beta = beta + gamma * gradient
        else:
            # Newton-Raphson: beta <- beta - H^{-1} * gradient.
            # The pseudo-inverse is used because H is singular whenever columns
            # are collinear (e.g. a full set of one-hot dummies plus intercept),
            # in which case a plain inverse does not exist.
            hess = hessian(input_space, beta)
            beta = beta - np.dot(np.linalg.pinv(hess), gradient)

    # print('Log-Likelihood: {}'.format(get_log_likelihood(y_target, beta, input_space)))

    return beta
        

In [280]:
# Fit by gradient ascent, starting from an all-zero beta vector.
beta = get_coefficients(
    input_space,
    y_target=input_labels,
    steps=50000,
    gamma=5e-5,
    beta_init_multiplier=0,
    method='gradient',
)

In [285]:
print(beta)

[-1.37181441  0.58388794  0.04567079  0.82235614  2.38944114  0.27830111
  0.3953425   0.01596662  0.02938879 -0.03254329 -0.25725924  0.0078903
  0.0181743  -0.03485601 -0.28269167  0.28269167]


##### Check result against sklearn

In [282]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularization strength; a huge value effectively switches
# the L2 penalty off, so the result is comparable to the unregularized fit.
clf = LogisticRegression(fit_intercept=True, C=1e15).fit(input_space, input_labels)
print(clf.intercept_, clf.coef_)


[-1.36981997] [[ 0.58388985  0.04567042  0.82236197  2.38952113  0.27830123  0.39534463
   0.01389009  0.02774283 -0.03407509 -0.18141268  0.00788978  0.0181702
  -0.03484997 -0.28269125  0.28269125]]


##### Predict test-data with different beta-initializations and compute average accuracy 

In [286]:
# pad test-data with a leading column of ones for the intercept.
# input_space is only padded inside get_coefficients, so equal column counts mean
# X_test has not been padded yet; the guard keeps this cell idempotent on re-runs.
if X_test.shape[1] == input_space.shape[1]:
    X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

In [481]:
accuracies = []

# Refit with ten different starting values for beta (0.0, 0.1, ..., 0.9)
# and record the test accuracy of each fit.
for init_value in np.arange(0, 1, 0.1):
    beta = get_coefficients(input_space, input_labels, steps=50000, gamma=5e-5, beta_init_multiplier=init_value)

    # predicted probability, rounded to a hard 0/1 label
    y_predict = np.round(sigmoid(np.dot(X_test, beta)))
    # |label - prediction| is 1 for every misclassified sample, 0 otherwise
    num_wrong = sum(abs(test_labels - y_predict))
    accuracy = (len(test_labels) - num_wrong) / len(test_labels)
    accuracies.append(accuracy)

print("Average accuracy: {}".format(sum(accuracies)/float(len(accuracies))))

Average accuracy: 0.8177307742525135


In [287]:
# TODO: Newton-Raphson optimization -- correct it according to the lecture