# Amazon Employee Access Challenge

## About The Data

The data consists of real historical data collected from 2010 & 2011.  Employees are manually allowed or denied access to resources over time. We will try to create a logistic regression model capable of learning from this historical data to predict approval/denial for an unseen set of employees.

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse


In [3]:
train_data= pd.read_csv('../../data/original data/amazon-employee-access-challenge/train.csv')
test_data = pd.read_csv('../../data/original data/amazon-employee-access-challenge/test.csv')

In [6]:
def load_data(train_path, test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


In [13]:
def preprocess_data(train_df, test_df):
    y = train_df['ACTION'].values
    X = pd.concat([train_df.drop('ACTION', axis=1), test_df.drop('id', axis=1)], axis=0)
    return X, y, len(train_df)


In [None]:
def encode_features(X):
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    X_encoded = encoder.fit_transform(X)
    return X_encoded, encoder


In [15]:
# Convert X_encoded to a dense array before using it in cross-validation
X_encoded_dense = X_encoded.toarray()

# Check the dimensions to make sure they align
print(f"X_encoded_dense shape: {X_encoded_dense.shape}")
print(f"y shape: {y.shape}")

# Set up cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
aucs = []

# Perform cross-validation manually
for train_idx, val_idx in kf.split(X_encoded_dense):
    # Ensure that the indices are within the correct bounds
    model.fit(X_encoded_dense[train_idx], y[train_idx])
    preds = model.predict_proba(X_encoded_dense[val_idx])[:, 1]
    auc = roc_auc_score(y[val_idx], preds)
    aucs.append(auc)

# Compute the mean AUC
mean_auc = np.mean(aucs)
print('Mean AUC:', mean_auc)


X_encoded_dense shape: (91690, 16961)
y shape: (32769,)


IndexError: index 32769 is out of bounds for axis 0 with size 32769

In [23]:
# Train the model
model.fit(X_encoded[:n_train], y)

# Predict probabilities on the test set
preds = model.predict_proba(X_encoded[n_train:])[:, 1]

# Apply threshold of 0.5 to convert probabilities into class labels
pred_class = (preds > 0.5).astype(int)


In [17]:
# Create a submission DataFrame
submission = pd.DataFrame({
    'id': np.arange(1, len(pred_class) + 1),
    'ACTION': pred_class
})

# Save it to CSV
submission.to_csv('submission.csv', index=False)

print("Saved predictions to submission.csv")

Saved predictions to submission.csv


In [18]:
def main(train_path, test_path, output_path):
    train_df, test_df = load_data(train_path, test_path)
    X_combined, y, num_train = preprocess_data(train_df, test_df)
    X_encoded, encoder = encode_features(X_combined)

    X_train = X_encoded[:num_train]
    X_test = X_encoded[num_train:]

    model = LogisticRegression(C=1.0, solver='liblinear', max_iter=1000)
    mean_auc = cross_validate_model(X_train, y, model)
    print(f"Cross-validated AUC: {mean_auc:.4f}")

    preds = train_and_predict(X_train, y, X_test, model)
    save_predictions(preds, output_path)


In [24]:
main('../Project/amazon-employee-access-challenge/train.csv', '../Project/amazon-employee-access-challenge/test.csv', "submission.csv")


Cross-validated AUC: 0.8646
Saved predictions to submission.csv


### Adding some features

In [49]:
from numpy import array
from sklearn import metrics, linear_model
from sklearn.model_selection import KFold
from scipy import sparse
from itertools import combinations

import numpy as np
import pandas as pd
import sys
import random
import time

In [50]:
def dict_encode(encoding, value):
    if value not in encoding:
        encoding[value] = {'code': len(encoding) + 1, 'count': 0}
    enc = encoding[value]
    enc['count'] += 1
    encoding[value] = enc

def dict_decode(encoding, value, min_occurs):
    enc = encoding.get(value, {'code': -1, 'count': 0})
    return enc['code'] if enc['count'] >= min_occurs else -1


In [51]:
def group_data(data, degree, min_occurs):
    m, n = data.shape
    encoding = dict()
    for indexes in combinations(range(n), degree):
        for v in data[:, indexes]:
            dict_encode(encoding, tuple(v))
    new_data = []
    for indexes in combinations(range(n), degree):
        new_data.append([dict_decode(encoding, tuple(v), min_occurs) for v in data[:, indexes]])
    return array(new_data).T

def one_hot_encoder(data, keymap=None):
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(list(col))
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap


In [52]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score


def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' % (i + 1, p))
    with open(filename, 'w') as f:
        f.write('\n'.join(content))
    print('Saved')

def cv_loop(X, y, model, N, seed):
    mean_auc = 0.0
    k_fold = KFold(n_splits=N, shuffle=True, random_state=seed)
    
    for train_ix, test_ix in k_fold.split(X):
        model.fit(X[train_ix], y[train_ix])
        preds = model.predict_proba(X[test_ix])[:, 1]
        auc = roc_auc_score(y[test_ix], preds)
        mean_auc += auc
    
    return mean_auc / N


In [53]:
def main(train, test, submit, seed, min_occurs, good_features):
    start_time = time.time()
    print("Reading train and test datasets...")
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)

    all_data = np.vstack((train_data.iloc[:, 1:], test_data.iloc[:, 1:]))
    num_train = train_data.shape[0]

    print("Transforming data with feature combinations...")
    degrees = [2, 3, 4, 5, 6, 7]
    grouped_data = [group_data(all_data, degree=d, min_occurs=min_occurs) for d in degrees]

    y = array(train_data.ACTION)
    X_train_all = np.hstack([all_data[:num_train]] + [d[:num_train] for d in grouped_data])
    X_test_all = np.hstack([all_data[num_train:]] + [d[num_train:] for d in grouped_data])

    num_features = X_train_all.shape[1]
    print("Total number of categorical features %i" % num_features)

    rnd = random.Random(seed * num_features)
    model = linear_model.LogisticRegression()
    model.C = 0.5 + rnd.random() * 3.5
    print("Logistic C parameter: %f" % model.C)

    Xts = [one_hot_encoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print("Starting greedy feature selection...")
    N = 10
    if good_features is None:
        score_hist = []
        good_features = set()
        f_remain = list(range(len(Xts)))
        cur_best_score = -1
        cur_best_score_thres = 1.0

        while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
            scores = []
            f_shuff = f_remain[:]
            rnd.shuffle(f_shuff)
            n_thres = 0.3679 * len(f_remain)
            iter_best_score = -1
            for i, f in enumerate(f_shuff):
                feats = list(good_features) + [f]
                Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                score = cv_loop(Xt, y, model, N, seed)
                if score < (cur_best_score * cur_best_score_thres):
                    f_remain.remove(f)
                else:
                    scores.append((score, f))
                    if score > iter_best_score:
                        iter_best_score = score
                        if i > n_thres and score > cur_best_score:
                            break
            if scores:
                best_score = sorted(scores)[-1]
                f_remain.remove(best_score[1])
                if best_score[0] > cur_best_score:
                    good_features.add(best_score[1])
                    score_hist.append(best_score)
                    cur_best_score = best_score[0]
                print(f"Current features: {list(good_features)} AUC: {best_score[0]}")
            else:
                break

    good_features = sorted(list(good_features))
    print("Selected features %s" % good_features)

    print("Hyperparameter tuning...")
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    score_hist = []
    score = cv_loop(Xt, y, model, N, seed)
    score_hist.append((score, model.C))

    Cvals = np.logspace(-3, 4, 20, base=2)
    for C in Cvals:
        model.C = C
        score = cv_loop(Xt, y, model, N, seed)
        score_hist.append((score, C))
        print(f"C: {C:.4f} | Mean AUC: {score:.5f}")

    model.C = sorted(score_hist)[-1][1]
    print("Best C: %.4f" % model.C)

    print("Final One Hot Encoding...")
    Xt = np.vstack((X_train_all[:, good_features], X_test_all[:, good_features]))
    Xt, keymap = one_hot_encoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print("Training final model...")
    model.fit(X_train, y)

    print("Predicting and saving results...")
    preds = model.predict_proba(X_test)[:, 1]
    create_test_submission(submit, preds)

    print("Total execution time: %.2f minutes" % ((time.time() - start_time) / 60.0))


In [54]:
args = {
    'train': '../Project/amazon-employee-access-challenge/train.csv',
    'test': '../Project/amazon-employee-access-challenge/test.csv',
    'submit': 'newFeaturesPredictions.csv',
    'seed': 123,
    'min_occurs': 3,
    'good_features': None
}

main(**args)


Reading train and test datasets...
Transforming data with feature combinations...
Total number of categorical features 501
Logistic C parameter: 3.343445
Starting greedy feature selection...
Current features: [440] AUC: 0.8481115171728296
Current features: [440, 0] AUC: 0.876280420347477


KeyboardInterrupt: 