In [1]:
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
from pandas.tools.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [2]:
# Load the training set (expected next to the notebook) into a DataFrame
# and print its first five rows as a quick sanity check.
data = pd.read_csv("./train.csv")
print(data.head(n=5))


   ACTION  RESOURCE  MGR_ID  ROLE_ROLLUP_1  ROLE_ROLLUP_2  ROLE_DEPTNAME  \
0       1     39353   85475         117961         118300         123472   
1       1     17183    1540         117961         118343         123125   
2       1     36724   14457         118219         118220         117884   
3       1     36135    5396         117961         118343         119993   
4       1     42680    5905         117929         117930         119569   

   ROLE_TITLE  ROLE_FAMILY_DESC  ROLE_FAMILY  ROLE_CODE  
0      117905            117906       290919     117908  
1      118536            118536       308574     118539  
2      117879            267952        19721     117880  
3      118321            240983       290919     118322  
4      119323            123932        19793     119325  


In [3]:
# Column 0 is used as the target; every remaining column is a feature.
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

In [4]:
# Fit a logistic regression on all features and report its score on the very
# same rows it was trained on (no held-out set is used here).
model = LogisticRegression().fit(X, y)
print("Logistic Regression Score:")
print(model.score(X, y))

Logistic Regression Score:
0.942109920962


In [5]:
# Baseline to compare the model against: the mean of the target column.
# NOTE(review): assuming y only holds 0/1 labels, this mean is the share of 1s,
# i.e. the accuracy of always predicting 1 -- an accuracy rather than an error
# rate, despite the printed label. Confirm before renaming the label.
print("Null Error Rate\n" + str(y.mean()))

Null Error Rate
0.942109920962


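In this scenario, the logistic regression is simply guessing 1 for every datapoint. 94% of the time the action was 1, so the LR can be right 94% of the time by always guessing 1. (Strictly speaking, 0.942 is the accuracy of that always-1 baseline; the corresponding null error rate is 1 − 0.942 ≈ 0.058.) Can we beat this?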

In [6]:
# Fit one single-feature logistic regression per column to check whether any
# individual feature scores differently from the all-features model.
for column in X.columns.values:
    # Selecting with a one-element list yields a 2-D, single-column DataFrame,
    # which is the shape scikit-learn expects. Series.reshape is not an option:
    # it was deprecated in pandas 0.19 and is gone from current releases.
    X_one = X.loc[:, [column]]
    model = LogisticRegression()
    model.fit(X_one, y)
    print("Score for " + column + ":")
    print(model.score(X_one, y))

Score for RESOURCE:
0.942109920962
Score for MGR_ID:
0.942109920962
Score for ROLE_ROLLUP_1:
0.942109920962
Score for ROLE_ROLLUP_2:
0.942109920962
Score for ROLE_DEPTNAME:
0.942109920962
Score for ROLE_TITLE:
0.942109920962
Score for ROLE_FAMILY_DESC:
0.942109920962
Score for ROLE_FAMILY:
0.942109920962
Score for ROLE_CODE:
0.942109920962


Using any single feature yielded the same result. Let's try combinations of features.

In [None]:
# Search all two-feature models for one that scores above the baseline y.mean().
# combinations() yields each unordered pair exactly once. Column order does not
# change what the model can fit (results agree up to solver tolerance), so
# fitting both (a, b) and (b, a) would only repeat work.
baseline = y.mean()  # loop-invariant: compute once instead of per iteration
for column1, column2 in combinations(X.columns.values, 2):
    X_two = X.loc[:, [column1, column2]]
    model = LogisticRegression()
    model.fit(X_two, y)
    score = model.score(X_two, y)
    # Only report feature sets that beat the baseline.
    if score > baseline:
        print("Logistic Regression for: " + str(X_two.columns.values))
        print(score)
            

In [None]:
# Search all three-feature models for one that scores above the baseline
# y.mean(). combinations() yields each unordered triple exactly once, which
# avoids refitting the same three columns in all 6 of their orderings (column
# order does not change what the model can fit, up to solver tolerance).
baseline = y.mean()  # loop-invariant: compute once instead of per iteration
for column1, column2, column3 in combinations(X.columns.values, 3):
    X_three = X.loc[:, [column1, column2, column3]]
    model = LogisticRegression()
    model.fit(X_three, y)
    score = model.score(X_three, y)
    # Only report feature sets that beat the baseline.
    if score > baseline:
        print("Logistic Regression for: " + str(X_three.columns.values))
        print(score)
            

In [None]:
# Search all four-feature models for one that scores above the baseline
# y.mean(). Each unordered set of four columns is fitted exactly once instead
# of once per ordering (24 orderings per set); column order does not change
# what the model can fit, up to solver tolerance.
baseline = y.mean()  # loop-invariant: compute once instead of per iteration
columns = list(X.columns.values)
for position, column1 in enumerate(columns):
    # Progress marker: one line per leading column, since this cell is slow.
    print(column1)
    # Pair the leading column only with columns that come after it, so no
    # feature set is visited twice. The last three columns have no such
    # triples and therefore only print their progress line.
    for column2, column3, column4 in combinations(columns[position + 1:], 3):
        X_four = X.loc[:, [column1, column2, column3, column4]]
        model = LogisticRegression()
        model.fit(X_four, y)
        score = model.score(X_four, y)
        # Only report feature sets that beat the baseline.
        if score > baseline:
            print("Logistic Regression for: " + str(X_four.columns.values))
            print(score)


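No combination of up to 4 features yields a logistic regression model that is better than predicting 1 for all entries. An exhaustive search like this is expensive because one model is fit per candidate feature set. Looping over ordered tuples of columns, as nested loops over all columns do, visits every feature set once per ordering — 2× for pairs, 6× for triples, 24× for four features — so evaluating only unordered combinations saves far more than the 1/2 of the time I first estimated.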

In [None]:
# One-hot encode every feature column except ROLE_CODE and append the indicator
# columns to the raw features.
# NOTE(review): the raw integer columns are kept next to their indicators, as
# before; confirm whether they should be dropped from X_one_hot.
# NOTE(review): some columns have thousands of distinct values, so the dense
# result is very wide; confirm it fits in memory before fitting on it.
encoded_parts = [X]
for column in X.columns.values:
    if column == "ROLE_CODE":
        continue
    # Encode the column currently being visited (not a fixed one), and prefix
    # the indicator names with the column name so that equal values occurring
    # in different columns cannot produce duplicate column labels.
    X_dummies = pd.get_dummies(X.loc[:, column], prefix=column)
    print(X_dummies.head())
    encoded_parts.append(X_dummies)
    print("Done wit: " + column)
# Concatenate once at the end rather than re-copying a growing frame on every
# iteration.
X_one_hot = pd.concat(encoded_parts, axis=1)

In [None]:
# Fit on the one-hot-augmented feature frame and print the score obtained on
# the same rows used for training.
model = LogisticRegression().fit(X_one_hot, y)
print(model.score(X_one_hot, y))

In [8]:
# For every feature print "<name>\t<distinct values>/<rows>", followed on the
# next line by the ratio of distinct values to rows.
for column in X.columns.values:
    values = X.loc[:, column]
    n_distinct = len(values.unique())
    n_rows = len(values)
    print(column + "\t" + str(n_distinct) + "/" + str(n_rows))
    # float() keeps this a true division even under Python 2 integer semantics.
    print(n_distinct / float(n_rows))

RESOURCE	7518/32769
0.229424150874
MGR_ID	4243/32769
0.129482132503
ROLE_ROLLUP_1	128/32769
0.00390613079435
ROLE_ROLLUP_2	177/32769
0.00540144648906
ROLE_DEPTNAME	449/32769
0.013701974427
ROLE_TITLE	343/32769
0.010467209863
ROLE_FAMILY_DESC	2358/32769
0.0719582532271
ROLE_FAMILY	67/32769
0.00204461533767
