In [35]:
%matplotlib inline 

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from scipy.optimize import fmin_cg
from sklearn.metrics import precision_recall_fscore_support

In [4]:
#import data


def load(filename):
    '''
    Reads a CSV file whose first column is the row index, lists its
    columns and prints a short summary through check().

    filename: string ('test.csv')

    returns: DataFrame
    '''
    # DataFrame.from_csv is deprecated/removed in pandas; read_csv with
    # index_col=0 is the replacement. Date parsing of the index is not
    # requested: the files read by this notebook use an integer row id.
    data = pd.read_csv(filename, index_col=0, encoding='utf-8')

    # Single-argument print calls produce the same text under Python 2 and 3.
    print('\nData Columns:')
    for position, name in enumerate(data.columns):
        print('\t%s \t%s' % (position, name))

    check(data)

    return data

def check(X):
    '''
    X: DataFrame or Series

        prints shape and head

    returns: None
    '''
    # Single-argument print calls so the cell runs under Python 2 and 3
    # with identical output (the shape goes on the line after the header).
    print('\nShape is:\n%s' % (X.shape,))
    print(X.head())

# Load the full raw dataset; load() also prints its column list, shape and head.
data = load('cs-training.csv')


Data Columns:
	0 	SeriousDlqin2yrs
	1 	RevolvingUtilizationOfUnsecuredLines
	2 	age
	3 	NumberOfTime30-59DaysPastDueNotWorse
	4 	DebtRatio
	5 	MonthlyIncome
	6 	NumberOfOpenCreditLinesAndLoans
	7 	NumberOfTimes90DaysLate
	8 	NumberRealEstateLoansOrLines
	9 	NumberOfTime60-89DaysPastDueNotWorse
	10 	NumberOfDependents

Shape is:
(150000, 11)
   SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines  age  \
1                 1                              0.766127   45   
2                 0                              0.957151   40   
3                 0                              0.658180   38   
4                 0                              0.233810   30   
5                 0                              0.907239   49   

   NumberOfTime30-59DaysPastDueNotWorse  DebtRatio  MonthlyIncome  \
1                                     2   0.802982           9120   
2                                     0   0.121876           2600   
3                                     1   0.085113  

In [5]:
# Disabled one-off split: everything below is a bare string literal, so none
# of it executes. As written it would shuffle the row index and write
# training.csv / crossValidation.csv / test.csv (60% / 20% / remainder).
# NOTE(review): no random seed is set, so re-enabling it gives a different split.
# NOTE(review): trainingSize and crossValSize are floats (shape * 0.60 / 0.20)
# yet are used as slice bounds, and the last print reports
# crossValidation.shape rather than test.shape -- fix before re-enabling.
'''
# Shuffle index
index = np.array(data.index)
random.shuffle(index)

print index.shape
print index

# training set (60%)
trainingSize = data.shape[0] * 0.60
training = data.ix[index[:trainingSize]]

training.to_csv('training.csv', encoding='utf-8')

print '\n', training.shape
print training.head()

# crossValidation set (20%)
crossValSize = data.shape[0] * 0.20
crossValidation = data.ix[index[trainingSize:trainingSize + crossValSize]]

crossValidation.to_csv('crossValidation.csv', encoding='utf-8')

print '\n', crossValidation.shape

# test set (20%)
test = data.ix[index[trainingSize + crossValSize:]]

test.to_csv('test.csv', encoding = 'utf-8')

print '\n', crossValidation.shape
'''

"\n# Shuffle index\nindex = np.array(data.index)\nrandom.shuffle(index)\n\nprint index.shape\nprint index\n\n# training set (60%)\ntrainingSize = data.shape[0] * 0.60\ntraining = data.ix[index[:trainingSize]]\n\ntraining.to_csv('training.csv', encoding='utf-8')\n\nprint '\n', training.shape\nprint training.head()\n\n# crossValidation set (20%)\ncrossValSize = data.shape[0] * 0.20\ncrossValidation = data.ix[index[trainingSize:trainingSize + crossValSize]]\n\ncrossValidation.to_csv('crossValidation.csv', encoding='utf-8')\n\nprint '\n', crossValidation.shape\n\n# test set (20%)\ntest = data.ix[index[trainingSize + crossValSize:]]\n\ntest.to_csv('test.csv', encoding = 'utf-8')\n\nprint '\n', crossValidation.shape\n"

In [6]:
# load files

# Read the three subsets back from disk; the file names match the ones the
# disabled split cell writes with to_csv. Each load() call prints a summary.
training = load('training.csv')
crossValidation = load('crossValidation.csv')
test = load('test.csv')


Data Columns:
	0 	SeriousDlqin2yrs
	1 	RevolvingUtilizationOfUnsecuredLines
	2 	age
	3 	NumberOfTime30-59DaysPastDueNotWorse
	4 	DebtRatio
	5 	MonthlyIncome
	6 	NumberOfOpenCreditLinesAndLoans
	7 	NumberOfTimes90DaysLate
	8 	NumberRealEstateLoansOrLines
	9 	NumberOfTime60-89DaysPastDueNotWorse
	10 	NumberOfDependents

Shape is:
(90000, 11)
        SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines  age  \
42082                  1                              1.173509   33   
47463                  0                              0.040118   67   
6412                   0                              0.228882   68   
142967                 0                              0.558521   50   
51191                  0                              0.000000   65   

        NumberOfTime30-59DaysPastDueNotWorse  DebtRatio  MonthlyIncome  \
42082                                      3   0.430443           5570   
47463                                      0   0.007237          10500   
6412    

In [7]:
# 0. PROCESS DATA (-> X, y)
    # X
    # y
    # feature scaling

# I. TRAIN (-> theta)
    # theta = fmincg(CostFunction, X, y, lambda, initial_theta)

    # (J, grad) = CostFunction(X, y, lambda)

# II. PREDICT (-> h(x))

    # h = predict(X)

# III. GET ACCURACY & F-SCORE

# 0. PROCESS DATA (-> Xtrain, ytrain)
##Xtrain
##ytrain
##featureScale
##addBias

In [8]:
# 0. PROCESS DATA (-> X, y)
# Features are every column after the label (column 0). Take a copy so the
# in-place scaling / bias steps applied to Xtrain can never write through to
# (or raise SettingWithCopy warnings about) the `training` frame.
Xtrain = training.iloc[:, 1:].copy()
check(Xtrain)
(m, n) = Xtrain.shape # m: # of examples /n: # of attributes

# Labels: the delinquency flag, one value per training row.
ytrain = training['SeriousDlqin2yrs']
check(ytrain)

# feature scaling
def featureScale(X):
    '''
    Standardizes every column to zero mean and unit (sample) standard
    deviation.

    X: DataFrame (numeric columns)

    returns: new DataFrame (feature scaled); X itself is left untouched
    '''
    # Column-wise mean/std broadcast across the rows. Building a new frame
    # avoids mutating the caller's data and avoids writing floats into
    # integer-typed columns one at a time.
    return (X - X.mean()) / X.std()

# Standardize the training features, then print shape/head to eyeball the result.
Xtrain = featureScale(Xtrain)
check(Xtrain)
     
# Add Bias terms to Xtrain
def addBias(X):
    '''
    X: DataFrame (m * n)

        Adds bias terms (a constant column of 1s named 'Bias') as the
        first column of a copy of X; X itself is not modified.

    returns: DataFrame (m * (n + 1))
    '''
    # insert() works in place, so operate on a copy to keep the caller's
    # frame unchanged.
    biased = X.copy()
    biased.insert(0, 'Bias', 1)
    return biased

# Prepend the constant 'Bias' column and print shape/head again.
Xtrain = addBias(Xtrain)
check(Xtrain)

# fill nan with 0
# (the features were standardized above, so 0 is each column's mean)
Xtrain = Xtrain.fillna(0)
ytrain = ytrain.fillna(0)


Shape is:
(90000, 10)
        RevolvingUtilizationOfUnsecuredLines  age  \
42082                               1.173509   33   
47463                               0.040118   67   
6412                                0.228882   68   
142967                              0.558521   50   
51191                               0.000000   65   

        NumberOfTime30-59DaysPastDueNotWorse  DebtRatio  MonthlyIncome  \
42082                                      3   0.430443           5570   
47463                                      0   0.007237          10500   
6412                                       0   0.137878           3872   
142967                                     0   0.211546          12800   
51191                                      0   0.007198           3750   

        NumberOfOpenCreditLinesAndLoans  NumberOfTimes90DaysLate  \
42082                                13                        5   
47463                                17                        0   
6412     

# I. TRAIN (-> theta)
## theta = fmin_cg(CostFunction, X, y, lambda, initial_theta)
## J = costFunction(X, y, lambda)
## grad = gradFunction

In [22]:
# logisticRegression

# Regularization strength (lambda); handed to costFunction / gradFunction
# as the third element of their extra args.
L = 0.1

def logisticRegression(costfunction, gradFunction, X, y, L):
    '''
    Fits the model by minimizing the supplied cost with conjugate gradient.

    costfunction: function(theta, X, y, L) -> cost J
    gradFunction: function(theta, X, y, L) -> gradient vector
    X: matrix (one column per weight, bias column included)
    y: vector
    L (lambda): float

    returns: vector (optimum theta, one entry per column of X)
    '''
    # One weight per column of X, so the starting point follows the data
    # that was passed in instead of a module-level feature count.
    initial_theta = np.zeros(np.shape(X)[1])
    args = (X, y, L)
    # Minimize the objective that was actually passed in (previously the
    # argument was ignored in favour of a global of similar name).
    theta = fmin_cg(f=costfunction, x0=initial_theta,
                    fprime=gradFunction, args=args)
    return theta

# costFunction
def costFunction(theta, *args):
    '''
    Regularized logistic-regression cost.

    theta: vector (n + 1); a column vector ((n + 1) * 1) is accepted too
    args: (X, y, L)
        X: matrix (m * (n + 1)), bias column first
        y: vector (m) of 0/1 labels, in the same row order as X
        L (lambda): float

    returns: one-element array holding J
    '''
    X, y, L = args
    # Work on plain arrays: no dependence on the label column's name, and
    # the sizes come from the data passed in rather than module globals.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    theta = np.asarray(theta, dtype=float).reshape(-1)
    m = X.shape[0]

    z = X.dot(theta)
    # With h = sigmoid(z), -y*log(h) - (1 - y)*log(1 - h) simplifies to
    # log(1 + exp(z)) - y*z; logaddexp evaluates that without overflow
    # and without ever taking log(0).
    J = np.sum(np.logaddexp(0.0, z) - y * z) / m

    # L2 penalty; the bias weight theta[0] is not regularized.
    J = J + (L / (2.0 * m)) * np.sum(theta[1:] ** 2)

    return np.array([J])

# gradFunction
def gradFunction(theta, *args):
    '''
    Gradient of the regularized logistic-regression cost:

        grad = X' * (sigmoid(X * theta) - y) / m + (lambda / m) * theta
        (the bias weight theta[0] is not regularized)

    theta: vector (n + 1); a column vector ((n + 1) * 1) is accepted too
    args: (X, y, L)
        X: matrix (m * (n + 1)), bias column first
        y: vector (m) of 0/1 labels, in the same row order as X
        L (lambda): float

    returns: vector (n + 1)
    '''
    X, y, L = args
    # Plain arrays: sizes are taken from the inputs instead of module
    # globals, and the label column's name no longer matters.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    theta = np.asarray(theta, dtype=float).reshape(-1)
    m = X.shape[0]

    # Logistic hypothesis, written inline so the gradient depends only on
    # its arguments.
    h = 1.0 / (1.0 + np.exp(-X.dot(theta)))
    grad = X.T.dot(h - y) / m

    # Regularize (new array, so the caller's theta is never modified)
    penalty = (L / float(m)) * theta
    penalty[0] = 0.0

    return grad + penalty

# sigmoid
def sigmoid(z):
    '''
    Logistic function 1 / (1 + e^(-z)).

    z: float (numpy-compatible containers are handled element-wise)

    returns: float between 0 and 1 (element-wise result for containers)
    '''
    decay = np.exp(-z)
    return 1.0 / (1.0 + decay)



In [23]:
#theta = logisticRegresssion()
# Sanity check: with theta = 0 every predicted probability is 0.5, so the
# cost should come out as ln(2) ~= 0.693.
theta = np.zeros((n+1, 1))
args = (Xtrain, ytrain, L)
J = costFunction(theta, *args)
# print(...) with one argument prints the same under Python 2 and 3
print(J)

[ 0.69314718]


In [25]:
# grad
# Gradient at the current theta (all zeros when run right after the cost check).
args = (Xtrain, ytrain, L)
grad = gradFunction(theta, *args)
# print(...) with one argument prints the same under Python 2 and 3
print(grad)

[  4.33655556e-01   6.25321947e-05   2.90906540e-02  -3.09406426e-02
   2.10450005e-03   4.23606549e-03   6.87684072e-03  -2.89193348e-02
   1.38170311e-03  -2.54732107e-02  -1.14356757e-02]


In [27]:
# Logistic Regression
#initial_theta = np.zeros((n+1, 1))
# Fit the weights on the training set; fmin_cg reports its own convergence summary.
theta = logisticRegression(costFunction, gradFunction, Xtrain, ytrain, L)
# print(...) with one argument prints the same under Python 2 and 3
print(theta)

Optimization terminated successfully.
         Current function value: 0.224363
         Iterations: 76
         Function evaluations: 202
         Gradient evaluations: 202
[ -2.86594404e+00  -4.92748843e-04  -4.36030576e-01   2.11617687e+00
  -2.11767114e-02  -4.17048557e-01  -3.35876530e-02   1.91248129e+00
   8.05037708e-02  -3.87176394e+00   1.03947099e-01]


# II. PREDICT (-> h(x))

    h = predict(X)
    
# III. GET ACCURACY & F-SCORE

In [48]:
def predict(theta, X, y):
    '''
    theta: vector of fitted weights
    X: matrix of features (passed straight to getPrediction)
    y: vector of true labels (0, 1)

        prints measures (accuracy, precision, recall, F-score)

    returns: prediction(vector)

    '''
    # get prediction
    prediction = getPrediction(theta, X)

    # get measures
    accuracy, precision, recall, fscore = getMeasures(y, prediction)

    # One formatted string per line: identical output under Python 2 and 3.
    for label, value in (('accuracy', accuracy), ('precision', precision),
                         ('recall', recall), ('fscore', fscore)):
        print('%s %s' % (label, value))

    return prediction

def getPrediction(theta, X, threshold=0.5):
    '''
    theta: vector of weights
    X: matrix whose columns line up with theta
    threshold: float, probability above which a row is labelled 1
               (default 0.5, the cut-off that used to be hard-coded)

    returns: vector of labels (0, 1)
    '''
    probability = sigmoid(X.dot(theta))
    # boolean mask * 1 -> integer labels
    return (probability > threshold) * 1

def getMeasures(y, prediction):
    '''
    y: true labels
    prediction: predicted labels

    returns: measures, a tuple (accuracy, precision, recall, F-score)
    '''
    # share of rows where the predicted label equals the true one
    hit_rate = (prediction == y).mean()

    # sklearn returns (precision, recall, fbeta, support); support is unused
    scores = precision_recall_fscore_support(y, prediction, average = 'binary')
    precision, recall, fbeta_score = scores[0], scores[1], scores[2]

    return (hit_rate, precision, recall, fbeta_score)

In [50]:
# Score the fitted model on the training set; predict() prints the metrics.
prediction = predict(theta, Xtrain, ytrain)
# Blank line, then the labels; a single formatted argument keeps the output
# identical under Python 2 and 3.
print('\n%s' % (prediction,))

accuracy 0.934222222222
precision 0.563909774436
recall 0.0376821302964
fscore 0.0706436420722

42082     0
47463     0
6412      0
142967    0
51191     0
101607    0
78080     0
71133     0
92953     0
57691     0
14734     0
101685    0
46801     0
88538     0
12670     0
114441    0
114188    0
76914     0
26251     0
58124     0
6651      0
55682     0
47055     0
100960    0
13631     0
124744    0
1899      0
146194    0
128135    0
140286    0
         ..
125089    0
51635     0
23180     0
61871     0
100222    0
57165     0
19815     0
75303     0
108353    0
10393     0
75396     0
96572     0
46548     0
75003     0
62957     0
130338    0
48373     0
149725    0
147352    0
111956    0
68213     0
22456     0
2267      0
109663    0
129436    0
36678     0
35519     0
57500     0
129582    0
90126     0
dtype: int64


#LEARNING CURVE