In [1]:
# Import lots of tools
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Problem Set 4
**Qimo Li and Ran Dou talked and assisted with each other, and submitted the assignment individually.**

This problem is a basic test of some of our classifiers on a real data set. We define a core test as a Monte Carlo run of 50 random train/test splits.  
The data set contains information on credit card accounts and whether the customers default or not. There are several fields, and their descriptions are linked here; however, they are not necessary at the moment.  
The notebook tries to find the datafile in your current directory with the name defaultBal.csv. You can do what you want, but make sure this all lines up in the notebook (read_csv).  
The notebook gives an example with a Naive Bayes rule. It shows how to run a 50 length monte-carlo on varying train/test sets, and stores train/test accuracy measures. This is your basic framework to use. 

In [2]:
# Load the credit-card default data from the current working directory, as the
# notebook text specifies. A user-specific absolute path only works on the
# original author's machine and breaks "Restart & Run All" everywhere else.
credit = pd.read_csv("defaultBal.csv")
# Target column. Bracket access is used because attribute access on a frame can
# be shadowed by DataFrame attributes/methods of the same name.
y = credit["default"]
# Feature matrix: positional columns 1 through 23 of the frame.
Xall = credit.values[:, 1:24]
# Work on a copy so Xall stays available for building other feature subsets.
X = Xall.copy()
print(X.shape)
print(np.mean(y))
# Benchmark accuracy of always guessing 1 (the mean of y, assuming y is coded 0/1).
yGuess = np.mean(y)
credit.head()

(12450, 23)
0.5330923694779116


Unnamed: 0,id,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,6,50000,1,1,2,37,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
4,9,140000,2,3,1,28,0,0,2,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0


### Naive Bayes example

In [3]:
# Try a naive Bayes classifier on a single 75/25 train/test split.
gnb = GaussianNB()
# A fixed random_state makes the split, and therefore both printed scores,
# reproducible when the notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
trainFit = gnb.fit(X_train, y_train)
print("The score on the train data set is {:.4f}".format(trainFit.score(X_train,y_train)))
print("The score on the test data set is {:.4f}".format(trainFit.score(X_test, y_test)))

The score on the train data set is 0.5661
The score on the test data set is 0.5535


In [4]:
# Monte Carlo evaluation of GaussianNB: nmc random 75/25 train/test splits,
# recording train and test accuracy for each split.
nmc = 50
trainScore = np.zeros(nmc)
testScore  = np.zeros(nmc)
gnb = GaussianNB()
for i in range(nmc):
    # random_state=i gives every iteration its own split while keeping the
    # whole run reproducible under Restart & Run All.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=i
    )
    trainFit = gnb.fit(X_train, y_train)
    trainScore[i] = trainFit.score(X_train, y_train)
    testScore[i] = trainFit.score(X_test, y_test)
print("The mean of the train data set is {:.4f}".format(np.mean(trainScore)))
print("The standard deviation of the train data set is {:.4f}".format(np.std(trainScore)))
print("The mean of the test data set is {:.4f}".format(np.mean(testScore)))
print("The standard deviation of the test data set is {:.4f}".format(np.std(testScore)))
# Fraction of splits whose test accuracy exceeds the always-guess-1 benchmark.
print("The probability of beating a benchmark classifier is {}".format(np.mean(testScore>yGuess)))

The mean of the train data set is 0.5668
The standard deviation of the train data set is 0.0036
The mean of the test data set is 0.5638
The standard deviation of the test data set is 0.0091
The probability of beating a benchmark classifier is 1.0


### 1. Repeat the monte-carlo with a LinearDiscriminant classifier. Report means, std’s, and probability of beating a benchmark classifier which guesses 1 all the time.

In [5]:
# Monte Carlo evaluation of Linear Discriminant Analysis on all features:
# nmc random 75/25 train/test splits, recording accuracy on each side.
nmc = 50
trainScore = np.zeros(nmc)
testScore = np.zeros(nmc)
LDA = LinearDiscriminantAnalysis()
for i in range(nmc):
    # random_state=i: a distinct but reproducible split per iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=i
    )
    trainFit = LDA.fit(X_train, y_train)
    trainScore[i] = trainFit.score(X_train, y_train)
    testScore[i] = trainFit.score(X_test, y_test)
print("The mean of the train data set is {:.4f}".format(np.mean(trainScore)))
print("The standard deviation of the train data set is {:.4f}".format(np.std(trainScore)))
print("The mean of the test data set is {:.4f}".format(np.mean(testScore)))
print("The standard deviation of the test data set is {:.4f}".format(np.std(testScore)))
# Fraction of splits whose test accuracy exceeds the always-guess-1 benchmark.
print("The probability of beating a benchmark classifier is {}".format(np.mean(testScore>yGuess)))

The mean of the train data set is 0.6518
The standard deviation of the train data set is 0.0035
The mean of the test data set is 0.6520
The standard deviation of the test data set is 0.0080
The probability of beating a benchmark classifier is 1.0


### 2. Repeat this with Logistic regression for C=100, C=1, C=0.01.

In [6]:
# Monte Carlo evaluation of logistic regression for several values of the
# inverse regularisation strength C.
nmc = 50
for c in [100, 1, 0.01]:
    # Fresh score buffers per setting, so the statistics printed for one C can
    # never contain values left over from a previous C.
    trainScore = np.zeros(nmc)
    testScore = np.zeros(nmc)
    # random_state pins any solver-level shuffling (it is only consulted by
    # some solvers) so repeated runs give the same fit.
    LR = LogisticRegression(C = c, random_state=0)
    for i in range(nmc):
        # random_state=i: each iteration gets its own reproducible split, and
        # every C is evaluated on the same nmc splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=i
        )
        trainFit = LR.fit(X_train, y_train)
        trainScore[i] = trainFit.score(X_train, y_train)
        testScore[i] = trainFit.score(X_test, y_test)
    print("When C = " + str(c) + ":")
    print("The mean of the train data set is {:.4f}".format(np.mean(trainScore)))
    print("The standard deviation of the train data set is {:.4f}".format(np.std(trainScore)))
    print("The mean of the test data set is {:.4f}".format(np.mean(testScore)))
    print("The standard deviation of the test data set is {:.4f}".format(np.std(testScore)))
    # Fraction of splits whose test accuracy exceeds the always-guess-1 benchmark.
    print("The probability of beating a benchmark classifier is {}".format(np.mean(testScore>yGuess)))

When C = 100:
The mean of the train data set is 0.6315
The standard deviation of the train data set is 0.0064
The mean of the test data set is 0.6303
The standard deviation of the test data set is 0.0100
The probability of beating a benchmark classifier is 1.0
When C = 1:
The mean of the train data set is 0.6310
The standard deviation of the train data set is 0.0070
The mean of the test data set is 0.6305
The standard deviation of the test data set is 0.0109
The probability of beating a benchmark classifier is 1.0
When C = 0.01:
The mean of the train data set is 0.6328
The standard deviation of the train data set is 0.0077
The mean of the test data set is 0.6311
The standard deviation of the test data set is 0.0093
The probability of beating a benchmark classifier is 1.0


### 3. Repeat this for a LinearSVC with C = 100, C=1, C=0.01

In [7]:
# Monte Carlo evaluation of a linear support vector classifier for several
# values of the regularisation parameter C.
# NOTE(review): the features are passed in unscaled; the solver may not converge
# well on columns of very different magnitudes - confirm whether a scaling step
# is intended elsewhere in the assignment before comparing against other models.
nmc = 50
for c in [100, 1, 0.01]:
    # Fresh score buffers per setting, so the statistics printed for one C can
    # never contain values left over from a previous C.
    trainScore = np.zeros(nmc)
    testScore = np.zeros(nmc)
    # random_state fixes the solver's internal data shuffling so that a re-run
    # produces the same fitted model for the same split.
    LSVC = LinearSVC(C = c, random_state=0)
    for i in range(nmc):
        # random_state=i: each iteration gets its own reproducible split, and
        # every C is evaluated on the same nmc splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=i
        )
        trainFit = LSVC.fit(X_train, y_train)
        trainScore[i] = trainFit.score(X_train, y_train)
        testScore[i] = trainFit.score(X_test, y_test)
    print("When C = " + str(c) + ":")
    print("The mean of the train data set is {:.4f}".format(np.mean(trainScore)))
    print("The standard deviation of the train data set is {:.4f}".format(np.std(trainScore)))
    print("The mean of the test data set is {:.4f}".format(np.mean(testScore)))
    print("The standard deviation of the test data set is {:.4f}".format(np.std(testScore)))
    # Fraction of splits whose test accuracy exceeds the always-guess-1 benchmark.
    print("The probability of beating a benchmark classifier is {}".format(np.mean(testScore>yGuess)))

When C = 100:
The mean of the train data set is 0.5431
The standard deviation of the train data set is 0.0381
The mean of the test data set is 0.5426
The standard deviation of the test data set is 0.0367
The probability of beating a benchmark classifier is 0.68
When C = 1:
The mean of the train data set is 0.5362
The standard deviation of the train data set is 0.0399
The mean of the test data set is 0.5338
The standard deviation of the test data set is 0.0404
The probability of beating a benchmark classifier is 0.56
When C = 0.01:
The mean of the train data set is 0.5418
The standard deviation of the train data set is 0.0333
The mean of the test data set is 0.5423
The standard deviation of the test data set is 0.0347
The probability of beating a benchmark classifier is 0.64


### 4. Repeat this for KNeighborsClassifier for n_neighbors = 3, 11, 25

In [8]:
# Monte Carlo evaluation of k-nearest-neighbours for several neighbourhood sizes.
nmc = 50
for n in [3, 11, 25]:
    # Fresh score buffers per setting, so the statistics printed for one
    # n_neighbors can never contain values left over from a previous one.
    trainScore = np.zeros(nmc)
    testScore = np.zeros(nmc)
    KNC = KNeighborsClassifier(n_neighbors = n)
    for i in range(nmc):
        # random_state=i: each iteration gets its own reproducible split, and
        # every n_neighbors is evaluated on the same nmc splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=i
        )
        trainFit = KNC.fit(X_train, y_train)
        trainScore[i] = trainFit.score(X_train, y_train)
        testScore[i] = trainFit.score(X_test, y_test)
    print("When n_neighbors = " + str(n) + ":")
    print("The mean of the train data set is {:.4f}".format(np.mean(trainScore)))
    print("The standard deviation of the train data set is {:.4f}".format(np.std(trainScore)))
    print("The mean of the test data set is {:.4f}".format(np.mean(testScore)))
    print("The standard deviation of the test data set is {:.4f}".format(np.std(testScore)))
    # Fraction of splits whose test accuracy exceeds the always-guess-1 benchmark.
    print("The probability of beating a benchmark classifier is {}".format(np.mean(testScore>yGuess)))

When n_neighbors = 3:
The mean of the train data set is 0.7786
The standard deviation of the train data set is 0.0029
The mean of the test data set is 0.5698
The standard deviation of the test data set is 0.0068
The probability of beating a benchmark classifier is 1.0
When n_neighbors = 11:
The mean of the train data set is 0.6805
The standard deviation of the train data set is 0.0035
The mean of the test data set is 0.6001
The standard deviation of the test data set is 0.0092
The probability of beating a benchmark classifier is 1.0
When n_neighbors = 25:
The mean of the train data set is 0.6503
The standard deviation of the train data set is 0.0035
The mean of the test data set is 0.6102
The standard deviation of the test data set is 0.0080
The probability of beating a benchmark classifier is 1.0


### 5. Repeat this for a Decision Tree with max_depth = 5, 10, 25

In [9]:
# Monte Carlo evaluation of a decision tree for several maximum depths.
nmc = 50
for depth in [5, 10, 25]:
    # Fresh score buffers per setting, so the statistics printed for one depth
    # can never contain values left over from a previous depth.
    trainScore = np.zeros(nmc)
    testScore = np.zeros(nmc)
    # random_state fixes the tree's internal randomness so the same split
    # always grows the same tree.
    DTC = DecisionTreeClassifier(max_depth = depth, random_state=0)
    for i in range(nmc):
        # random_state=i: each iteration gets its own reproducible split, and
        # every depth is evaluated on the same nmc splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=i
        )
        trainFit = DTC.fit(X_train, y_train)
        trainScore[i] = trainFit.score(X_train, y_train)
        testScore[i] = trainFit.score(X_test, y_test)
    print("When max_depth = " + str(depth) + ":")
    print("The mean of the train data set is {:.4f}".format(np.mean(trainScore)))
    print("The standard deviation of the train data set is {:.4f}".format(np.std(trainScore)))
    print("The mean of the test data set is {:.4f}".format(np.mean(testScore)))
    print("The standard deviation of the test data set is {:.4f}".format(np.std(testScore)))
    # Fraction of splits whose test accuracy exceeds the always-guess-1 benchmark.
    print("The probability of beating a benchmark classifier is {}".format(np.mean(testScore>yGuess)))

When max_depth = 5:
The mean of the train data set is 0.7057
The standard deviation of the train data set is 0.0033
The mean of the test data set is 0.6873
The standard deviation of the test data set is 0.0082
The probability of beating a benchmark classifier is 1.0
When max_depth = 10:
The mean of the train data set is 0.7778
The standard deviation of the train data set is 0.0054
The mean of the test data set is 0.6764
The standard deviation of the test data set is 0.0077
The probability of beating a benchmark classifier is 1.0
When max_depth = 25:
The mean of the train data set is 0.9901
The standard deviation of the train data set is 0.0051
The mean of the test data set is 0.6210
The standard deviation of the test data set is 0.0085
The probability of beating a benchmark classifier is 1.0


### 6. Repeat the LinearDiscriminant classifier, but use just the real-valued data. See the example file for the line that moves just these fields into X.

In [11]:
# Monte Carlo evaluation of Linear Discriminant Analysis on a column subset of
# the full feature matrix.
# NOTE(review): this rebinds the notebook-wide X, so re-running any earlier cell
# after this one will silently use the reduced feature set.
# NOTE(review): 12:23 selects 11 columns of Xall - confirm against the example
# file that these are exactly the intended real-valued fields.
X = Xall[:,12:23].copy()
nmc = 50
trainScore = np.zeros(nmc)
testScore = np.zeros(nmc)
LDA_real = LinearDiscriminantAnalysis()
for i in range(nmc):
    # random_state=i: a distinct but reproducible split per iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=i
    )
    trainFit = LDA_real.fit(X_train, y_train)
    trainScore[i] = trainFit.score(X_train, y_train)
    testScore[i] = trainFit.score(X_test, y_test)
print("The mean of the train data set is {:.4f}".format(np.mean(trainScore)))
print("The standard deviation of the train data set is {:.4f}".format(np.std(trainScore)))
print("The mean of the test data set is {:.4f}".format(np.mean(testScore)))
print("The standard deviation of the test data set is {:.4f}".format(np.std(testScore)))
# Fraction of splits whose test accuracy exceeds the always-guess-1 benchmark.
print("The probability of beating a benchmark classifier is {}".format(np.mean(testScore>yGuess)))

The mean of the train data set is 0.5754
The standard deviation of the train data set is 0.0052
The mean of the test data set is 0.5736
The standard deviation of the test data set is 0.0090
The probability of beating a benchmark classifier is 1.0


### 7. Using the mean test score from parts (1-5+GaussianNB), what was your best method (so far)?

My best method so far is the Decision Tree with max_depth = 5, which has the highest mean test score (about 0.69) of all the methods tried, together with a high mean score on the train data.