
<h1 id="Classification">Classification<a class="anchor-link" href="#Classification">¶</a></h1>


In [None]:

import numpy
import urllib
import scipy.optimize
import random
from sklearn import svm # Library for SVM classification
from sklearn import linear_model # Logistic Regression (among other things)
import ast



In [None]:

def parseData(fname):
  """Yield one parsed record per line of the resource at URL `fname`.

  Each line is evaluated as a Python expression (typically a dict literal)
  and the resulting object is yielded. The connection is closed once the
  generator is exhausted or discarded.
  """
  # Python 3 moved urlopen into urllib.request, and a bare `import urllib`
  # does not expose that submodule, so import it where it is used.
  from urllib.request import urlopen
  with urlopen(fname) as response:
    for l in response:
      # NOTE(review): eval() executes arbitrary code from the remote resource;
      # only use this on trusted data (ast.literal_eval is the safe option if
      # every line is a plain literal).
      yield eval(l)



In [None]:

def parseDataFromFile(fname):
  """Yield one parsed record per line of the local file `fname`.

  Each line is evaluated as a Python expression (typically a dict literal)
  and the resulting object is yielded.
  """
  # The context manager closes the file once the generator is exhausted or
  # discarded; previously the handle was left open.
  with open(fname) as f:
    for l in f:
      # NOTE(review): eval() executes arbitrary code found in the file; only
      # use this on trusted data (ast.literal_eval is the safe option if every
      # line is a plain literal).
      yield eval(l)



In [None]:

# Read every record of the book-descriptions file into a list.
# NOTE(review): hard-coded, user-specific absolute path; the notebook only
# runs on a machine that has this exact location.
data1 = list(parseDataFromFile("/Users/lizhaoyi/data/book_descriptions_50000.json"))



In [None]:

# Read every record of the book-images file into a list.
# NOTE(review): hard-coded, user-specific absolute path; the notebook only
# runs on a machine that has this exact location.
data2 = list(parseDataFromFile("/Users/lizhaoyi/data/book_images_5000.json"))



In [None]:

# Use the description records as the working dataset for the cells that follow.
data = data1



In [None]:

# Display a single record to see which fields it carries.
data[2]




<p>Priors...</p>



<p>p(label)</p>


In [None]:

# p(label): fraction of all records whose categories include "Children's Books".
prior = sum("Children's Books" in d['categories'] for d in data) / len(data)



In [None]:

# Display the estimated p(label).
prior



In [None]:

# p(not label): complement of the prior.
prior_neg = 1 - prior



In [None]:

# Display the estimated p(not label).
prior_neg




<p>p(wizard in description | Children's Book)</p>


In [None]:

# p("wizard" in description | label): restrict to children's books first,
# then take the share of those whose description contains the substring.
p1 = [b for b in data if "Children's Books" in b['categories']]
p1 = sum("wizard" in b['description'] for b in p1) / len(p1)



In [None]:

# Display p("wizard" in description | label).
p1



In [None]:

# p("wizard" in description | not label): same estimate over the books that
# are NOT tagged "Children's Books".
p1_neg = [b for b in data if "Children's Books" not in b['categories']]
p1_neg = sum("wizard" in b['description'] for b in p1_neg) / len(p1_neg)



In [None]:

# Display p("wizard" in description | not label).
p1_neg




<p>p(witch in description | Children's Book)</p>


In [None]:

# p("witch" in description | label): share of children's books whose
# description contains the substring "witch".
p2 = [b for b in data if "Children's Books" in b['categories']]
p2 = float(sum("witch" in b['description'] for b in p2)) / len(p2)



In [None]:

# Display p("witch" in description | label).
p2



In [None]:

# p("witch" in description | not label): same estimate over the books that
# are NOT tagged "Children's Books".
p2_neg = [b for b in data if "Children's Books" not in b['categories']]
p2_neg = float(sum("witch" in b['description'] for b in p2_neg)) / len(p2_neg)



In [None]:

# Display p("witch" in description | not label).
p2_neg



In [None]:

# Unnormalised class scores: the class prior multiplied by the two per-word
# conditional probabilities (i.e. treating the two words as independent
# given the class).
score_neg = prior_neg * p1_neg * p2_neg
score = prior * p1 * p2



In [None]:

# Ratio of the positive-class score to the negative-class score (> 1 favours the label).
score / score_neg



In [None]:

# Direct empirical estimate of p(label | both words present): keep only the
# books mentioning both substrings, then take the share that are children's books.
p = [b for b in data
     if 'witch' in b['description'] and 'wizard' in b['description']]
p = float(sum("Children's Books" in b['categories'] for b in p)) / len(p)



In [None]:

# Odds form of p, for comparison with the score ratio computed from the factored model.
p / (1 - p)




<p>Classification - Judging a book by its cover</p>


In [None]:

# Switch the working dataset to the image-feature records.
# NOTE(review): `data` is rebound here, so earlier cells that read `data`
# give different results if re-run after this point.
data = data2



In [None]:

# Display the first image record to see which fields it carries.
data[0]



In [None]:

# Display the 'image_feature' entry of the first record (used below as the model input).
data[0]['image_feature']



In [None]:

# Inputs: the 'image_feature' entry of each record.
X = list(map(lambda book: book['image_feature'], data))
# Targets: True when the record's categories include "Children's Books".
y = list(map(lambda book: "Children's Books" in book['categories'], data))



In [None]:

# Split in file order: first half for training, second half for testing.
X_train, X_test = X[:len(X)//2], X[len(X)//2:]
y_train, y_test = y[:len(y)//2], y[len(y)//2:]



In [None]:

# Fit a logistic regression classifier on the training half.
# C is scikit-learn's inverse regularisation strength (1.0 is the library default).
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(X_train, y_train)



In [None]:

# Predicted labels for the training half, then for the test half.
train_predictions, test_predictions = map(mod.predict, (X_train, X_test))



In [None]:

# accuracy: share of test predictions that agree with the true labels
(test_predictions == y_test).sum() / len(y_test)



In [None]:

# Baseline "predictions": one False per test example.
y_false = numpy.array([False] * len(y_test))



In [None]:

# accuracy when predicting "false" always (the baseline the classifier must beat)
(y_false == y_test).sum() / len(y_test)




<h1 id="Diagnostics">Diagnostics<a class="anchor-link" href="#Diagnostics">¶</a></h1>



<p>Data from:
<a href="https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data">https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data</a></p>


In [None]:

# Open the bankruptcy ARFF file; the following cells read the header and the
# data rows from this same handle.
# The path is spelled "/Users" (capital U) like the other data paths in this
# notebook; the lowercase "/users" spelling only resolves on case-insensitive
# filesystems.
# NOTE(review): hard-coded, user-specific path, and the handle is never closed.
f = open("/Users/lizhaoyi/data/bankruptcy.arff", 'r')



In [None]:

# Skip the ARFF header: consume lines up to and including the '@data' marker.
# iter(f.readline, '') stops at end of file, where readline() returns ''.
# A plain `while not '@data' in f.readline()` loop would spin forever there,
# because the empty string never contains the marker.
for header_line in iter(f.readline, ''):
    if '@data' in header_line:
        break
else:
    raise ValueError("no '@data' marker found before end of file")



In [None]:

# Parse the remaining lines of the file into rows of
# [1, feature_1, ..., feature_k, label].
dataset = []
for l in f:
    l = l.strip()
    if not l or l.startswith('%'):
        # Blank lines and ARFF comment lines ('%') carry no data; without this
        # guard float('') raises ValueError on e.g. a trailing empty line.
        continue
    if '?' in l: # Missing entry
        continue
    l = l.split(',')
    # The leading constant 1 is presumably an offset/bias feature.
    values = [1] + [float(x) for x in l]
    values[-1] = values[-1] > 0 # Convert to bool
    dataset.append(values)



In [None]:

# Display the first parsed row: constant 1, the float features, then the boolean label.
dataset[0]




<p>Data setup</p>


In [None]:

# Every column except the last is an input feature; the last column is the label.
X = [features for *features, _ in dataset]
y = [label for *_, label in dataset]




<p>Fit model</p>


In [None]:

# Fit an unweighted logistic regression on the full dataset.
# C is scikit-learn's inverse regularisation strength.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(X,y)



In [None]:

# Predict on the same data the model was fitted on.
pred = mod.predict(X)



In [None]:

# Accuracy on the data the model was fitted on: share of predictions equal to the label.
sum(pred == y) / len(y)



In [None]:

# Number of positive (True) labels.
sum(y)



In [None]:

# Total number of labelled examples.
len(y)



In [None]:

# Accuracy of a trivial classifier that always predicts False:
# 1 - (number of positives / number of examples).
# Computed from `y` so the figure stays correct if the dataset or the
# filtering changes, rather than relying on counts copied from earlier outputs.
1 - sum(y) / len(y)



In [None]:

# Number of examples the unweighted model predicts as positive.
sum(pred)




<p>Balanced model</p>


In [None]:

# Same model, but with class_weight='balanced': scikit-learn then weights each
# class inversely to its frequency in the training labels.
mod = linear_model.LogisticRegression(C=1.0, class_weight = 'balanced')



In [None]:

# Fit the class-weighted model on the full dataset.
mod.fit(X,y)



In [None]:

# Predict on the same data the class-weighted model was fitted on.
pred = mod.predict(X)



In [None]:

# Accuracy of the class-weighted model on the data it was fitted on.
sum(pred == y) / len(y)



In [None]:

# How many positive predictions does the class-weighted model make?
# (Compare with the count from the unweighted model.)
sum(pred)




<p>Train/validation/test splits</p>


In [None]:

# Pair each feature row with its label so both are shuffled together.
Xy = list(zip(X,y))



In [None]:

# Shuffle with a dedicated, fixed-seed generator so that a fresh
# "Restart & Run All" always produces the same train/validation/test split
# (and therefore the same metrics), without touching the global random state.
random.Random(0).shuffle(Xy)



In [None]:

# Unzip the shuffled pairs back into parallel feature and label lists.
X = [features for features, _ in Xy]
y = [label for _, label in Xy]



In [None]:

# Total number of examples available for splitting.
N = len(y)



In [None]:

# Split sizes: fixed-size training and validation sets; the test set takes
# whatever remains. Deriving Ntest from N keeps the three sizes consistent
# with the actual dataset instead of hard-coding the remainder.
Ntrain = 1000
Nvalid = 1000
Ntest = N - Ntrain - Nvalid



In [None]:

# Contiguous slices of the shuffled features: train, then validation, then the rest as test.
Xtrain, Xvalid, Xtest = X[:Ntrain], X[Ntrain:Ntrain+Nvalid], X[Ntrain+Nvalid:]



In [None]:

# Matching slices of the shuffled labels: train, then validation, then the rest as test.
ytrain, yvalid, ytest = y[:Ntrain], y[Ntrain:Ntrain+Nvalid], y[Ntrain+Nvalid:]



In [None]:

# Refit the current `mod` on the training split only.
# NOTE(review): when cells run top to bottom, `mod` is the class_weight='balanced' model.
mod.fit(Xtrain, ytrain)




<p>TP, TN, FP, FN, Accuracy, BER</p>


In [None]:

# Predicted labels for the held-out test split.
pred = mod.predict(Xtest)



In [None]:

# Element-wise comparison of predictions and true labels: True where the prediction is right.
correct = pred == ytest



In [None]:

# Per-example boolean masks for the four confusion-matrix cells.
TP_ = numpy.logical_and(pred, ytest)  # predicted positive, actually positive
FP_ = numpy.logical_and(pred, numpy.logical_not(ytest))  # predicted positive, actually negative
TN_ = numpy.logical_and(numpy.logical_not(pred), numpy.logical_not(ytest))  # predicted negative, actually negative
FN_ = numpy.logical_and(numpy.logical_not(pred), ytest)  # predicted negative, actually positive



In [None]:

# Collapse each mask into a count of true/false positives/negatives.
TP, FP, TN, FN = (sum(mask) for mask in (TP_, FP_, TN_, FN_))



In [None]:

# accuracy from the confusion counts: correct predictions over all predictions
(TP + TN) / (TP + FP + TN + FN)



In [None]:

# Same accuracy computed directly from the `correct` mask, as a cross-check.
sum(correct) / len(correct)



In [None]:

# BER (balanced error rate): 1 minus the mean of the true positive rate
# TP / (TP + FN) and the true negative rate TN / (TN + FP).
1 - 0.5*(TP / (TP + FN) + TN / (TN + FP))




<p>Ranking</p>


In [None]:

# Real-valued confidence score per test example from the fitted model
# (higher means more confidently positive), used below to rank the examples.
scores = mod.decision_function(Xtest)



In [None]:

# Display the raw confidence scores.
scores



In [None]:

# Pair each confidence score with the true label of the same test example.
scoreslabels = list(zip(scores, ytest))



In [None]:

# Rank the (score, label) pairs from most to least confident.
scoreslabels = sorted(scoreslabels, reverse=True)



In [None]:

# Display the ranked (score, label) pairs.
# NOTE(review): this prints one entry per test example; a slice such as [:10] would keep the output short.
scoreslabels



In [None]:

# True labels in ranked order (scores dropped).
sortedlabels = [label for _, label in scoreslabels]



In [None]:

# Display the labels in ranked order.
# NOTE(review): this prints one entry per test example; a slice such as [:10] would keep the output short.
sortedlabels



In [None]:

# Ingredients for precision and recall:
#   retrieved    - number of examples predicted positive
#   relevant     - number of examples that are actually positive
#   intersection - number that are both (the true positives)
retrieved = sum(pred)
relevant = sum(ytest)
intersection = sum([truth and guess for truth, guess in zip(ytest, pred)])



In [None]:

# precision: share of the predicted positives that are actually positive
intersection / retrieved



In [None]:

# recall: share of the actual positives that were predicted positive
intersection / relevant



In [None]:

# precision at 10: share of the ten highest-scoring test examples that are actually positive
sum(sortedlabels[:10]) / 10

