In [5]:
import numpy as np

# Tiny hand-made dataset for sanity-checking the classifier.
# Every row is one example with four categorical features; the label of a
# training row sits at the same position in y_train.
x_train = [
    [2, 1, 1, 1],  # class 0
    [0, 2, 0, 1],  # class 1
    [1, 1, 0, 1],  # class 0
    [1, 0, 0, 0],  # class 1
    [2, 2, 0, 1],  # class 0
]
y_train = [0, 1, 0, 1, 0]

# Two held-out examples and their labels.
x_test = [
    [2, 2, 1, 1],  # class 0
    [0, 2, 0, 0],  # class 1
]
y_test = [0, 1]

In [6]:
# MLE of prior p(y=i) = n(i)/N -- not normalizing here, just counting numerator
def p_y(y):
    """Count how often each class label occurs in ``y``.

    This is the numerator n(i) of the MLE prior p(y=i) = n(i)/N; the counts
    are deliberately left unnormalized.

    Parameters
    ----------
    y : iterable of int
        Class labels, expected to be non-negative integers.

    Returns
    -------
    list of int
        ``counts[i]`` is the number of occurrences of label ``i``. The list
        has ``max(y) + 1`` entries, so a label that never occurs gets a
        count of 0. An empty ``y`` yields an empty list.
    """
    labels = list(y)
    if not labels:
        return []
    # Size the table by the largest label, not by the number of distinct
    # labels: with gaps in the labels (e.g. [0, 2]) the latter is too short
    # and indexing by label raises IndexError.
    class_priors = [0] * (max(labels) + 1)
    for c in labels:
        class_priors[c] += 1
    return class_priors

In [7]:
# MLE of likelihood p(x=j|y=i) = n(i,j)/n(i)
# Default epsilon smoothing with e=0.001

def p_xy(x, y, py, epsilon=0.001):
    """Estimate the likelihood tables p(x_f = v | y = c) by counting.

    Parameters
    ----------
    x : sequence of sequences
        Training examples; every row holds one categorical value per feature.
    y : sequence
        Class label of each row of ``x`` (same length as ``x``).
    py : indexable
        Per-class example counts, indexed by class label (as returned by
        ``p_y``); used as the denominator n(c).
    epsilon : float, optional
        Value stored for a (class, feature, value) combination that never
        occurs in the training data.

    Returns
    -------
    dict
        ``out[c][f][v]`` is ``count(c, f, v) / py[c]`` when the count is
        positive and ``epsilon`` otherwise. Every value seen for feature
        ``f`` anywhere in ``x`` has an entry for every class. The tables are
        not renormalized after epsilon is filled in.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have different lengths.
    """
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(len(x), len(y)))

    n_features = len(x[0]) if len(x) > 0 else 0

    # The set of observed values of a feature does not depend on the class,
    # so compute it once per feature instead of once per (class, feature).
    feature_values = [set(row[f] for row in x) for f in range(n_features)]

    # dict (over classes) of dict (over features) of dict (value -> count)
    outdict = {
        c: {f: {v: 0 for v in feature_values[f]} for f in range(n_features)}
        for c in y
    }

    # fill dict with counts
    for row, label in zip(x, y):
        for f, value in enumerate(row):
            outdict[label][f][value] += 1

    # normalize by the class count, or fill in epsilon for zero counts
    for c, per_feature in outdict.items():
        for counts in per_feature.values():
            for value, n in counts.items():
                # only existing keys are reassigned, so iterating is safe
                counts[value] = n / py[c] if n > 0 else epsilon

    return outdict


In [8]:
import math

def predict(x, pc, pxc, epsilon=0.001):
    """Score one example under every class by multiplying probabilities.

    For each class index ``c`` the normalized prior ``pc[c] / sum(pc)`` is
    multiplied by the likelihood ``pxc[c][j][v]`` of every feature value
    ``v`` at position ``j`` of ``x``. A value that is missing from the
    likelihood table contributes ``epsilon`` instead.

    Returns a tuple ``(scores, best)``: ``scores`` lists the product for
    each class index, ``best`` is the ``np.argmax`` over those scores.
    """
    scores = []
    for c in range(len(pc)):
        # start from the normalized class prior
        score = pc[c] / sum(pc)
        for position, value in enumerate(x):
            table = pxc[c][position]
            # unseen feature values fall back to epsilon
            factor = table[value] if value in table else epsilon
            score = score * factor
        scores.append(score)
    best = np.argmax([scores])
    return scores, best
        
    
    
def log_predict(x, pc, pxc, epsilon=0.001):
    """Score one example under every class by summing log-probabilities.

    For each class index ``c`` the log of the normalized prior
    ``pc[c] / sum(pc)`` is added to the log-likelihood ``log(pxc[c][j][v])``
    of every feature value ``v`` at position ``j`` of ``x``. A value that is
    missing from the likelihood table contributes ``log(epsilon)`` instead.
    Adding logs instead of multiplying probabilities avoids underflow when
    many small factors are combined.

    Returns a tuple ``(scores, best)``: ``scores`` lists the log-score for
    each class index, ``best`` is the ``np.argmax`` over those scores.
    """
    scores = []
    for c in range(len(pc)):
        # start from the log of the normalized class prior
        score = math.log(pc[c] / sum(pc))
        for position, value in enumerate(x):
            table = pxc[c][position]
            # unseen feature values fall back to epsilon
            factor = table[value] if value in table else epsilon
            score = score + math.log(factor)
        scores.append(score)
    best = np.argmax([scores])
    return scores, best

In [9]:
# Fit the model on the toy data: class counts first, then the per-class
# likelihood tables, which need those counts as their denominators.
py = p_y(y_train)
pxy = p_xy(x_train,y_train,py)

In [11]:
# Classify every toy test example with both scoring variants and print the
# (scores, predicted class) tuples one after the other for comparison.
for x in x_test:
    print(predict(x, py, pxy))
    print(log_predict(x, py, pxy))

([0.04444444444444443, 1.0000000000000001e-07], 0)
([-3.113515309210375, -16.118095650958317], 0)
([1.333333333333333e-07, 0.05], 1)
([-15.830413578506537, -2.995732273553991], 1)


In [12]:
import pandas
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def preprocess(filepath, verbose=True):
    """Load a CSV file, shuffle its rows and integer-encode every column.

    Each column is encoded independently: its distinct values are sorted and
    every value is replaced by its position in that sorted list. The last
    column is treated as the class label, all other columns as features.

    Parameters
    ----------
    filepath : str
        Path of the CSV file, passed to ``pandas.read_csv``.
    verbose : bool, optional
        When true (the default), print a one-line summary with the number of
        examples, features and classes and the per-class example counts.

    Returns
    -------
    (data_x, data_y)
        Feature columns and label column of the shuffled, encoded data.
    """
    data = pandas.read_csv(filepath)
    # Shuffle with a fixed seed so that later positional splits are reproducible.
    data = data.sample(frac=1, random_state=1).reset_index(drop=True)

    # Map every column's values to integers (rank among the sorted distinct values).
    for c in list(data):
        vals = sorted(set(data[c].values))
        vals_dict = {v: i for i, v in enumerate(vals)}
        # Values without an entry in the mapping are passed through unchanged.
        data[c] = data[c].map(lambda s: vals_dict.get(s, s))

    data_y = data.iloc[:, -1]
    data_x = data.iloc[:, :-1]

    assert data_y.shape[0] == data_x.shape[0]

    # Only report when asked to; previously `verbose` was accepted but ignored.
    if verbose:
        labels = list(data_y)
        print("N: {}    Feats: {}    Classes: {}    Balance: {}".format(
            data_x.shape[0],
            data_x.shape[1],
            len(set(labels)),
            [(i, labels.count(i)) for i in set(labels)]))
    return data_x, data_y

In [13]:
def split_data(x,y,trainportion=0.9):
    """Cut ``x`` and ``y`` into a leading training part and a trailing test part.

    The first ``int(len(x) * trainportion)`` rows form the training part and
    the remaining rows the test part; no shuffling happens here. The number
    of training rows is printed. Each returned piece has its index reset to
    start at 0.

    Returns ``(x_train, y_train, x_test, y_test)``.
    """
    ntrain = int(float(len(x)) * trainportion)
    print("{} training examples".format(ntrain))

    def _renumbered(part):
        # drop the old index so each piece is numbered from 0 again
        return part.reset_index(drop=True)

    head = slice(None, ntrain)
    tail = slice(ntrain, None)
    return _renumbered(x[head]), _renumbered(y[head]), _renumbered(x[tail]), _renumbered(y[tail])

In [14]:
from sklearn import metrics
def prf1(pred,true):
    """Precision, recall, F-score and support of ``pred`` against ``true``.

    Thin wrapper around ``metrics.precision_recall_fscore_support`` that
    takes the predictions first and the gold labels second; the result of
    that call is returned unchanged.
    """
    scores = metrics.precision_recall_fscore_support(y_true=true, y_pred=pred)
    return scores

In [21]:
# Load and integer-encode the Portuguese student data (all features).
print("\nportugese student data")
bc_x, bc_y = preprocess('student-por_assignment.csv')

# Split 80/20 by position and convert the pandas objects to plain lists,
# which is what p_y / p_xy / log_predict operate on.
x_tr, y_tr, x_te, y_te = split_data(bc_x, bc_y, 0.8)
bx_train = x_tr.values.tolist()
by_train = y_tr.values.tolist()

bx_test = x_te.values.tolist()
by_test = y_te.values.tolist()

# Estimate the model parameters: class counts and per-class likelihood tables.
py = p_y(by_train)
pxy = p_xy(bx_train,by_train,py)

# Predict on the training data. This measures fit, not generalization
# (hence "bad!" in the heading below).
print("\nevaluation on training data (bad!)")

correct = 0
preds = []
for idx, i in enumerate(bx_train):
    # element [1] of the returned tuple is the predicted class index
    prediction = log_predict(i,py, pxy)[1]
    correct = correct + int(prediction==by_train[idx])
    preds.append(prediction)
precf1 = prf1(preds, by_train)

# precf1[0..2] are precision, recall and F-score as returned by prf1
print("accuracy: {}\npr {}\nre {}\nf1 {}".format(correct / len(bx_train),
                                                precf1[0],
                                                precf1[1],
                                                precf1[2]))

# Predict on the held-out test data; same procedure as above.
print("\nevaluation on test data (good!)")

correct = 0
preds = []
for idx, i in enumerate(bx_test):
    prediction = log_predict(i,py, pxy)[1]
    correct = correct + int(prediction==by_test[idx])
    preds.append(prediction)
precf1 = prf1(preds, by_test)

print("accuracy: {}\npr {}\nre {}\nf1 {}".format(correct / len(bx_test),
                                                precf1[0],
                                                precf1[1],
                                                precf1[2]))


portugese student data
N: 649    Feats: 29    Classes: 6    Balance: [(0, 65), (1, 17), (2, 112), (3, 154), (4, 201), (5, 100)]
519 training examples

evaluation on training data (bad!)
accuracy: 0.5202312138728323
pr [0.39285714 0.6        0.5        0.5        0.53216374 0.62666667]
re [0.44       0.69230769 0.45054945 0.49586777 0.5617284  0.57317073]
f1 [0.41509434 0.64285714 0.47398844 0.49792531 0.54654655 0.59872611]

evaluation on test data (good!)
accuracy: 0.36153846153846153
pr [0.27272727 0.         0.2        0.37931034 0.44680851 0.38095238]
re [0.2        0.         0.19047619 0.33333333 0.53846154 0.44444444]
f1 [0.23076923 0.         0.19512195 0.35483871 0.48837209 0.41025641]


In [18]:
# load the maths data

# print("\nmaths student data")
# bc_mat, bc_mat = preprocess('student-mat_assignment.csv')

# split data and re-format to list
# # bx_mat = x_tr.values.tolist()
# by_mat = y_tr.values.tolist()


# predict on maths
# print("\nevaluation on maths data (train on portugese)")

# correct = 0
# preds = []
# # for idx, i in enumerate(bx_mat):
#     prediction = predict(i,py, pxy)[1]
#     correct = correct + int(prediction==by_mat[idx])
#     preds.append(prediction)
# precf1 = prf1(preds, by_mat)

# print("accuracy: {}\npr {}\nre {}\nf1 {}".format(correct / len(bx_mat),
#                                                 precf1[0],
#                                                 precf1[1],
#                                                 precf1[2]))

In [20]:
# "Safe" version of the data set: a conservatively reduced feature set
# (presumably with the sensitive attributes removed -- confirm against the file).
# NOTE: this cell rebinds bc_x/bc_y, the bx_*/by_* lists and py/pxy, so the
# model estimated on the full-feature data is overwritten after it runs.
# Load and integer-encode the Portuguese student data.
print("\nportugese student data")
bc_x, bc_y = preprocess('student-por_safe2.csv')

# Split 80/20 by position and convert the pandas objects to plain lists,
# which is what p_y / p_xy / log_predict operate on.
x_tr, y_tr, x_te, y_te = split_data(bc_x, bc_y, 0.8)
bx_train = x_tr.values.tolist()
by_train = y_tr.values.tolist()

bx_test = x_te.values.tolist()
by_test = y_te.values.tolist()

# Estimate the model parameters: class counts and per-class likelihood tables.
py = p_y(by_train)
pxy = p_xy(bx_train,by_train,py)

# Predict on the training data. This measures fit, not generalization
# (hence "bad!" in the heading below).
print("\nevaluation on training data (bad!)")

correct = 0
preds = []
for idx, i in enumerate(bx_train):
    # element [1] of the returned tuple is the predicted class index
    prediction = log_predict(i,py, pxy)[1]
    correct = correct + int(prediction==by_train[idx])
    preds.append(prediction)
precf1 = prf1(preds, by_train)

# precf1[0..2] are precision, recall and F-score as returned by prf1
print("accuracy: {}\npr {}\nre {}\nf1 {}".format(correct / len(bx_train),
                                                precf1[0],
                                                precf1[1],
                                                precf1[2]))

# Predict on the held-out test data; same procedure as above.
print("\nevaluation on test data (good!)")

correct = 0
preds = []
for idx, i in enumerate(bx_test):
    prediction = log_predict(i,py, pxy)[1]
    correct = correct + int(prediction==by_test[idx])
    preds.append(prediction)
precf1 = prf1(preds, by_test)

print("accuracy: {}\npr {}\nre {}\nf1 {}".format(correct / len(bx_test),
                                                precf1[0],
                                                precf1[1],
                                                precf1[2]))


portugese student data
N: 649    Feats: 6    Classes: 6    Balance: [(0, 65), (1, 17), (2, 112), (3, 154), (4, 201), (5, 100)]
519 training examples

evaluation on training data (bad!)
accuracy: 0.4258188824662813
pr [0.33333333 1.         0.33333333 0.38709677 0.42857143 0.63076923]
re [0.02       0.07692308 0.27472527 0.59504132 0.5        0.5       ]
f1 [0.03773585 0.14285714 0.30120482 0.46905537 0.46153846 0.55782313]

evaluation on test data (good!)
accuracy: 0.35384615384615387
pr [0.         0.         0.25       0.33333333 0.38888889 0.42105263]
re [0.         0.         0.19047619 0.39393939 0.53846154 0.44444444]
f1 [0.         0.         0.21621622 0.36111111 0.4516129  0.43243243]


**Question 5**

- We don't want to use attributes like sex or the mother's education to judge an applicant's ability. The judgement should be based purely on objective performance / attitude features (see next point).
- I used attributes 1,11,14,15,21,30: school, reason, studytime, failures, higher, absences
- The test performance hardly changes at all; for some classes it even goes up. The fit to the training data actually decreases, so the model shows less *overfitting*.
- Why isn't deleting the sensitive features enough? Because sensitive features (e.g., sex) correlate with others (e.g., studytime, if we assume that girls traditionally had to help out more in the household and thus had less time to study), so the remaining features can still leak the sensitive information.


