# Assignment 2 - Part C: Trying alternative classifiers

This is a skeleton for trying alternative classifiers on the basketball dataset.

In [99]:
import csv

We can define, as done in Practicum 6, a data loading in a way to obtain the attributes set and class labels for each the training and the test sets.

In [100]:
ATTRS = ["LOCATION", "W", "FINAL_MARGIN", "SHOT_NUMBER", "PERIOD", "GAME_CLOCK", "SHOT_CLOCK", "DRIBBLES", "TOUCH_TIME",
         "SHOT_DIST", "PTS_TYPE", "CLOSE_DEF_DIST", "SHOT_RESULT"]
ATTRS_WO_CLASS = 12

def load_data(filename):
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    with open(filename, 'rt') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        i = 0
        for row in csvreader:
            if len(row) == ATTRS_WO_CLASS + 1:
                i += 1
                instance = [row[i] for i in range(ATTRS_WO_CLASS)]  # first ATTRS_WO_CLASS values are attributes
                label = row[ATTRS_WO_CLASS]  # (ATTRS_WO_CLASS + 1)th value is the class label
                if i % 3 == 0:  # test instance
                    test_x.append(instance)
                    test_y.append(label)
                else:  # train instance
                    train_x.append(instance)
                    train_y.append(label)
                    
    return train_x, train_y, test_x, test_y

And then we can use it to load the data.

In [101]:
train_x, train_y, test_x, test_y = load_data("data/basketball.train.csv")

Scikit-learn needs that all the attribute values to be numeric. This is, we need to binarize all the non-numeric attribute values, to obtain vectors: records having only numbers. The `DictVectorizer` class provided by scikit-learn allows to do this easily.

In [102]:
from sklearn.feature_extraction import DictVectorizer

Mind that each `train_x` and `test_x` are a list of lists.

We just need to obtain from each a list of dictionaries (as done in previous practica where each record was a dictionary).

In [103]:
dicts_train_x = []
excluded_attrs = ["LOCATION", "W", "PERIOD", "PTS_TYPE"]
for x in train_x:
    d = {}
    for i, attr in enumerate(ATTRS):
        if i < len(ATTRS) - 1: # we removed class from train_x elems
            val = x[i]
            if attr not in excluded_attrs:
                val = float(val)
            d[attr] = val
    dicts_train_x.append(d)

Finally, the `fit_transform` method of the vectorizer binarizes the non-numeric attributes in the list of dictionaries, and returns the vector we need.

In [104]:
vectorizer_train = DictVectorizer()
vec_train_x = vectorizer_train.fit_transform(dicts_train_x).toarray()

We do similarly for vectorizing `test_x`.

In [105]:
dicts_test_x = []
for x in test_x:
    d = {}
    for i, attr in enumerate(ATTRS):
        if i < len(ATTRS) - 1: # we removed class from train_x elems
            val = x[i]
            if attr not in excluded_attrs:
                val = float(val)
            d[attr] = val
    dicts_test_x.append(d)
    
vectorizer_test = DictVectorizer()
vec_test_x = vectorizer_train.fit_transform(dicts_test_x).toarray()

Having `evaluate` defined somewhere, we are ready to learn and apply the model, similarly to Task 3 of Practicum 6. But here, we use the vectors recently obtained for the input sets. E.g., for Naive Bayes classifier:

In [106]:
def evaluate(predictions, true_labels):
    correct = 0
    incorrect = 0
    for i in range(len(predictions)):
        if predictions[i] == true_labels[i]:
            correct += 1
        else:
            incorrect += 1

    print("\tAccuracy:   ", correct / len(predictions))
    print("\tError rate: ", incorrect / len(predictions))
    return correct / len(predictions)

In [107]:
def load_test_data(filename):
    test_x = []  # instances to be tested
    with open(filename, 'rt') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        i = 0
        for row in csvreader:
            if len(row) == ATTRS_WO_CLASS:
                i += 1
                instance = [row[i] for i in range(ATTRS_WO_CLASS)]  
                test_x.append(instance)  
    return test_x

In [108]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier

best_pred = 0
x_prior = 0
y_prior = 0
for x in range(1, 1000):
    print("{} of 999".format(x))
    clf = GaussianNB(priors=[0.001 * x, 1 - (0.001 * x)])
    clf.fit(vec_train_x, train_y)
    predictions = clf.predict(vec_test_x)
    prediction = evaluate(predictions, test_y)
    if prediction:
        if prediction > best_pred:
            best_pred = prediction
            x_prior = 0.001*x
            y_prior = 1 - (0.001*x)

best_pred_ada = 0
learn_rate = 0
for x in range(1, 100):
    print("{} of 99".format(x))
    clf = AdaBoostClassifier(learning_rate=0.001*x)
    clf.fit(vec_train_x, train_y)
    predictions = clf.predict(vec_test_x)
    prediction = evaluate(predictions, test_y)
    if prediction:
        if prediction > best_pred_ada:
            best_pred_ada = prediction
            learn_rate = 0.01*x
test_data_x = load_test_data("data/basketball.test.csv")

dicts_test_x = []
for x in test_data_x:
    d = {}
    for i, attr in enumerate(ATTRS):
        if i < len(ATTRS) - 1: # we removed class from train_x elems
            val = x[i]
            if attr not in excluded_attrs:
                val = float(val)
            d[attr] = val
    dicts_test_x.append(d)
    
vectorizer_test = DictVectorizer()
vec_test_x = vectorizer_train.fit_transform(dicts_test_x).toarray()



if best_pred_ada > best_pred:
    print("AdaBoost using learn_rate={}".format(learn_rate))
    clf = AdaBoostClassifier(learning_rate=learn_rate)
else:
    print("GaussianNB - Parameters [{0}, {1}]".format(x_prior, y_prior))
    clf = GaussianNB(priors=[x_prior, y_prior])
clf.fit(vec_train_x, train_y)
predictions = clf.predict(vec_test_x)

OUTPUT_FILE = "data/classifier_basketball.pred.csv"

print("Best parameters for GaussianNB: priors=[{}, {}]".format(x_prior, y_prior))
print("Best parameter for AdaBoost: learning_rate={}".format(learn_rate))
with open(OUTPUT_FILE, 'w') as output:
    output.write("Id,Target")
    for id, pred in enumerate(predictions):
        output.write("\n{},{}".format(id + 1, pred))
print("Prediction written to {}".format(OUTPUT_FILE))


1 of 999
	Accuracy:    0.4995153864792828
	Error rate:  0.5004846135207173
2 of 999
	Accuracy:    0.4996365398594621
	Error rate:  0.5003634601405379
3 of 999
	Accuracy:    0.4996365398594621
	Error rate:  0.5003634601405379
4 of 999
	Accuracy:    0.4996769243195219
	Error rate:  0.5003230756804782
5 of 999
	Accuracy:    0.4996769243195219
	Error rate:  0.5003230756804782
6 of 999
	Accuracy:    0.49979807769970114
	Error rate:  0.5002019223002988
7 of 999
	Accuracy:    0.49979807769970114
	Error rate:  0.5002019223002988
8 of 999
	Accuracy:    0.49979807769970114
	Error rate:  0.5002019223002988
9 of 999
	Accuracy:    0.4998384621597609
	Error rate:  0.5001615378402391
10 of 999
	Accuracy:    0.49979807769970114
	Error rate:  0.5002019223002988
11 of 999
	Accuracy:    0.49979807769970114
	Error rate:  0.5002019223002988
12 of 999
	Accuracy:    0.49979807769970114
	Error rate:  0.5002019223002988
13 of 999
	Accuracy:    0.49975769323964137
	Error rate:  0.5002423067603586
14 of 999
	Acc

	Error rate:  0.4967692431952185
111 of 999
	Accuracy:    0.503311525724901
	Error rate:  0.49668847427509893
112 of 999
	Accuracy:    0.5035134480251999
	Error rate:  0.49648655197480007
113 of 999
	Accuracy:    0.5035538324852596
	Error rate:  0.49644616751474036
114 of 999
	Accuracy:    0.503674985865439
	Error rate:  0.49632501413456104
115 of 999
	Accuracy:    0.5040788304660366
	Error rate:  0.4959211695339633
116 of 999
	Accuracy:    0.5042403683062757
	Error rate:  0.4957596316937243
117 of 999
	Accuracy:    0.5044019061465148
	Error rate:  0.4955980938534852
118 of 999
	Accuracy:    0.5047653662870527
	Error rate:  0.49523463371294724
119 of 999
	Accuracy:    0.5052095953477101
	Error rate:  0.4947904046522898
120 of 999
	Accuracy:    0.5050480575074712
	Error rate:  0.4949519424925289
121 of 999
	Accuracy:    0.5052095953477101
	Error rate:  0.4947904046522898
122 of 999
	Accuracy:    0.5055730554882482
	Error rate:  0.4944269445117519
123 of 999
	Accuracy:    0.5054115176480

	Accuracy:    0.5484613520717228
	Error rate:  0.4515386479282772
219 of 999
	Accuracy:    0.5483805831516033
	Error rate:  0.45161941684839674
220 of 999
	Accuracy:    0.5491882723527987
	Error rate:  0.45081172764720134
221 of 999
	Accuracy:    0.5495921169533964
	Error rate:  0.4504078830466037
222 of 999
	Accuracy:    0.5500363460140538
	Error rate:  0.44996365398594623
223 of 999
	Accuracy:    0.550520959534771
	Error rate:  0.44947904046522896
224 of 999
	Accuracy:    0.5516517244164445
	Error rate:  0.44834827558355544
225 of 999
	Accuracy:    0.5520151845569825
	Error rate:  0.44798481544301755
226 of 999
	Accuracy:    0.5523786446975204
	Error rate:  0.4476213553024796
227 of 999
	Accuracy:    0.5528228737581778
	Error rate:  0.44717712624182215
228 of 999
	Accuracy:    0.5534286406590744
	Error rate:  0.4465713593409256
229 of 999
	Accuracy:    0.5534690251191342
	Error rate:  0.44653097488086585
230 of 999
	Accuracy:    0.5533882561990147
	Error rate:  0.4466117438009854
231

	Accuracy:    0.5813746870204345
	Error rate:  0.41862531297956546
325 of 999
	Accuracy:    0.5810919958000161
	Error rate:  0.41890800419998386
326 of 999
	Accuracy:    0.5814150714804943
	Error rate:  0.4185849285195057
327 of 999
	Accuracy:    0.5816977627009127
	Error rate:  0.41830223729908733
328 of 999
	Accuracy:    0.5817785316210322
	Error rate:  0.4182214683789678
329 of 999
	Accuracy:    0.5814958404006139
	Error rate:  0.41850415959938614
330 of 999
	Accuracy:    0.5815766093207334
	Error rate:  0.4184233906792666
331 of 999
	Accuracy:    0.5816977627009127
	Error rate:  0.41830223729908733
332 of 999
	Accuracy:    0.5814150714804943
	Error rate:  0.4185849285195057
333 of 999
	Accuracy:    0.581657378240853
	Error rate:  0.4183426217591471
334 of 999
	Accuracy:    0.5815766093207334
	Error rate:  0.4184233906792666
335 of 999
	Accuracy:    0.5810112268798966
	Error rate:  0.4189887731201034
336 of 999
	Accuracy:    0.5806881511994184
	Error rate:  0.41931184880058153
337 o

	Accuracy:    0.5749535578709313
	Error rate:  0.4250464421290687
433 of 999
	Accuracy:    0.5747516355706325
	Error rate:  0.4252483644293676
434 of 999
	Accuracy:    0.5749535578709313
	Error rate:  0.4250464421290687
435 of 999
	Accuracy:    0.574832404490752
	Error rate:  0.42516759550924804
436 of 999
	Accuracy:    0.5742670220499152
	Error rate:  0.4257329779500848
437 of 999
	Accuracy:    0.5737420240691382
	Error rate:  0.4262579759308618
438 of 999
	Accuracy:    0.5736208706889588
	Error rate:  0.4263791293110411
439 of 999
	Accuracy:    0.5736208706889588
	Error rate:  0.4263791293110411
440 of 999
	Accuracy:    0.5733785639286003
	Error rate:  0.42662143607139974
441 of 999
	Accuracy:    0.5732170260883612
	Error rate:  0.4267829739116388
442 of 999
	Accuracy:    0.5732574105484209
	Error rate:  0.42674258945157906
443 of 999
	Accuracy:    0.5730554882481221
	Error rate:  0.42694451175187786
444 of 999
	Accuracy:    0.5730958727081819
	Error rate:  0.4269041272918181
445 of 

	Accuracy:    0.5648978273160488
	Error rate:  0.43510217268395124
541 of 999
	Accuracy:    0.5650997496163477
	Error rate:  0.4349002503836524
542 of 999
	Accuracy:    0.5649382117761086
	Error rate:  0.43506178822389147
543 of 999
	Accuracy:    0.5648170583959292
	Error rate:  0.4351829416040707
544 of 999
	Accuracy:    0.5644939827154511
	Error rate:  0.4355060172845489
545 of 999
	Accuracy:    0.5643728293352718
	Error rate:  0.4356271706647282
546 of 999
	Accuracy:    0.564170907034973
	Error rate:  0.43582909296502703
547 of 999
	Accuracy:    0.5642516759550925
	Error rate:  0.43574832404490754
548 of 999
	Accuracy:    0.5640497536547936
	Error rate:  0.43595024634520635
549 of 999
	Accuracy:    0.5641305225749131
	Error rate:  0.4358694774250868
550 of 999
	Accuracy:    0.5640901381148534
	Error rate:  0.4359098618851466
551 of 999
	Accuracy:    0.563968984734674
	Error rate:  0.4360310152653259
552 of 999
	Accuracy:    0.5637670624343752
	Error rate:  0.43623293756562476
553 of

	Accuracy:    0.5568209353040949
	Error rate:  0.443179064695905
648 of 999
	Accuracy:    0.5568613197641548
	Error rate:  0.44313868023584524
649 of 999
	Accuracy:    0.5569017042242145
	Error rate:  0.44309829577578547
650 of 999
	Accuracy:    0.5567401663839755
	Error rate:  0.44325983361602456
651 of 999
	Accuracy:    0.5566190130037961
	Error rate:  0.4433809869962039
652 of 999
	Accuracy:    0.5568209353040949
	Error rate:  0.443179064695905
653 of 999
	Accuracy:    0.5567805508440352
	Error rate:  0.4432194491559648
654 of 999
	Accuracy:    0.556659397463856
	Error rate:  0.4433406025361441
655 of 999
	Accuracy:    0.5565382440836766
	Error rate:  0.4434617559163234
656 of 999
	Accuracy:    0.5564978596236169
	Error rate:  0.4435021403763832
657 of 999
	Accuracy:    0.5564170907034973
	Error rate:  0.4435829092965027
658 of 999
	Accuracy:    0.5562151684031984
	Error rate:  0.44378483159680154
659 of 999
	Accuracy:    0.5562555528632582
	Error rate:  0.44374444713674177
660 of 9

	Accuracy:    0.5528228737581778
	Error rate:  0.44717712624182215
756 of 999
	Accuracy:    0.5528228737581778
	Error rate:  0.44717712624182215
757 of 999
	Accuracy:    0.5524997980776997
	Error rate:  0.4475002019223003
758 of 999
	Accuracy:    0.5522171068572813
	Error rate:  0.4477828931427187
759 of 999
	Accuracy:    0.5523786446975204
	Error rate:  0.4476213553024796
760 of 999
	Accuracy:    0.5522574913173411
	Error rate:  0.4477425086826589
761 of 999
	Accuracy:    0.5520151845569825
	Error rate:  0.44798481544301755
762 of 999
	Accuracy:    0.5516113399563848
	Error rate:  0.4483886600436152
763 of 999
	Accuracy:    0.5514901865762055
	Error rate:  0.44850981342379453
764 of 999
	Accuracy:    0.5514901865762055
	Error rate:  0.44850981342379453
765 of 999
	Accuracy:    0.5515305710362652
	Error rate:  0.44846942896373476
766 of 999
	Accuracy:    0.5513690331960261
	Error rate:  0.44863096680397385
767 of 999
	Accuracy:    0.5511671108957273
	Error rate:  0.44883288910427266
76

	Accuracy:    0.5346094822712221
	Error rate:  0.465390517728778
863 of 999
	Accuracy:    0.5341652532105646
	Error rate:  0.4658347467894354
864 of 999
	Accuracy:    0.5336806396898474
	Error rate:  0.46631936031015264
865 of 999
	Accuracy:    0.5332767950892496
	Error rate:  0.46672320491075037
866 of 999
	Accuracy:    0.533397948469429
	Error rate:  0.46660205153057105
867 of 999
	Accuracy:    0.533397948469429
	Error rate:  0.46660205153057105
868 of 999
	Accuracy:    0.5330748727889508
	Error rate:  0.4669251272110492
869 of 999
	Accuracy:    0.5331152572490105
	Error rate:  0.4668847427509894
870 of 999
	Accuracy:    0.5329941038688313
	Error rate:  0.4670058961311687
871 of 999
	Accuracy:    0.5321460302075761
	Error rate:  0.4678539697924239
872 of 999
	Accuracy:    0.5317421856069784
	Error rate:  0.46825781439302155
873 of 999
	Accuracy:    0.5315402633066796
	Error rate:  0.4684597366933204
874 of 999
	Accuracy:    0.5311364187060819
	Error rate:  0.4688635812939181
875 of 9

	Accuracy:    0.5049672885873516
	Error rate:  0.49503271141264843
971 of 999
	Accuracy:    0.5049269041272918
	Error rate:  0.4950730958727082
972 of 999
	Accuracy:    0.5048865196672321
	Error rate:  0.495113480332768
973 of 999
	Accuracy:    0.5048057507471125
	Error rate:  0.49519424925288746
974 of 999
	Accuracy:    0.5048461352071723
	Error rate:  0.49515386479282775
975 of 999
	Accuracy:    0.5047653662870527
	Error rate:  0.49523463371294724
976 of 999
	Accuracy:    0.5047653662870527
	Error rate:  0.49523463371294724
977 of 999
	Accuracy:    0.5048057507471125
	Error rate:  0.49519424925288746
978 of 999
	Accuracy:    0.504724981826993
	Error rate:  0.495275018173007
979 of 999
	Accuracy:    0.5044826750666344
	Error rate:  0.49551732493336564
980 of 999
	Accuracy:    0.5046038284468136
	Error rate:  0.4953961715531863
981 of 999
	Accuracy:    0.5046038284468136
	Error rate:  0.4953961715531863
982 of 999
	Accuracy:    0.5045634439867539
	Error rate:  0.4954365560132461
983 of

	Accuracy:    0.5949842500605766
	Error rate:  0.4050157499394233
80 of 99
	Accuracy:    0.5947419433002181
	Error rate:  0.40525805669978193
81 of 99
	Accuracy:    0.595105403440756
	Error rate:  0.404894596559244
82 of 99
	Accuracy:    0.5952265568209353
	Error rate:  0.4047734431790647
83 of 99
	Accuracy:    0.595105403440756
	Error rate:  0.404894596559244
84 of 99
	Accuracy:    0.5953477102011147
	Error rate:  0.4046522897988854
85 of 99
	Accuracy:    0.5955092480413536
	Error rate:  0.4044907519586463
86 of 99
	Accuracy:    0.5963169372425491
	Error rate:  0.4036830627574509
87 of 99
	Accuracy:    0.59615539940231
	Error rate:  0.40384460059769
88 of 99
	Accuracy:    0.5961957838623697
	Error rate:  0.40380421613763023
89 of 99
	Accuracy:    0.595630401421533
	Error rate:  0.404369598578467
90 of 99
	Accuracy:    0.5963977061626686
	Error rate:  0.40360229383733137
91 of 99
	Accuracy:    0.5961957838623697
	Error rate:  0.40380421613763023
92 of 99
	Accuracy:    0.596276552782489