In [1]:
import pandas
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
epsilon = 1e-12
train_path = "ag_news_csv/train2.csv"
train_text = pandas.read_csv(train_path, header=None)
train_body = train_text.iloc[:,2] # we are just using the body of the news for train
train_category = train_text.iloc[:,0]-1  # train categories of the news clusters
train_vectorizer = TfidfVectorizer()
train_body = train_vectorizer.fit_transform(train_body).todense()
categories = list(set(train_category))
ndocs = train_body.shape[0]
valid_indices = np.random.choice(ndocs, int(0.1*ndocs), replace = False)
train_indices = list(map(int, set(range(ndocs)).difference(set(valid_indices))))

In [2]:
test_path = "ag_news_csv/test2.csv"
test_text = pandas.read_csv(test_path, header=None)
test_body = test_text.iloc[:,2] # we are just using the body of the news for test
test_category = test_text.iloc[:,0]-1 # test categories of the news clusters
test_vectorizer = TfidfVectorizer(vocabulary = train_vectorizer.get_feature_names(), use_idf=False)
test_body = test_vectorizer.fit_transform(test_body).multiply(train_vectorizer.idf_).tocsr().todense()

In [3]:
def accuracy_calc(exp, rel):
    acc =  (sum(rel==exp)/len(rel))
    print ("Accuracy is " + str(acc) + ".")
    F1 = []
    for category in categories:
        tp = sum((exp == rel) & (exp == category))
        t = sum(exp == category)
        fn = sum((exp != category) & (rel == category))
        p = tp/t
        r = tp/(tp+fn)
        F1.append(2*p*r/(p+r))
        print("Category" + str(category) + " : Precision is " + str(p) + " & Recall is "+ str(r)+ " & F1 is " + str(F1[-1])+".")
    print("Overall F1 score is "+ str(sum(F1)/4)+".")

In [8]:
def find_category_knn_train(req, k):    
    distances = []
    for row_index in train_indices:
        distances.append(np.linalg.norm(req - train_body[row_index,]))
    distance_indices = np.ndarray.tolist(np.argsort(distances)[0:k])
    indices = [val for index, val in enumerate(train_indices) if index in distance_indices]
    category = sp.stats.mode(train_category[indices])[0][0]
    return (category)
def find_category_knn(req, k):    
    distances = [0] * train_body.shape[0]
    for row_index in range(0, train_body.shape[0]):
        distances[row_index] = np.linalg.norm(req - train_body[row_index,])
    category = sp.stats.mode(train_category[np.argsort(distances)[0:k]])[0][0]
    return (category)

In [70]:
for k in [1, 5, 10]:
    knn_categories = np.asarray([find_category_knn_train(train_body[doc_index,], k) for doc_index in valid_indices])
    print("accuracy for k = " + str(k) + " is :" + str(accuracy_calc(knn_categories, train_category[valid_indices])))

Accuracy is 0.835.
Category0 : Precision is 0.7818181818181819 & Rcall is 0.8269230769230769 & F1 is 0.8037383177570092.
Category1 : Precision is 0.9130434782608695 & Rcall is 0.8811188811188811 & F1 is 0.896797153024911.
Category2 : Precision is 0.83125 & Rcall is 0.8417721518987342 & F1 is 0.8364779874213837.
Category3 : Precision is 0.8248175182481752 & Rcall is 0.7902097902097902 & F1 is 0.8071428571428572.
Overall F1 score is 0.8360390788365402.
accuracy for k = 1 is :None
Accuracy is 0.8566666666666667.
Category0 : Precision is 0.8353658536585366 & Rcall is 0.8782051282051282 & F1 is 0.85625.
Category1 : Precision is 0.8823529411764706 & Rcall is 0.9440559440559441 & F1 is 0.9121621621621621.
Category2 : Precision is 0.8282208588957055 & Rcall is 0.8544303797468354 & F1 is 0.8411214953271028.
Category3 : Precision is 0.8916666666666667 & Rcall is 0.7482517482517482 & F1 is 0.8136882129277566.
Overall F1 score is 0.8558054676042554.
accuracy for k = 5 is :None
Accuracy is 0.863333

In [10]:
knn_categories = np.asarray([find_category_knn(test_body[doc_index,], 10) for doc_index in range(test_body.shape[0])])
print("accuracy for k = " + str(10) + " is :" + str(accuracy_calc(knn_categories, test_category)))

Accuracy is 0.8416666666666667.
Category0 : Precision is 0.8639455782312925 & Recall is 0.8466666666666667 & F1 is 0.8552188552188553.
Category1 : Precision is 0.863905325443787 & Recall is 0.9733333333333334 & F1 is 0.9153605015673981.
Category2 : Precision is 0.7763975155279503 & Recall is 0.8333333333333334 & F1 is 0.8038585209003215.
Category3 : Precision is 0.8699186991869918 & Recall is 0.7133333333333334 & F1 is 0.7838827838827839.
Overall F1 score is 0.8395801653923397.
accuracy for k = 5 is :None


In [4]:
#rows:classes, columns:words, cells:(mean, variance)
mv = np.zeros((len(categories), train_body.shape[1], 2))
for category in categories:   
    mv[category, :, 0] = np.mean(train_body[train_category == category], axis = 0)
    mv[category, :, 1] = np.var(train_body[train_category == category], axis = 0)
mv[mv[:,:,1] == 0] = epsilon

In [5]:
def find_category_bayes(req):
    log_probs = [0] * len(categories)
    for category in categories:
        log_probs[category] = np.sum(sp.stats.norm(mv[category, :, 0],mv[category, :, 1]).logpdf(req))
    mle = log_probs.index(max(log_probs))
    return (mle)

In [6]:
bayes_categories = np.asarray([find_category_bayes(test_body[doc_index,]) for doc_index in range(test_body.shape[0])])

In [7]:
accuracy_calc(bayes_categories, test_category)

Accuracy is 0.7816666666666666.
Category0 : Precision is 0.76 & Recall is 0.76 & F1 is 0.76.
Category1 : Precision is 0.8827586206896552 & Recall is 0.8533333333333334 & F1 is 0.8677966101694914.
Category2 : Precision is 0.7803030303030303 & Recall is 0.6866666666666666 & F1 is 0.7304964539007092.
Category3 : Precision is 0.7167630057803468 & Recall is 0.8266666666666667 & F1 is 0.7678018575851393.
Overall F1 score is 0.7815237304138349.


In [14]:
from sklearn import svm
for c in [2,1.5,1,0.5]:
    svm_clf = svm.LinearSVC(C = c)
    svm_clf.fit(X = train_body[train_indices,:], y = train_category[train_indices])
    svm_categories = np.asarray(svm_clf.predict(train_body[valid_indices]))
    print("C = "+ str(c)+ ":")
    accuracy_calc(svm_categories ,train_category[valid_indices])

C = 2:
Accuracy is 0.8633333333333333.
Category0 : Precision is 0.8433734939759037 & Recall is 0.8805031446540881 & F1 is 0.8615384615384616.
Category1 : Precision is 0.9241379310344827 & Recall is 0.9178082191780822 & F1 is 0.9209621993127147.
Category2 : Precision is 0.8273381294964028 & Recall is 0.8098591549295775 & F1 is 0.8185053380782918.
Category3 : Precision is 0.86 & Recall is 0.8431372549019608 & F1 is 0.8514851485148515.
Overall F1 score is 0.8631227868610798.
C = 1.5:
Accuracy is 0.8666666666666667.
Category0 : Precision is 0.844311377245509 & Recall is 0.8867924528301887 & F1 is 0.8650306748466258.
Category1 : Precision is 0.9241379310344827 & Recall is 0.9178082191780822 & F1 is 0.9209621993127147.
Category2 : Precision is 0.8394160583941606 & Recall is 0.8098591549295775 & F1 is 0.8243727598566308.
Category3 : Precision is 0.8609271523178808 & Recall is 0.8496732026143791 & F1 is 0.8552631578947367.
Overall F1 score is 0.866407197977677.
C = 1:
Accuracy is 0.87333333333

In [15]:
svm_clf = svm.LinearSVC(C = 0.5)
svm_clf.fit(X = train_body, y = train_category)
svm_categories = np.asarray(svm_clf.predict(test_body))
print("C = "+ str(c)+ ":")
accuracy_calc(svm_categories ,test_category)

C = 0.5:
Accuracy is 0.8433333333333334.
Category0 : Precision is 0.8657718120805369 & Recall is 0.86 & F1 is 0.8628762541806019.
Category1 : Precision is 0.8980891719745223 & Recall is 0.94 & F1 is 0.9185667752442996.
Category2 : Precision is 0.7625 & Recall is 0.8133333333333334 & F1 is 0.7870967741935484.
Category3 : Precision is 0.8507462686567164 & Recall is 0.76 & F1 is 0.8028169014084507.
Overall F1 score is 0.8428391762567251.


In [11]:
from sklearn.ensemble import RandomForestClassifier
rfc_clf = RandomForestClassifier(n_estimators=200, max_depth=6,
                            random_state=1)
rfc_clf.fit(train_body, train_category)
rfc_categories = np.asarray(rfc_clf.predict(test_body))

In [12]:
accuracy_calc(rfc_categories, test_category)

Accuracy is 0.7383333333333333.
Category0 : Precision is 0.7887323943661971 & Recall is 0.7466666666666667 & F1 is 0.767123287671233.
Category1 : Precision is 0.81875 & Recall is 0.8733333333333333 & F1 is 0.8451612903225806.
Category2 : Precision is 0.7205882352941176 & Recall is 0.6533333333333333 & F1 is 0.6853146853146852.
Category3 : Precision is 0.6296296296296297 & Recall is 0.68 & F1 is 0.6538461538461539.
Overall F1 score is 0.7378613542886631.


In [92]:
ttrain_category

0       2
1       2
2       2
3       2
4       2
5       2
6       2
7       2
8       2
9       2
10      2
11      2
12      2
13      2
14      2
15      2
17      2
18      2
19      2
20      2
21      2
22      2
23      2
24      2
26      2
27      2
29      2
30      2
31      2
32      2
       ..
5968    1
5970    1
5971    1
5972    1
5974    1
5975    1
5976    1
5977    1
5978    1
5979    1
5980    1
5981    1
5982    1
5983    1
5984    1
5985    1
5986    1
5987    1
5988    1
5989    1
5990    1
5991    1
5992    1
5993    1
5994    1
5995    1
5996    1
5997    1
5998    1
5999    1
Name: 0, Length: 5400, dtype: int64

In [117]:
train_indices

[0,
 1,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 104,
 105,
 106,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 144,
 145,
 146,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 162,
 163,
 164,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 194,
 195,
 196,
 197,
 198,
 200,
 201,

In [118]:
ttrain_category

0       2
1       2
3       2
4       2
5       2
6       2
7       2
8       2
9       2
10      2
11      2
12      2
13      2
14      2
15      2
17      2
18      2
19      2
21      2
22      2
23      2
24      2
25      2
26      2
27      2
28      2
29      2
30      2
31      2
32      2
       ..
5967    1
5968    1
5969    1
5970    1
5971    1
5972    1
5973    1
5974    1
5975    1
5976    1
5977    1
5979    1
5980    1
5981    1
5982    1
5983    1
5984    1
5986    1
5987    1
5988    1
5989    1
5990    1
5991    1
5993    1
5994    1
5995    1
5996    1
5997    1
5998    1
5999    1
Name: 0, Length: 5400, dtype: int64

In [122]:
ttrain_body.shape

(5400,)

In [35]:
train_indices

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 11,
 12,
 13,
 14,
 15,
 17,
 18,
 19,
 20,
 22,
 23,
 24,
 25,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 145,
 146,
 147,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 159,
 160,
 161,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197