In [1]:
#imports
#from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np
from sklearn.model_selection import KFold, train_test_split, cross_val_score
import os
import glob
import pandas as pd
import string
from sklearn.linear_model import LinearRegression, LogisticRegression
from numpy import mean, absolute, sqrt
import random

gnb = GaussianNB()
mnb = MultinomialNB(alpha=1.0)

In [2]:
#open data, view as a list of strings, remove stop words
directory = 'dataset-news'
trainingListSW = []
with open('stop_words_english.txt','r') as stopWordsFile:
    stopWords = stopWordsFile.read().splitlines()
    stopWords = [word.translate(str.maketrans('','',string.punctuation)) for word in stopWords]
    #a_string.translate(str.maketrans('', '', string.punctuation))

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    with open(f) as f_input:
        fSplit = str(f_input.read())
        rWords = [w for w in fSplit.split() if w.translate(str.maketrans('','',string.punctuation)).lower() not in stopWords]
        result = ' '.join(rWords)    
        trainingListSW.append(result)

In [3]:
#Vectorize
count_vect = CountVectorizer()
XtrainCountsSW = count_vect.fit_transform(trainingListSW)
XtrainCountsSW.shape

(800, 14755)

In [4]:
#create the y-target array
tlTarget = []
tlTargetDict = {'aut':0,'bas':1,'ele':2,'hoc':3,'ibm':4,'mac':5,'mot':6,'pol':7}
#auto, baseball, electronics, hockey, ibm, mac, moto, guns. 0,1,2,3,4,5,6,7

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    tlTarget.append( tlTargetDict[filename[:3] ] )

In [6]:
#gaussian naive bayes
XTrainSW = XtrainCountsSW.todense()

gnb.fit(XTrainSW, tlTarget)

y_predGNB_SW = gnb.predict(XTrainSW)
print ("Number of mislabeled points out of a total %d points : %d" % \
       (XTrainSW.shape[0],(tlTarget != y_predGNB_SW).sum()))

Number of mislabeled points out of a total 800 points : 27


In [7]:
#multinomial naive bayes
mnb.fit(XtrainCountsSW, tlTarget)

y_predMNB_SW = mnb.predict(XtrainCountsSW)

print ("Number of mislabeled points out of a total %d points : %d" % \
       (XtrainCountsSW.shape[0],(tlTarget != y_predMNB_SW).sum()))

Number of mislabeled points out of a total 800 points : 35


In [8]:
#10 folds KFold
kf = KFold(n_splits=10)

#x is XtrainCountsSW
#y is tlTarget

#XTrain

counter = 1
for train, test in kf.split(XtrainCountsSW):
    print ("%d Fold" % counter)
    print ("%s %s" % (train, test))
    #x_train, x_test = XtrainCountsSW[train], train[test]
    #y_train, y_test = tlTarget[train], tlTarget[test]
    print (XtrainCountsSW[train])
    print (XtrainCountsSW[test])
    counter += 1

1 Fold
[ 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97
  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259
 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
 314 315 316 317 318 319 320 321 322 323 324

In [9]:
#scores and average score (accuracy) 
scoresGNB = cross_val_score(gnb, XTrainSW, tlTarget, scoring='accuracy', cv=kf, n_jobs=-1)
scoresMNB = cross_val_score(mnb, XtrainCountsSW, tlTarget, scoring='accuracy', cv=kf, n_jobs=-1)
#trainingListSW

print('GNB scores: %s \nGNB average score: %s \nMNB scores: %s \nMNB average score: %s' % \
      (scoresGNB, mean(scoresGNB), scoresMNB, mean(scoresMNB)))

GNB scores: [0.1125 0.4375 0.4625 0.4625 0.2375 0.075  0.375  0.275  0.3125 0.1125] 
GNB average score: 0.28625 
MNB scores: [0.0375 0.3875 0.425  0.3875 0.1    0.025  0.375  0.35   0.325  0.175 ] 
MNB average score: 0.25875


In [36]:
#random classifier

rAssign = []
#[word.translate(str.maketrans('','',string.punctuation)) for word in stopWords]
#rWords = [w for w in fSplit.split() if w.translate(str.maketrans('','',string.punctuation)).lower() not in stopWords]
c1 = 0
accuracy = 0
for item in trainingListSW:
    rAssign.append( random.randrange(0,7) )
    if rAssign[c1] == tlTarget[c1]:
        accuracy += 1
    c1 += 1

print ("Number of mislabeled points out of a total %d points : %d" % \
       ( len( trainingListSW ), len( trainingListSW ) - accuracy ) )
print('random accuracy: ', accuracy / len( trainingListSW ) )


Number of mislabeled points out of a total 800 points : 696
random accuracy:  0.13


In [10]:
#open data, view as a list of strings, DONT remove stop words
trainingListNSW = []

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    with open(f) as f_input:
        trainingListNSW.append(f_input.read())
    
#vectorize
count_vect = CountVectorizer()
XtrainCountsNSW = count_vect.fit_transform(trainingListNSW)
XtrainCountsNSW.shape

(800, 14842)

In [11]:
#gnb (no stop words)
XTrainNSW = XtrainCountsNSW.todense()

gnb.fit(XTrainNSW, tlTarget)

y_predGNB_NSW = gnb.predict(XTrainNSW)
print ("Number of mislabeled points out of a total %d points : %d" % \
(XTrainNSW.shape[0],(tlTarget != y_predGNB_NSW).sum()))

Number of mislabeled points out of a total 800 points : 27


In [13]:
#mnb (no stop words)
mnb.fit(XtrainCountsNSW, tlTarget)

y_predMNB_NSW = mnb.predict(XtrainCountsNSW)

print ("Number of mislabeled points out of a total %d points : %d" % \
       (XtrainCountsNSW.shape[0],(tlTarget != y_predMNB_NSW).sum()))

Number of mislabeled points out of a total 800 points : 37


In [14]:
#kfolds again (no stop words)
counter = 1
for train, test in kf.split(XtrainCountsNSW):
    print ("%d Fold" % counter)
    print ("%s %s" % (train, test))
    #x_train, x_test = XtrainCountsSW[train], train[test]
    #y_train, y_test = tlTarget[train], tlTarget[test]
    print (XtrainCountsNSW[train])
    print (XtrainCountsNSW[test])
    counter += 1

1 Fold
[ 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97
  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259
 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
 314 315 316 317 318 319 320 321 322 323 324

In [16]:
#scores and average score (accuracy) again (no stop words)
scoresGNBNSW = cross_val_score(gnb, XTrainNSW, tlTarget, scoring='accuracy', cv=kf, n_jobs=-1)
scoresMNBNSW = cross_val_score(mnb, XtrainCountsNSW, tlTarget, scoring='accuracy', cv=kf, n_jobs=-1)

print('GNB scores: %s \nGNB average score: %s \nMNB scores: %s \nMNB average score: %s' % \
      (scoresGNBNSW, mean(scoresGNBNSW), scoresMNBNSW, mean(scoresMNBNSW)))

GNB scores: [0.1125 0.425  0.4625 0.4625 0.2625 0.075  0.3625 0.275  0.3    0.1125] 
GNB average score: 0.285 
MNB scores: [0.0125 0.175  0.125  0.375  0.0375 0.     0.2125 0.0875 0.2    0.025 ] 
MNB average score: 0.12499999999999997


In [18]:
#part D, task 1: guns hockey mac
#tlTargetDictPdT1 = {'aut':0,'bas':1,'ele':2,'hoc':3,'ibm':4,'mac':5,'mot':6,'pol':7}
tlTargetDictPdT1 = {'hoc':0,'mac':1,'pol':2}
tlTargetPdT1 = []
directory = 'dataset-news'
trainingListPdT1 = []
#stopWordsFile, stopWords

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if filename[:3] in tlTargetDictPdT1:
        tlTargetPdT1.append( tlTargetDictPdT1[filename[:3] ] ) 
    else:
        continue
    with open(f) as f_input:
        fSplit = str(f_input.read())
        rWords = [w for w in fSplit.split() if w.translate(str.maketrans('','',string.punctuation)).lower() not in stopWords]
        result = ' '.join(rWords)    
        trainingListPdT1.append(result)  

#count_vect = CountVectorizer()
XtrainCountsPdT1 = count_vect.fit_transform(trainingListPdT1)
#XtrainCountsPdT1.shape

XTrainPdT1 = XtrainCountsPdT1.todense()
gnb.fit(XTrainPdT1, tlTargetPdT1)
yPdT1 = gnb.predict(XTrainPdT1)
#y_pred = mnb.predict(iris.data)

print ("Number of mislabeled points out of a total %d points : %d" % \
       (XTrainPdT1.shape[0],(tlTargetPdT1 != yPdT1).sum())) 

scoresPdT1 = cross_val_score(gnb, XTrainPdT1, yPdT1, scoring='accuracy', cv=kf, n_jobs=-1)

print('MNB scores: %s \nMNB average score: %s' % \
      (scoresPdT1, mean(scoresPdT1)))

Number of mislabeled points out of a total 300 points : 4
MNB scores: [0.96666667 0.96666667 0.83333333 0.9        0.86666667 0.86666667
 0.96666667 0.96666667 0.83333333 0.8       ] 
MNB average score: 0.8966666666666668


In [19]:
#part D, task 2: mac, ibm, elec 
#tlTargetDictPdT1 = {'aut':0,'bas':1,'ele':2,'hoc':3,'ibm':4,'mac':5,'mot':6,'pol':7}
tlTargetDictPdT2 = {'ibm':0,'mac':1,'ele':2}
tlTargetPdT2 = []
directory = 'dataset-news'
trainingListPdT2 = []
#stopWordsFile, stopWords

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if filename[:3] in tlTargetDictPdT2:
        tlTargetPdT2.append( tlTargetDictPdT2[filename[:3] ] ) 
    else:
        continue
    with open(f) as f_input:
        fSplit = str(f_input.read())
        rWords = [w for w in fSplit.split() if w.translate(str.maketrans('','',string.punctuation)).lower() not in stopWords]
        result = ' '.join(rWords)    
        trainingListPdT2.append(result)  

#count_vect = CountVectorizer()
XtrainCountsPdT2 = count_vect.fit_transform(trainingListPdT2)
#XtrainCountsPdT1.shape
        
XTrainPdT2 = XtrainCountsPdT2.todense()    
mnb.fit(XTrainPdT2, tlTargetPdT2)
yPdT2 = mnb.predict(XTrainPdT2)
#y_pred = mnb.predict(iris.data)

print ("Number of mislabeled points out of a total %d points : %d" % \
       (XTrainPdT2.shape[0],(tlTargetPdT2 != yPdT2).sum())) 

scoresPdT2 = cross_val_score(mnb, XTrainPdT2, yPdT2, scoring='accuracy', cv=kf, n_jobs=-1)

print('MNB scores: %s \nMNB average score: %s' % \
      (scoresPdT2, mean(scoresPdT2)))

Number of mislabeled points out of a total 300 points : 17
MNB scores: [0.63333333 0.6        0.36666667 0.6        0.7        0.73333333
 0.76666667 0.73333333 0.66666667 0.8       ] 
MNB average score: 0.6599999999999999


Answers:
a) Gaussian average accuracy: 28.625%. Multinomial average accuracy: 25.875%
b) Since we have 8 topics, it'll be ~12.5%. My calculated value using randrange resulted in 13% which is reasonable (though this will change each time the program is run due to my implementation). Therefore, the gnb and mnb methods are more than doubly accurate than random selection
c) After removing stop words, accuracy worsens only slightly to 28.5% for Gaussian and worsens substantially to 12.5% for Multinomial. This is likely because the addition of stop words that do not carry any additional meaning distracts the classifier with additional noise, reducing its effectiveness as it attempts to change the prediction such that it makes meaning out of meaningless information. 
d) Guns + hockey + Mac results in a significantly more accurate prediction (89.7%) as opposed to the mac + IBM + electronics (66%). This is likely because guns, hockey, and mac are entirely different subjects which will have much more distinct vocabulary in their respective articles. For instance, words like "rink", "skate", and "penalty" will appear in almost every hockey article while rarely appearing in the other two. The same can be said about specific vocabulary applied to the other categories. Such distinct categories are ideal for the model as it makes the difference in vectors much more distinct, allowing the model to more easily fit the data. Meanwhile, mac and IBM both refer to companies and products relating to computers, while computers are a category of the broad topic of electronics. It's even possible that articles in each of those categories not only use exactly the same vocabulary, but even words like "mac" might appear in IBM and electronics articles (though likely with less frequency). This makes separating the categories much more difficult for the model, as the data will be much more clustered and close together with more frequent overlaps across categories.  
