# Lab 9: Document Analysis

## Load Data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups

# Load the train and test splits of the 20 Newsgroups corpus. The `remove`
# argument strips headers, footers and quoted text from every post.
data_train, data_test = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)

# Integer class label per training document, and the label -> name lookup.
print(f"Train data target labels: {data_train.target}")
print(f"Train data target names: {data_train.target_names}")

# Number of documents in each split.
print(f'#training samples: {len(data_train.data)}')
print(f'#testing samples: {len(data_test.data)}')

Train data target labels: [7 4 4 ... 3 1 8]
Train data target names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
#training samples: 11314
#testing samples: 7532


In [3]:
import pandas as pd
# Distinct integer labels and distinct category names in the training split.
unique_label_ids = set(data_train.target)
unique_label_names = set(data_train.target_names)

print(unique_label_ids)  # 20 categories
print(unique_label_names)
print(len(unique_label_names))  # 20 here as well.

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
{'sci.space', 'soc.religion.christian', 'sci.med', 'sci.crypt', 'talk.politics.guns', 'comp.os.ms-windows.misc', 'comp.graphics', 'misc.forsale', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'rec.motorcycles', 'talk.religion.misc', 'comp.windows.x', 'alt.atheism', 'talk.politics.misc', 'rec.sport.baseball', 'sci.electronics', 'talk.politics.mideast', 'rec.autos', 'rec.sport.hockey'}
20


## Represent Docs with TF-IDF 


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler


# TF-IDF representation for each document.
# The vectorizer is fitted on the training documents only and then reused,
# unchanged, on the test documents so both matrices share the same columns.
train_docs, test_docs = data_train.data, data_test.data

vectorizer = TfidfVectorizer()
data_train_vectors = vectorizer.fit_transform(train_docs)
data_test_vectors = vectorizer.transform(test_docs)

# Shape of each matrix: (number of documents, number of terms).
print(*(matrix.shape for matrix in (data_train_vectors, data_test_vectors)))

(11314, 101631) (7532, 101631)


In [8]:
# Sanity check on the raw training corpus: document count, then container type.
raw_train_docs = data_train.data
print(len(raw_train_docs), type(raw_train_docs), sep="\n")


11314
<class 'list'>


## KNN Document classification: 


In [None]:
# Inspect the vectorized training documents.
# Rows of `data_train_vectors` are documents, columns are vocabulary terms.
print(type(data_train_vectors))

# Vocabulary size (= number of columns). Counting the fitted vocabulary avoids
# building the full array of feature names just to measure its length; use
# `vectorizer.get_feature_names_out()[:10]` to peek at a few actual terms.
print(len(vectorizer.vocabulary_))  # 101,631 unique terms

# How each value is computed with TfidfVectorizer's default settings:
#   tf(t, d) = raw count of term t in document d
#   idf(t)   = ln((1 + n_docs) / (1 + df(t))) + 1, where df(t) = #docs containing t
#   every row (document) is then rescaled to unit L2 norm.
# Because of that row normalization every entry lies in [0, 1]; an entry is
# exactly 1 only when its document contains a single distinct vocabulary term,
# regardless of how many other documents contain that term.


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score


# Short aliases for the feature matrices and label vectors used by the
# classifiers below.
Xtr, Ytr = data_train_vectors, data_train.target
Xte, Yte = data_test_vectors, data_test.target

# Candidate neighbourhood sizes: k = 1, 2, 3, 4.
k_range = range(1, 5)
param_grid = {"n_neighbors": k_range}

# n_neighbors=1 is only a starting value; the search sets it per candidate.
clf_knn = KNeighborsClassifier(n_neighbors=1)

# 5-fold cross-validated search over k, scored by accuracy.
grid = GridSearchCV(
    estimator=clf_knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(Xtr, Ytr)

# Best mean cross-validated accuracy, followed by the k that achieved it.
print(grid.best_score_)
print(grid.best_params_)

In [None]:
from sklearn.metrics import classification_report
# Show the grid that was searched.
print(param_grid)

# Predict the test labels with the fitted search object, then print the
# per-class precision / recall / F1 report.
Yte_pred = grid.predict(Xte)
report = classification_report(Yte, Yte_pred)
print(report)



In the classification report above, the classes are fairly well balanced (see the support column), and the overall prediction accuracy is poor. Precision does not look bad at a glance, but I believe this is largely an artifact of having so many classes: with 20 possible labels, the chance of producing a false positive for any single class is low. Recall shows the opposite picture: with many classes, a large share of each class's true members are assigned to other classes (false negatives), so the recall scores are quite low.


## Logistic Regression Document Classification
#### Note: after 10 minutes the regression was still running and emitting warnings about the solver not converging. The long runtime is explained by the grid search itself: 9 candidate values of C × 5 cross-validation folds = 45 model fits (plus one final refit), each of which was also running into the convergence warning.


In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

#=====training with cross validation======
# Candidate inverse-regularization strengths C = 1..9. With cv=5 this is
# 9 x 5 = 45 fits, plus one final refit of the best setting on all of Xtr.
coeff = range(1, 10)
param_grid = dict(C=coeff)

# The default max_iter=100 is too low for this high-dimensional TF-IDF
# problem and triggers ConvergenceWarning on every fit; give the solver
# enough iterations to actually converge.
clf_lr = LogisticRegression(penalty='l2', max_iter=1000)

# The fits are independent of each other, so run them in parallel on all
# available cores (n_jobs=-1); this does not change the results.
grid = GridSearchCV(clf_lr, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(Xtr, Ytr)

print(grid.best_params_)

#=====testing======
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so reuse that model instead of
# fitting an identical one a second time.
clf_lr = grid.best_estimator_

y_pred = clf_lr.predict(Xte)

acc = accuracy_score(Yte, y_pred)
macro_f1 = f1_score(Yte, y_pred, average='macro')
micro_f1 = f1_score(Yte, y_pred, average='micro')

# Note: for single-label multiclass data, micro-averaged F1 equals accuracy.
print(acc, macro_f1, micro_f1)