# Lab 9: Document Analysis

## Load Data

In [1]:
import os   
print(os.getcwd())

d:\Programming\Python_code\PrinciplesOfDS_Course\Labs


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

data = pd.read_csv('./Data/BBC_News_Train.csv')

In [3]:
# Inspect data
print(data.head())
print(data.info())
print(data.shape)

print(1490*.2) # 298 A viable split value 

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB
None
(1490, 3)
298.0


In [4]:
# split data
data_train, data_test = data.iloc[:1191,], data.iloc[1192:,]
print(data_train.shape, data_test.shape) # looks good 

(1191, 3) (298, 3)


## Represent Docs with TF-IDF 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

#TF-IDF representation for each document
vectorizer = TfidfVectorizer()
data_train_vectors = vectorizer.fit_transform(data_train.Text)
data_test_vectors = vectorizer.transform(data_test.Text) 

print(data_train_vectors.shape, data_test_vectors.shape) 

(1191, 22453) (298, 22453)


In [8]:
# inspect contents of vectorized documents
print(type(data_train_vectors))
print(len(vectorizer.get_feature_names_out())) # 22,453
print(vectorizer.get_feature_names_out()) 
# cols = terms, rows = docs 


<class 'scipy.sparse._csr.csr_matrix'>
22453
['00' '000' '000bn' ... 'zuluaga' 'zurich' 'zvonareva']


## KNN Document classification: 


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score


Xtr = data_train_vectors
Ytr = data_train.Category

Xte = data_test_vectors
Yte = data_test.Category

k_range = range(1, 5)
param_grid = dict(n_neighbors=k_range)

clf_knn =  KNeighborsClassifier(n_neighbors=1)

grid = GridSearchCV(clf_knn, param_grid, cv=5, scoring='accuracy')
grid.fit(Xtr, Ytr)

print(grid.best_score_)
print(grid.best_params_)

0.9101613867304245
{'n_neighbors': 4}


In [10]:
from sklearn.metrics import classification_report
# running prediction and inspecting a classification report 
Yte_pred = grid.predict(Xte)
print(classification_report(Yte, Yte_pred))


               precision    recall  f1-score   support

     business       0.94      0.85      0.89        68
entertainment       0.93      0.89      0.91        64
     politics       0.84      0.90      0.87        59
        sport       0.94      0.98      0.96        60
         tech       0.94      0.98      0.96        47

     accuracy                           0.92       298
    macro avg       0.92      0.92      0.92       298
 weighted avg       0.92      0.92      0.92       298



#### Discussion of KNN: 
The data is fairly well balanced, with only techn being substantially lower in the number of samples available. The calssifier is pretty effective, which is in stark contrast to the KNN's results when used on the larger data set in the example provided to us for this lab. 


## Logistic Regression Document Classification

In [15]:
from sklearn.linear_model import LogisticRegression

#=====training with cross validation======
coeff = range(1, 10)
param_grid = dict(C=coeff)

clf_lr = LogisticRegression(penalty='l2')

grid = GridSearchCV(clf_lr, param_grid, cv=5, scoring='accuracy')
grid.fit(Xtr, Ytr)

print(grid.best_params_)

#=====testing======
clf_lr = LogisticRegression(penalty='l2', C=grid.best_params_['C']) # fitting new model on entire data set after finding best hyperparameter
clf_lr.fit(Xtr, Ytr)

y_pred = clf_lr.predict(Xte)

acc = accuracy_score(Yte, y_pred)
macro_f1 = f1_score(Yte, y_pred, average='macro')
micro_f1 = f1_score(Yte, y_pred, average='micro')

print(acc, macro_f1, micro_f1)

{'C': 8}
0.9731543624161074 0.9722015242878761 0.9731543624161074


In [17]:
# classification report:
Yte_pred = grid.predict(Xte)
print(classification_report(Yte, Yte_pred)) # interesting I can get the same predictions from two differente objects... learn about why 

print(classification_report(Yte, y_pred))

print(type(clf_lr), type(grid))
# <class 'sklearn.linear_model._logistic.LogisticRegression'> <class 'sklearn.model_selection._search.GridSearchCV'>
# ^^ so maybe the GridSearchCV is wrapping the linear model, Not clear on why we see two instances of fitting happening here though. Should ask for explanations.. 

               precision    recall  f1-score   support

     business       0.99      0.99      0.99        68
entertainment       0.98      0.95      0.97        64
     politics       0.96      0.93      0.95        59
        sport       1.00      1.00      1.00        60
         tech       0.92      1.00      0.96        47

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298

               precision    recall  f1-score   support

     business       0.99      0.99      0.99        68
entertainment       0.98      0.95      0.97        64
     politics       0.96      0.93      0.95        59
        sport       1.00      1.00      1.00        60
         tech       0.92      1.00      0.96        47

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298

<cla

## Kmeans Clustering for Document Classification

In [28]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score

#=====training with cross validation======
n_clusters = [5]
param_grid = [dict(n_clusters=n_clusters), dict(n_init=[5,10,20,30]), dict(init=['k-means++', 'random'])]

clf_kmeans = KMeans()

grid = GridSearchCV(clf_kmeans, param_grid, cv=5, scoring='normalized_mutual_info_score')
grid.fit(Xtr, Ytr)

print(grid.best_params_)





{'n_init': 30}


In [31]:
#=====testing======
y_pred = grid.predict(Xte)
# clf_kmeans = KMeans(n_clusters=grid.best_params_['n_clusters'], n_init=15)
# clf_kmeans.fit(Xtr, Ytr)

# y_pred = clf_kmeans.predict(Xte)

# acc = accuracy_score(Yte, y_pred)
# macro_f1 = f1_score(Yte, y_pred, average='macro')
# micro_f1 = f1_score(Yte, y_pred, average='micro')

# print(acc, macro_f1, micro_f1)

In [33]:
print(grid.best_score_, grid.best_params_)
print(y_pred)


0.6139903533390151 {'n_init': 30}
[1 4 1 1 6 1 4 2 2 3 7 5 1 2 5 0 2 1 1 6 4 3 0 0 4 3 0 4 2 3 5 3 1 7 1 3 4
 4 4 3 5 1 2 7 2 4 3 3 2 4 0 1 2 7 1 0 1 1 1 2 2 0 1 2 7 1 2 6 1 2 1 3 1 2
 2 3 5 1 2 4 7 1 1 1 7 1 3 1 3 1 1 1 2 1 3 5 1 2 1 5 1 3 7 4 5 1 1 1 3 0 1
 3 1 5 1 0 5 1 4 5 1 3 2 3 5 1 3 5 5 0 0 2 1 4 7 4 1 0 1 6 1 4 1 3 2 1 0 1
 2 1 4 3 6 1 1 1 3 2 4 1 2 2 5 3 1 0 4 2 2 1 6 5 1 2 7 5 1 1 5 5 3 1 1 1 1
 0 2 0 0 2 1 7 5 7 3 1 5 1 2 4 6 2 4 1 3 6 2 1 1 3 4 2 1 0 2 4 1 2 4 0 2 3
 2 1 0 7 5 2 2 2 1 0 7 4 2 1 3 1 4 1 0 4 2 2 1 6 5 6 3 6 2 3 2 1 0 1 3 0 1
 1 0 2 2 5 1 4 7 2 2 5 1 1 5 5 7 3 3 5 7 2 2 3 1 0 4 5 1 1 1 3 5 1 1 2 2 1
 6 5]


In [30]:
print(dir(grid))
print(grid.get_params)

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_n_features', '_check_refit_for_multimetric', '_estimator_type', '_format_results', '_get_param_names', '_get_tags', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_run_search', '_select_best_index', '_validate_data', '_validate_params', 'best_estimator_', 'best_index_', 'best_params_', 'best_score_', 'classes_', 'cv', 'cv_results_', 'decision_function', 'error_score', 'estimator', 'fit', 'get_params', 'inverse_transform', 'multimetric_', 'n_features_in_', 'n_jobs', 'n_splits_', 'param_grid', 'pre_dispa

In [41]:
# Having trouble with gridSearchCV for KMeans, going to go for manual approach now
test_k = [4,5,6]
test_n_init = [20, 30, 50, 100]
test_init = ['k-means++', 'random']

best_nmi = 0
best_k = None
best_n_init = None
best_init = None

for k in test_k:
    for n in test_n_init:
        for init in test_init:
            temp_cluster = KMeans(n_clusters=k, init=init, n_init=n).fit(Xtr)
            temp_nmi = normalized_mutual_info_score(temp_cluster.labels_, Ytr, average_method='arithmetic')
            print(f"NMI for n_clusters = {k}, init = {init}, n_init = {n} ==> {temp_nmi}")
            if(temp_nmi > best_nmi):
                best_nmi = temp_nmi
                best_k = k
                best_n_init = n
                best_init = init

print("Best of each category:\nBest nmi {}\nBest k cluster {}\nBest n_init {}\nBest init {}".format(best_nmi, best_k, best_n_init, best_init))

NMI for n_clusters = 4, init = k-means++, n_init = 20 ==> 0.5963024728506824
NMI for n_clusters = 4, init = random, n_init = 20 ==> 0.6694490003239375
NMI for n_clusters = 4, init = k-means++, n_init = 30 ==> 0.5721736023635929
NMI for n_clusters = 4, init = random, n_init = 30 ==> 0.5816922143139323
NMI for n_clusters = 4, init = k-means++, n_init = 50 ==> 0.6313921886154923
NMI for n_clusters = 4, init = random, n_init = 50 ==> 0.6190349706617118
NMI for n_clusters = 4, init = k-means++, n_init = 100 ==> 0.6254957403648274
NMI for n_clusters = 4, init = random, n_init = 100 ==> 0.6953031093168366
NMI for n_clusters = 5, init = k-means++, n_init = 20 ==> 0.6912449958227307
NMI for n_clusters = 5, init = random, n_init = 20 ==> 0.6112186968478724
NMI for n_clusters = 5, init = k-means++, n_init = 30 ==> 0.6548555410504593
NMI for n_clusters = 5, init = random, n_init = 30 ==> 0.6262900231726729
NMI for n_clusters = 5, init = k-means++, n_init = 50 ==> 0.629081225377893
NMI for n_cluste

KMeans seems quite sensitive to the initialization state, I found an NMI of .82 on 5 clusters once with 50 = n_init, though its not consistantly able to produce that high result. So higher n_init == better outcome here. 

In [44]:
# training KMeans after finding best params:
km_cluster = KMeans(n_clusters=5, init='k-means++', n_init=1000).fit(Xtr)
km_nmi = normalized_mutual_info_score(km_cluster.labels_, Ytr, average_method='arithmetic')
print(km_nmi)

0.7568773989425172


In [52]:
# Find top 10 representative words per cluster: 
centroids = km_cluster.cluster_centers_
feature_names = vectorizer.get_feature_names_out()

for i in range(5):
    indx = centroids[i].argsort()[-10:][::-1]
    top_features = [feature_names[index] for index in indx]
    print(f"Cluster {i+1}: {top_features}")

<class 'numpy.ndarray'> ['00' '000' '000bn' ... 'zuluaga' 'zurich' 'zvonareva']
Cluster 1: ['the', 'to', 'of', 'and', 'in', 'that', 'is', 'it', 'are', 'for']
Cluster 2: ['the', 'and', 'in', 'of', 'to', 'film', 'for', 'best', 'on', 'was']
Cluster 3: ['the', 'to', 'in', 'of', 'and', 'said', 'its', 'that', 'it', 'for']
Cluster 4: ['the', 'to', 'of', 'mr', 'and', 'he', 'labour', 'in', 'election', 'blair']
Cluster 5: ['the', 'to', 'in', 'and', 'of', 'he', 'his', 'we', 'but', 'it']


In [None]:
print(type(centroids))

for i in range(5):
    indx = centroids[i].argsort()[:10]
    top_features = [feature_names[index] for index in indx]
    print(f"Cluster {i+1}: {top_features}")
    print(f"Values at indices: {centroids[i][indx]}")

In [49]:
# predicting:
km_y_pred = km_cluster.predict(Xte)
print(km_y_pred)

[1 1 0 2 0 2 1 4 4 4 2 0 2 1 0 3 4 0 0 0 1 4 3 3 1 4 3 1 1 4 0 4 2 2 2 4 1
 1 1 4 0 1 1 3 1 1 4 4 4 1 3 2 4 2 0 3 2 2 2 4 4 3 2 1 2 2 3 0 2 1 3 4 2 0
 4 4 0 2 1 1 2 3 2 4 2 2 4 0 4 3 2 2 1 0 4 0 2 1 2 0 0 4 2 1 0 2 3 2 4 3 0
 4 0 0 2 3 0 2 1 0 2 4 1 4 0 0 4 0 0 3 3 4 3 1 2 1 0 3 4 0 2 1 2 4 1 0 3 2
 1 2 1 4 0 2 2 4 4 1 1 2 1 1 0 4 0 3 1 4 3 0 0 0 0 3 2 0 2 2 0 0 4 2 2 0 2
 3 4 3 3 1 2 2 0 2 4 3 0 2 1 1 0 1 1 2 4 0 2 2 2 4 1 4 2 3 4 1 3 0 1 3 1 4
 1 0 3 2 0 1 4 4 0 3 2 1 1 2 4 3 1 2 3 1 4 4 3 0 0 0 4 0 1 4 1 2 3 2 4 3 2
 2 3 1 3 0 3 1 2 4 0 0 1 3 0 0 2 4 4 0 2 4 0 4 2 3 1 0 3 2 2 4 0 1 2 1 1 2
 0 0]


In [48]:
a = [4,6,7,1,23,7,82,1,28,28,6,26]
print(a)
print(a[-10:])
print(a[10:])

print(a[-10:][::-1]) # from the back, 10 in, then reverse the order of the result. 

print(a[-10::-1]) # so from the back, 10 in, to the end, but reverse the direction, so to the start 

[4, 6, 7, 1, 23, 7, 82, 1, 28, 28, 6, 26]
[7, 1, 23, 7, 82, 1, 28, 28, 6, 26]
[6, 26]
[26, 6, 28, 28, 1, 82, 7, 23, 1, 7]
[7, 6, 4]
