# Lab 9: Document Analysis

## Load Data

In [1]:
import os   
print(os.getcwd())

d:\Programming\Python_code\PrinciplesOfDS_Course\Labs


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

data = pd.read_csv('./Data/BBC_News_Train.csv')

In [3]:
# Inspect data
print(data.head())
print(data.info())
print(data.shape)

print(1490*.2) # 298 A viable split value 

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB
None
(1490, 3)
298.0


In [4]:
# split data
data_train, data_test = data.iloc[:1191,], data.iloc[1192:,]
print(data_train.shape, data_test.shape) # looks good 

(1191, 3) (298, 3)


## Represent Docs with TF-IDF 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

#TF-IDF representation for each document
vectorizer = TfidfVectorizer()
data_train_vectors = vectorizer.fit_transform(data_train.Text)
data_test_vectors = vectorizer.transform(data_test.Text) 

print(data_train_vectors.shape, data_test_vectors.shape) 

(1191, 22453) (298, 22453)


In [8]:
# inspect contents of vectorized documents
print(type(data_train_vectors))
print(len(vectorizer.get_feature_names_out())) # 22,453
print(vectorizer.get_feature_names_out()) 
# cols = terms, rows = docs 


<class 'scipy.sparse._csr.csr_matrix'>
22453
['00' '000' '000bn' ... 'zuluaga' 'zurich' 'zvonareva']


## KNN Document classification: 


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score


Xtr = data_train_vectors
Ytr = data_train.Category

Xte = data_test_vectors
Yte = data_test.Category

k_range = range(1, 5)
param_grid = dict(n_neighbors=k_range)

clf_knn =  KNeighborsClassifier(n_neighbors=1)

grid = GridSearchCV(clf_knn, param_grid, cv=5, scoring='accuracy')
grid.fit(Xtr, Ytr)

print(grid.best_score_)
print(grid.best_params_)

0.9101613867304245
{'n_neighbors': 4}


In [10]:
from sklearn.metrics import classification_report
# running prediction and inspecting a classification report 
Yte_pred = grid.predict(Xte)
print(classification_report(Yte, Yte_pred))


               precision    recall  f1-score   support

     business       0.94      0.85      0.89        68
entertainment       0.93      0.89      0.91        64
     politics       0.84      0.90      0.87        59
        sport       0.94      0.98      0.96        60
         tech       0.94      0.98      0.96        47

     accuracy                           0.92       298
    macro avg       0.92      0.92      0.92       298
 weighted avg       0.92      0.92      0.92       298



#### Discussion of KNN: 
The data is fairly well balanced, with only techn being substantially lower in the number of samples available. The calssifier is pretty effective, which is in stark contrast to the KNN's results when used on the larger data set in the example provided to us for this lab. 


## Logistic Regression Document Classification

In [53]:
from sklearn.linear_model import LogisticRegression

#=====training with cross validation======
coeff = range(1, 10)
param_grid = dict(C=coeff)

clf_lr = LogisticRegression(penalty='l2')

grid = GridSearchCV(clf_lr, param_grid, cv=5, scoring='accuracy')
grid.fit(Xtr, Ytr)

print(grid.best_params_)

#=====testing======
y_pred = grid.predict(Xte) # predicting with grid uses its best_estimator 

acc = accuracy_score(Yte, y_pred)
macro_f1 = f1_score(Yte, y_pred, average='macro')
micro_f1 = f1_score(Yte, y_pred, average='micro')

print(acc, macro_f1, micro_f1)

{'C': 8}
0.9731543624161074 0.9722015242878761 0.9731543624161074


In [54]:
# classification report:
print(classification_report(Yte, y_pred))

               precision    recall  f1-score   support

     business       0.99      0.99      0.99        68
entertainment       0.98      0.95      0.97        64
     politics       0.96      0.93      0.95        59
        sport       1.00      1.00      1.00        60
         tech       0.92      1.00      0.96        47

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298



## Kmeans Clustering for Document Classification

In [55]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score

In [56]:
# Manual search for best hyperparameters using Kmeans: 
test_k = [4,5,6]
test_n_init = [20, 30, 50, 100]
test_init = ['k-means++', 'random']

best_nmi = 0
best_k = None
best_n_init = None
best_init = None

for k in test_k:
    for n in test_n_init:
        for init in test_init:
            temp_cluster = KMeans(n_clusters=k, init=init, n_init=n).fit(Xtr)
            temp_nmi = normalized_mutual_info_score(temp_cluster.labels_, Ytr, average_method='arithmetic')
            print(f"NMI for n_clusters = {k}, init = {init}, n_init = {n} ==> {temp_nmi}")
            if(temp_nmi > best_nmi):
                best_nmi = temp_nmi
                best_k = k
                best_n_init = n
                best_init = init

print("Best of each category:\nBest nmi {}\nBest k cluster {}\nBest n_init {}\nBest init {}".format(best_nmi, best_k, best_n_init, best_init))

NMI for n_clusters = 4, init = k-means++, n_init = 20 ==> 0.46109288716601216
NMI for n_clusters = 4, init = random, n_init = 20 ==> 0.5875981209449965
NMI for n_clusters = 4, init = k-means++, n_init = 30 ==> 0.7302443838730827
NMI for n_clusters = 4, init = random, n_init = 30 ==> 0.6756584824765176
NMI for n_clusters = 4, init = k-means++, n_init = 50 ==> 0.6695005818206236
NMI for n_clusters = 4, init = random, n_init = 50 ==> 0.61130035211045
NMI for n_clusters = 4, init = k-means++, n_init = 100 ==> 0.558337446537098
NMI for n_clusters = 4, init = random, n_init = 100 ==> 0.63475405486156
NMI for n_clusters = 5, init = k-means++, n_init = 20 ==> 0.7411070400696318
NMI for n_clusters = 5, init = random, n_init = 20 ==> 0.5583172344880103
NMI for n_clusters = 5, init = k-means++, n_init = 30 ==> 0.5762626722615242
NMI for n_clusters = 5, init = random, n_init = 30 ==> 0.6203761908596722
NMI for n_clusters = 5, init = k-means++, n_init = 50 ==> 0.6664425704257937
NMI for n_clusters 

KMeans seems quite sensitive to the initialization state, I found an NMI of .82 on 5 clusters once with 50 = n_init, though its not consistantly able to produce that high result. So higher n_init == better outcome here. 

In [66]:
# training KMeans after finding best params (higher n_init seems necessary to have higher probability of a good result):
km_cluster = KMeans(n_clusters=5, init='k-means++', n_init=1000).fit(Xtr)
km_nmi = normalized_mutual_info_score(km_cluster.labels_, Ytr, average_method='arithmetic')
print(km_nmi)

0.7742912350393736


In [67]:
# Find top 10 representative words per cluster: 
centroids = km_cluster.cluster_centers_
feature_names = vectorizer.get_feature_names_out()

for i in range(5):
    indx = centroids[i].argsort()[-10:][::-1]
    top_features = [feature_names[index] for index in indx]
    print(f"Cluster {i+1}: {top_features}")
    #print(f"Values at indices: {centroids[i][indx]}")

Cluster 1: ['the', 'and', 'to', 'in', 'film', 'of', 'best', 'for', 'on', 'was']
Cluster 2: ['the', 'to', 'of', 'and', 'mr', 'he', 'in', 'labour', 'said', 'election']
Cluster 3: ['the', 'to', 'of', 'and', 'in', 'that', 'is', 'it', 'are', 'mobile']
Cluster 4: ['the', 'to', 'in', 'and', 'of', 'he', 'his', 'we', 'but', 'it']
Cluster 5: ['the', 'to', 'of', 'in', 'and', 'said', 'its', 'that', 'it', 'for']
