# Document Clustering - Giannoulopoulos George
### Write a program which will parse, analyse and classify the items of the dataset into N (e.g. 10) groups based on their text content

In order to cluster the documents, features first need to be extracted from them. The algorithm used for that is the TF-IDF vectorizer. TfidfVectorizer uses an in-memory vocabulary (a Python dict) to map the most frequent words to feature indices and hence compute a (sparse) word occurrence frequency matrix. The word frequencies are then reweighted using the Inverse Document Frequency (IDF) vector collected feature-wise over the documents. The clustering algorithm used is K-means.

In [1]:
# !pip install et-xmlfile
# !pip freeze

In [2]:
import xml.etree.ElementTree as ET
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [3]:
# Parse the xml file
# The path is relative, so it is resolved against the current working directory.
xml = ET.parse('data.source.rss-feeds.xml')  
# Root element of the parsed tree; the item loop further down reads xml_root[0].
xml_root = xml.getroot()


def strip_html(s):
    """
    (str)->(str)

    Replace every html tag in s with a single space, then collapse each run
    of whitespace (spaces, tabs, newlines) into a single space.

    Leading and trailing whitespace is collapsed to one space, not stripped.
    Raises TypeError when s is not a string (e.g. None).
    """
    # Raw strings keep the regex escapes intact. DOTALL lets a tag that is
    # wrapped over several lines still be matched as one tag.
    no_html = re.sub(r"<.*?>", " ", s, flags=re.DOTALL)
    # \s already covers \n, so a single alternative is enough.
    pretty = re.sub(r"\s+", " ", no_html)

    return pretty


# Build one text document per <item>: its title followed by its html-free body.
doc_list = []
for element in xml_root[0].findall('item'):
    # Children are read by position; presumably child 0 is the title and
    # child 2 the description of an RSS item - TODO confirm against the feed.
    if len(element) < 3:
        continue  # Malformed item: not enough children to read title and body

    title = element[0].text
    text = element[2].text
    # .text is None for an empty tag. Such items are skipped explicitly rather
    # than through a blanket `except TypeError`, which would also hide
    # unrelated bugs raised while cleaning the text.
    if title is None or text is None:
        continue

    text = strip_html(text)
    doc_list.append(title + ' ' + text)

In [4]:
# Number of clusters; passed as n to kmeans_train, top_words and cluster_size.
CLUSTER_NUMBER=3

In [5]:
def tfid_construct(documents):
    """
    (list of str)->(sparse matrix, list of str )

    Construct the TF-IDF document-term matrix for documents, ignoring the
    built-in English stop words.

    Returns the matrix together with the list of feature names (the terms of
    the learned vocabulary).
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    # Learn vocabulary and idf, then build the document-term matrix in one pass.
    tfid_vector = vectorizer.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # Converted to a list so the return type stays the same as before.
        features_names = list(vectorizer.get_feature_names_out())
    else:
        features_names = vectorizer.get_feature_names()

    return tfid_vector, features_names



def kmeans_train(tfid_vector, n=10, random_state=None):
    """
    (sparse matrix, int, int or None)->(ndarray, ndarray)

    Train the kmeans model with n clusters on the train matrix and get the
    centroids and labels.

    random_state is forwarded to KMeans. Pass an int to get the same clusters
    on every run; the default None keeps the previous unseeded behaviour.
    """
    kmeans = KMeans(n_clusters=n, random_state=random_state).fit(tfid_vector)
    kmeans_centroids = kmeans.cluster_centers_
    kmeans_labels = kmeans.labels_

    return kmeans_centroids, kmeans_labels



def top_words(cent, feat, n=10, word_number=6):
    """
    (ndarray, list of str, int, int)->()

    Print the top words for each cluster.

    cent holds one centroid per row and feat is indexed by its column numbers.
    At most n clusters are printed (never more than cent has rows), each with
    its word_number highest-weighted words.
    """
    # Never index past the centroids that actually exist: with the default
    # n=10 and fewer trained clusters the loop would raise IndexError.
    cluster_count = min(n, len(cent))

    # Column indexes of every centroid, ordered from highest to lowest weight
    sorted_centroid_indexes = cent.argsort()[:, ::-1]
    for i in range(cluster_count):
        print("Cluster %d:" % (i+1), end=' ')

        for j in sorted_centroid_indexes[i, :word_number]:
            print(' %s' % feat[j], end=' ')
        print('\n')
        
        
        
# Run the pipeline on the parsed documents:
# 1. build the document-term matrix and the feature names,
vector, features = tfid_construct(doc_list)
# 2. cluster the matrix into CLUSTER_NUMBER groups,
centroids, labels = kmeans_train(vector, n=CLUSTER_NUMBER)
# 3. print the top words of every cluster.
top_words(centroids, features, n=CLUSTER_NUMBER)

Cluster 1:  nbsp  graph  image  title  amp  countries 

Cluster 2:  employment  education  european  2016  labour  2017 

Cluster 3:  skills  labour  market  skill  cedefop  jobs 



In [6]:
def cluster_size(labels_array,n=10):
    """
    (ndarray, int)->()

    Print the number of items in each of the clusters 0..n-1.

    labels_array holds one integer cluster label per document. Labels outside
    0..n-1 are counted but not printed.
    """
    # Start every cluster at zero so that empty clusters are still reported
    cluster_dict = {cl: 0 for cl in range(n)}

    # Count the labels that were passed in, not a module-level variable.
    for label in labels_array:
        label = int(label)  # Plain int key, whatever integer type the label has
        cluster_dict[label] = cluster_dict.get(label, 0) + 1

    for i in range(n):
        print('Cluster %d: %d documents' % (i+1, cluster_dict[i]))
        
        
# Report how many documents were assigned to each of the CLUSTER_NUMBER clusters.
cluster_size(labels, n=CLUSTER_NUMBER)

Cluster 1: 84 documents
Cluster 2: 1188 documents
Cluster 3: 385 documents
