# Data clening

In [2]:
import json
with open("delfi.json", "r") as read_file:
    data = json.load(read_file)

In [3]:
cleaned_data = []
for d in data:
    if d["categorys"] in ("projektai", "m360") or len(d["text"]) < 1000:# d["text"] == "": #len(d["text"]) < 1000:
        continue
    elif d["categorys"] == 'sportas':
        d["categorys"] = 'Sportas'
    elif d["categorys"][0].startswith('DELFI '):
        d["categorys"] = d["categorys"][0][6:]
    elif isinstance(d["categorys"], list):
        d["categorys"] = d["categorys"][0]
    cleaned_data.append(d)
cleaned_data = [d for d in cleaned_data if d["categorys"] in ("Verslas", "Mokslas", "Veidai", "Auto", "Sportas")]
print("From {} to {}.".format(len(data), len(cleaned_data)))

From 5233 to 4058.


# Preprocessing

In [4]:
%%time
import re
import subprocess

num_tok, stop_tokens, num_stems = [], [], []

text_file = open("Lithuanian stop words", "r")
stopwords = text_file.read().split("\n")

for d in cleaned_data:#log_progress(cleaned_data):
    
    # if intro in bigger then text
    if len(d['text']) < len(d['intro']):
        print(d['text'] + '\n' + len(d['intro']))
    
    # tokenize & lowercase
    tokens = re.sub("[\W\d_]+", " ", d["text"]).lower().split() # ka daryti su 1992-ųjų, romėniškais skaičiais, 2 mln. eur\
    num_tok += [len(tokens)]
    
    # remove stop words
    new_tokens = [words for words in tokens if words not in stopwords]
    stop_tokens += [len(tokens) - len(new_tokens)]
    
    # steam
    with open("tokens.txt", "w") as token_file:
        token_file.write("\n".join(new_tokens))
    args = ("./stemwords", "-l", "lt", "-i", "tokens.txt", "-o", "stems.txt")
    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
    popen.wait()
    with open("stems.txt", "r") as stem_file:
        stems = stem_file.read().split("\n")
    
    # put into dic
    d["tokens"] = new_tokens
    d["stems"] = stems
    num_stems += [len(stems)]


#     print("In total tokens: {}, stop words removed: {}, stems: {}".format(num_tok, stop_tokens, num_stems))
# text_file.close()
#print(len([s for s in d['stems'] for d in data]))
# save as file
with open("delfi_pre.json", "w") as write_file:
    json.dump(cleaned_data, write_file)

CPU times: user 6.92 s, sys: 16.2 s, total: 23.1 s
Wall time: 42.7 s


In [19]:
for num in [num_tok, stop_tokens, num_stems]:
    print("{} {} {}".format(np.mean(num), np.min(num), np.max(num)))

415.8876293740759 97 3335
85.43592902907837 2 949
331.45170034499756 88 2387


# The Analysis

In [14]:
import json
import numpy as np
with open("delfi_pre.json", "r") as read_file:
    data = json.load(read_file)

stems = [d["stems"] for d in data]
tokens = [d["tokens"] for d in data]
category_names = ['Auto', 'Veidai', 'Sportas', 'Mokslas', 'Verslas']
categorys = np.array([category_names.index(d["categorys"]) for d in data])

In [15]:
import itertools

allstems = list(itertools.chain.from_iterable(stems))
alltokens = list(itertools.chain.from_iterable(tokens))
# print(stems[1])
# print(tokens[1])
print (len(set(allstems)))
print (len(set(alltokens)))

47707
141370


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=(47581 // 2)) # half of total number of features
%time X = vectorizer.fit_transform(stems)
print(X.shape)

CPU times: user 1.95 s, sys: 56 ms, total: 2.01 s
Wall time: 2.07 s
(4058, 23790)


In [4]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

K = 5
jobs = -1

KMtitle = "K-means"
KMmodel = KMeans(n_clusters=K,
#                  max_iter=1,
#                  n_init=1,
                 n_jobs=jobs,
                 random_state=42,)

EMtitle = "Expectation–maximization"
EMmodel = GaussianMixture(n_components=K,
                        covariance_type='diag',
#                         n_init=10,
                        random_state=42,)

ACtitle = "Complete-linkage clustering"
ACmodel = AgglomerativeClustering(n_clusters=K,
                                  linkage='complete',)
AAtitle = "Average-linkage clustering"
AAmodel = AgglomerativeClustering(n_clusters=K,
                                  linkage='average',)
AWtitle = "Ward-linkage clustering"
AWmodel = AgglomerativeClustering(n_clusters=K,
                                  linkage='ward',)
DBSCANtitle = "DBSCAN"
DBSCANmodel = DBSCAN(n_jobs = jobs,)

models = [
          {"model": KMmodel, "title": KMtitle},
          {"model": EMmodel, "title": EMtitle},
          {"model": ACmodel, "title": ACtitle},
          {"model": AAmodel, "title": AAtitle},
          {"model": AWmodel, "title": AWtitle},
          {"model": DBSCANmodel, "title": DBSCANtitle},
         ]

## Set up evaluation functions

In [5]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import *
from scipy.stats import mode

def get_new_labels(clusters):
    new_labels = np.zeros_like(clusters)
    print("New labels:")
    for i in range(K):
        mask = (clusters == i)
        closest_category = mode(categorys[mask])[0][0]
        new_labels[mask] = closest_category
        print("{} -> {}({})".format(i, closest_category, category_names[closest_category]))
    print(np.bincount(new_labels))
    return new_labels

def print_top_terms(model):
    print("Top terms per cluster:")
    centers = model.cluster_centers_ if isinstance(model, KMeans) else model.means_
    order_centroids = centers.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(K):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

def print_metrics(y_pred):
    print("Clustering print_metrics:")
    print(" Rand   Mutual information   Homogeneity   Coompleteness   V-measure    Fowlkes mallows")
    print("{0:.3f}                {1:.3f}         {2:.3f}           {3:.3f}       {4:.3f}      {5:.3f}"
      .format(adjusted_rand_score(categorys, y_pred),
              adjusted_mutual_info_score(categorys, y_pred),
              homogeneity_score(categorys, y_pred),
              completeness_score(categorys, y_pred),
              v_measure_score(categorys, y_pred),
              fowlkes_mallows_score(categorys, y_pred),
              ))

def plot_confusion_matrix(y_pred, title='clusters'):
    cm = confusion_matrix(categorys, y_pred)
    plt.figure(figsize=(5,5))
    plt.imshow(cm, interpolation='nearest', cmap = plt.cm.Blues)
    plt.title("Confusion matrix of " + title)
    tick_marks = np.arange(len(category_names))
    plt.yticks(tick_marks, category_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    # put numbers inside cells
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.show()
    
# KMmodel = KMeans(n_clusters=K,
#                  max_iter=10,
#                  n_init=1,
#                  n_jobs=-1,
#                  random_state=42,)
# type(KMmodel)

## Run models and metrics

In [8]:
def metrics_and_martix(clusters):
    print_metrics(clusters)
    plot_confusion_matrix(clusters, title=m['title'])
    new_labels = get_new_labels(clusters)
    print_metrics(new_labels)
    plot_confusion_matrix(new_labels, title=m['title'])

for m in models:
    model = m['model']
    print('\n' + m['title'] + " results")
    
    if m['title'] == KMtitle:
        %time clusters = model.fit_predict(X)
        print(np.unique(clusters, return_counts=True)[1])
        
        print_top_terms(model)
        metrics_and_martix(clusters)
        
    elif m['title'] == EMtitle:
        %time model.fit(X.toarray())
        clusters = model.predict(X.toarray())
        print(np.unique(clusters, return_counts=True))
        
        print_top_terms(model)
        metrics_and_martix(clusters)
        
    elif m['title'] in [ACtitle, AAtitle, AWtitle]:
        %time clusters = model.fit_predict(X.toarray())
        print(np.unique(clusters, return_counts=True))
        
        metrics_and_martix(clusters)
        
    elif m['title'] == DBSCANtitle:
        for e in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]:
            for m in [3, 4, 5, 6, 7, 8]:
                model.set_params(eps = e, min_samples = m,)
                clusters = model.fit_predict(X)
                
                results = np.unique(clusters, return_counts=True)
                if results[0][0] == -1: #if there was noise 
                    n_noise    = results[1][0]
                    n_clusters = np.sort(results[1][1:])[::-1]
                else:
                    n_noise    = 0      #if there was no noise 
                    n_clusters = np.sort(results[1])[::-1]
                print ("ε=%.1f min=%i: noise=%4i clusters=%3i top10=%s" 
                       %(e, m, n_noise, len(n_clusters), n_clusters[:10]))
    else:
        print(m)


DBSCAN results
ε=1.1 min=3: noise=2287 clusters=177 top10=[144 138 121  89  88  66  63  49  49  41]
New labels:
0 -> 0(Auto)
1 -> 2(Sportas)
2 -> 4(Verslas)
3 -> 3(Mokslas)
4 -> 4(Verslas)
[3808    0  138   14   98]
Clustering print_metrics:
 Rand   Mutual information   Homogeneity   Coompleteness   V-measure    Fowlkes mallows
0.008                0.051         0.052           0.296       0.089      0.427


TypeError: 'int' object is not subscriptable