In [4]:
import sys
# Put ../src on the module search path. This has to run before the local
# imports below, which are resolved through that path entry.
sys.path.append("../src")
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Project-local modules (found via the ../src entry appended above)
from text_cleaning import clean_text
from clustering import KMeansAuthors, ModHausdorffDocument
from style_point_cloud import style_point_cloud

# Progress bar: tqdm.pandas() registers the `progress_apply` method that the
# data-preparation cell calls on pandas Series.
from tqdm import tqdm 
tqdm.pandas()

In [None]:
df = pd.read_json("../data/csCL_sample.json")

# Run the project's text cleaning over every document body (with a progress bar)
df["text_body"] = df["text_body"].progress_apply(clean_text)


def _text_to_point_cloud(text):
    """Build the style point cloud for a single document.

    The texts in this sample are shorter, so fewer tokens are taken per
    document (see `max_tokens`).
    """
    return style_point_cloud(text, window_size=600, window_overlap=300, max_tokens=15_000)


df["point_cloud"] = df["text_body"].progress_apply(_text_to_point_cloud)

# Keep only the documents whose point cloud contains at least one point
has_points = df["point_cloud"].apply(len) > 0
df = df.loc[has_points].copy()

In [32]:
# This subsample was built by selecting papers where Diptesh Kanojia,
# Hannes Westermann, or Bing Liu is among the first 2 authors, so as a first
# step treat that author as the sole author of the document.
author_list = ["Diptesh Kanojia", "Hannes Westermann", "Bing Liu"]


def _first_known_author(authors, known_authors):
    """Return the first name in `authors` that also appears in `known_authors`.

    The scan follows the paper's own author order, so the result is the same
    on every run even when several known authors signed the same paper.
    (Popping from a set intersection picks an arbitrary element, and set order
    over strings can change between interpreter sessions.)

    Raises:
        KeyError: if none of `authors` is in `known_authors`.
    """
    for name in authors:
        if name in known_authors:
            return name
    raise KeyError(f"none of {list(authors)!r} is in the known author list")


df["author_single"] = df.authors.apply(
    lambda authors: _first_known_author(authors, author_list)
)

# One author label per point: repeat each document's author once per entry of
# its point cloud, already flattened, in document order.
auth_labels = [
    author
    for author, n_chunks in zip(df.author_single, df.point_cloud.apply(len))
    for _ in range(n_chunks)
]

In [80]:
# Stack the point clouds of all documents into a single matrix (document order)
stacked_points = np.vstack(df.point_cloud)

# Standardise every feature, then keep the first 5 principal components
scaler = StandardScaler()
data = scaler.fit_transform(stacked_points)
pca = PCA(n_components=5)
X = pca.fit_transform(data)

# Point-level clusterer, one cluster per candidate author
cl = KMeansAuthors(n_authors=3)

In [81]:
# Fit the clusterer on the PCA-reduced points together with the per-point
# author labels, then display its `best_score` attribute.
# NOTE(review): KMeansAuthors comes from the local `clustering` module, which is
# not shown here — confirm there how `fit` uses the labels and what exactly
# `best_score` measures.
cl.fit(X, auth_labels)
cl.best_score

0.4467005076142132

In [82]:
# Document accuracy: fraction of documents whose predicted author equals the
# author assigned in `author_single`.
points_per_doc = df.point_cloud.apply(len).to_numpy()
doc_predictions = cl.predict_document(X, points_per_doc)
(df.author_single == doc_predictions).mean()

0.5714285714285714

In [None]:
predictions = cl.predict(X, author_labels=True)

# Predicted label -> integer index. Sorted so the mapping is the same on every
# run (iteration order of a set of strings is not stable across sessions).
# Not used in this cell; kept in case a later cell relies on it.
auth_idx = {label: idx for idx, label in enumerate(sorted(set(predictions), key=str))}

# Arrays allow vectorised comparisons instead of per-element Python loops.
pred_labels = np.asarray(predictions)
true_labels = np.asarray(auth_labels)

# Plot every label that occurs in EITHER the predictions or the ground truth:
# iterating over predicted labels only would drop an author that the model
# never predicts from the "True labels" panel. Sorting fixes the draw order,
# and both panels get one scatter call per label in that same order.
plot_labels = sorted(set(predictions) | set(auth_labels), key=str)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
for label in plot_labels:
    mask_pred = pred_labels == label
    mask_true = true_labels == label
    # Columns 0 and 1 of X are the first two PCA components.
    ax1.scatter(X[mask_pred, 0], X[mask_pred, 1], label=label)
    ax2.scatter(X[mask_true, 0], X[mask_true, 1], label=label)

# Titles, axis labels and legends only need to be set once, after all the
# scatter calls have registered their legend entries.
ax1.set_title("KMeans Predicted")
ax2.set_title("True labels")
for ax in (ax1, ax2):
    ax.set_xlabel("PC 1")
    ax.set_ylabel("PC 2")
    ax.legend()
plt.show()

In [84]:
# Document-level clustering: pass the number of points in each document so the
# flat point matrix X can be related back to documents, plus one author label
# per document.
# NOTE(review): ModHausdorffDocument is defined in the local `clustering`
# module (not shown) — confirm there what `best_score` measures.
points_in_each_doc = df.point_cloud.apply(len).to_numpy()
mhcl = ModHausdorffDocument(n_authors=3)
mhcl.fit(X, doc_lengths=points_in_each_doc, author_labels=df.author_single)
mhcl.best_score

0.7142857142857143