In [4]:
import argparse
import joblib
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np
from sklearn.model_selection import GridSearchCV
import os
import pandas as pd
from pandas.testing import assert_frame_equal
import re
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import (
    KMeans,
    AgglomerativeClustering,
    DBSCAN,
)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    completeness_score,
    silhouette_score,
)
from sklearn.preprocessing import MinMaxScaler

In [11]:
# TF-IDF vectoriser settings (passed to TfidfVectorizer in load_or_create_tfidf)
MAX_FEATURES = 500  # upper bound on the vocabulary size
MIN_DF = 50         # integer: drop terms found in fewer than 50 documents
MAX_DF = 500        # integer: drop terms found in more than 500 documents

# Directory prefix for persisted models. The trailing separator is kept on
# purpose because file names are built as MODEL_PATH + "<name>.joblib".
# os.path.join(..., "") yields "models\\" on Windows (the previous value) and
# "models/" elsewhere, instead of hard-coding the Windows separator.
MODEL_PATH = os.path.join("models", "")

def concatenate_data(path="20_newsgroups"):
    """Read every article below ``path`` into one DataFrame.

    ``path`` must contain one sub-directory per news group; each file inside a
    sub-directory is one article and its file name is used as the article
    number.

    Parameters
    ----------
    path : str, default "20_newsgroups"
        Root directory of the corpus.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``news_group`` (position of the
        group directory in sorted order), ``article_number`` (file name passed
        through ``pd.to_numeric``), ``cluster`` (placeholder ``-1``) and
        ``text`` (file content).
    """
    columns = ["news_group", "article_number", "cluster", "text"]
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # group -> index mapping identical across runs and machines. Plain files
    # in the root (e.g. a README) are not news groups and are skipped.
    news_groups = sorted(
        entry
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    data = []
    for news_group_index, news_group in enumerate(news_groups):
        group_dir = os.path.join(path, news_group)
        for article in sorted(os.listdir(group_dir)):
            # latin-1 maps every byte to a character, so articles containing
            # non-UTF-8 bytes no longer raise UnicodeDecodeError and decoding
            # does not depend on the platform default encoding.
            with open(os.path.join(group_dir, article), "r", encoding="latin-1") as f:
                text = f.read()
            data.append([news_group_index, article, -1, text])
    df = pd.DataFrame(data, columns=columns)
    for column in ("news_group", "article_number", "cluster"):
        df[column] = pd.to_numeric(df[column])
    return df

def preprocess(text_data):
    """Clean a Series of raw article strings.

    For every document: drop everything up to and including the first blank
    line (the header block), lowercase the rest, split it into ``\\w+`` tokens,
    discard NLTK's English stop words and join the surviving tokens with single
    spaces.

    Parameters
    ----------
    text_data : pandas.Series
        Raw article texts.

    Returns
    -------
    pandas.Series
        The cleaned texts, one string per input row.
    """
    nltk.download("stopwords", quiet=True)
    word_tokenizer = RegexpTokenizer(r"\w+")
    english_stop_words = set(stopwords.words("english"))

    def _clean_document(raw_text):
        # Non-greedy match with DOTALL: removes only the leading header block.
        body = re.sub(r'^.*?\n\n', '', raw_text, flags=re.DOTALL)
        tokens = word_tokenizer.tokenize(body.lower())
        kept = [token for token in tokens if token not in english_stop_words]
        return " ".join(kept)

    return text_data.apply(_clean_document)

def load_or_create_preprocessed():
    """Return the preprocessed articles, building the on-disk cache if needed.

    If ``data/articles_preprocessed.pkl`` exists it is loaded and returned.
    Otherwise the corpus is read with ``concatenate_data``, the ``text`` column
    is cleaned with ``preprocess``, rows whose text is empty or whitespace-only
    are dropped, and the result is written to ``data/`` as both CSV and pickle.

    Returns
    -------
    pandas.DataFrame
        The preprocessed articles.
    """
    print("Loading or creating preprocessed data...")
    pickle_path = "data/articles_preprocessed.pkl"
    if os.path.exists(pickle_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced by this notebook.
        return pd.read_pickle(pickle_path)

    df_articles = concatenate_data()
    df_articles["text"] = preprocess(df_articles["text"])
    # drop all rows where "text" is only whitespace or empty
    df_articles = df_articles[df_articles["text"].str.strip().astype(bool)]
    # The cache directory does not exist on a fresh checkout; create it so the
    # writes below cannot fail after the preprocessing work is already done.
    os.makedirs("data", exist_ok=True)
    df_articles.to_csv("data/articles_preprocessed.csv", index=False)
    df_articles.to_pickle(pickle_path)
    return df_articles

def load_or_create_tfidf(df_articles):
    """Return the TF-IDF matrix of the articles, building the cache if needed.

    If ``data/articles_tfidf.pkl`` exists it is loaded and returned. Otherwise
    a ``TfidfVectorizer`` configured with ``MAX_FEATURES``, ``MIN_DF`` and
    ``MAX_DF`` is fitted on ``df_articles["text"]`` and the dense result is
    written to ``data/`` as both CSV and pickle.

    Parameters
    ----------
    df_articles : pandas.DataFrame
        Must provide a ``text`` column; its index is reused for the result.
        Not read when the cache file already exists.

    Returns
    -------
    pandas.DataFrame
        One row per article, one column per vocabulary term.
    """
    print("Loading or creating TF-IDF data...")
    pickle_path = "data/articles_tfidf.pkl"
    if os.path.exists(pickle_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced by this notebook.
        return pd.read_pickle(pickle_path)

    # A single vectoriser: the earlier stop_words="english" instance was
    # overwritten before use and has been removed.
    tfidf = TfidfVectorizer(
        max_features=MAX_FEATURES,
        strip_accents="unicode",
        min_df=MIN_DF,
        max_df=MAX_DF,
    )
    tfidf_article_array = tfidf.fit_transform(df_articles["text"])
    df_articles_tfidf = pd.DataFrame(
        tfidf_article_array.toarray(),
        index=df_articles.index,
        columns=tfidf.get_feature_names_out(),
    )
    # Create the cache directory so the writes below work on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    df_articles_tfidf.to_csv("data/articles_tfidf.csv", index=False)
    df_articles_tfidf.to_pickle(pickle_path)
    return df_articles_tfidf

def load_or_create_pca(df_articles_tfidf):
    """Return the PCA-reduced features, building the on-disk cache if needed.

    If ``data/articles_pca.pkl`` exists it is loaded and returned. Otherwise
    the input is min-max scaled, reduced with PCA and written to that pickle.

    Parameters
    ----------
    df_articles_tfidf : pandas.DataFrame
        Feature matrix, one row per article. Not read when the cache file
        already exists.

    Returns
    -------
    pandas.DataFrame
        One row per article, one column per retained principal component. The
        row index of the input is preserved so the result stays aligned with
        the TF-IDF frame.
    """
    print("Loading or creating PCA data...")
    pickle_path = "data/articles_pca.pkl"
    if os.path.exists(pickle_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced by this notebook.
        return pd.read_pickle(pickle_path)

    # using PCA to reduce the dimensionality
    scaler = MinMaxScaler()
    data_rescaled = scaler.fit_transform(df_articles_tfidf)
    # A float n_components keeps the smallest number of components whose
    # cumulative explained variance reaches 90 %.
    pca = PCA(n_components=0.90)
    pca.fit(data_rescaled)
    reduced = pca.transform(data_rescaled)
    # Reuse the input's index (when it has one) instead of a fresh RangeIndex.
    df_articles_pca = pd.DataFrame(
        data=reduced,
        index=getattr(df_articles_tfidf, "index", None),
    )
    # Create the cache directory so the write below works on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    df_articles_pca.to_pickle(pickle_path)
    # Report the variance explained by the retained principal components
    print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))
    print(' Number of components:', pca.n_components_)
    return df_articles_pca

def print_metrics(df_articles, df_articles_labeled):
    """Print how well the predicted clusters agree with the true news groups.

    Compares ``df_articles['news_group']`` with
    ``df_articles_labeled['cluster']`` using the adjusted mutual information,
    the adjusted Rand score and the completeness score, one per line, followed
    by an empty line.
    """
    truth = df_articles['news_group']
    predicted = df_articles_labeled['cluster']

    ami = adjusted_mutual_info_score(truth, predicted)
    print(f"Adjusted Mutual Information: {ami}")

    ari = adjusted_rand_score(truth, predicted)
    print(f"Adjusted Rand Score: {ari}")

    completeness = completeness_score(truth, predicted)
    print(f"Completeness Score: {completeness}")

    print()

def findOptimalEps(n_neighbors, data):
    '''
    Plot the sorted k-distance curve used to pick ``eps`` for DBSCAN; based on this article: https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc

    For every sample the distance to the farthest of its ``n_neighbors``
    nearest neighbours is computed (the query set is the training set, so the
    sample itself is normally the first neighbour at distance 0). The distances
    are sorted ascending and plotted; the "elbow" of the curve is a candidate
    ``eps``.

    n_neighbors: number of neighbours to query per sample.
    data: feature matrix accepted by ``NearestNeighbors.fit``.
    '''
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    nbrs = neigh.fit(data)
    distances, _ = nbrs.kneighbors(data)
    # Use the last column (farthest requested neighbour) so that n_neighbors
    # actually drives the curve. The old hard-coded column 1 ignored the
    # parameter for n_neighbors > 2 and raised IndexError for n_neighbors == 1;
    # for n_neighbors == 2 the result is unchanged.
    k_distances = np.sort(distances[:, -1])
    plt.plot(k_distances)
    plt.xlabel("Samples sorted by distance")
    plt.ylabel(f"Distance to neighbour #{n_neighbors}")
    plt.show()

def cluster_using_kmeans(df_articles, df_for_prediction, ncluster, random_state=None):
    """Cluster the articles with KMeans and persist the fitted model.

    Parameters
    ----------
    df_articles : pandas.DataFrame
        Article metadata; it is copied, never modified.
    df_for_prediction : array-like
        Feature matrix with one row per row of ``df_articles``.
    ncluster : int
        Number of clusters.
    random_state : int or None, default None
        Forwarded to ``KMeans``; pass an integer for reproducible clusters.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_articles`` whose ``cluster`` column holds the predicted
        labels.
    """
    print(f"Clustering using KMeans with {ncluster} clusters")
    kmeans = KMeans(n_clusters=ncluster, n_init='auto', random_state=random_state)
    articles_predictions = kmeans.fit_predict(df_for_prediction)
    df_articles_predicted = df_articles.copy()
    df_articles_predicted["cluster"] = articles_predictions

    # Persist the fitted model; create the target directory first so a missing
    # folder cannot throw away the clustering result at the very last step.
    model_file = MODEL_PATH + 'kmeans_n' + str(ncluster) + '.joblib'
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(kmeans, model_file)
    return df_articles_predicted

def cluster_using_whc(df_articles, df_for_prediction, ncluster):
    """Cluster the articles with Ward hierarchical clustering and persist the model.

    Parameters
    ----------
    df_articles : pandas.DataFrame
        Article metadata; it is copied, never modified.
    df_for_prediction : array-like
        Feature matrix with one row per row of ``df_articles``.
    ncluster : int
        Number of clusters.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_articles`` whose ``cluster`` column holds the predicted
        labels.
    """
    print(f"Clustering using Ward Hierarchical Clustering with {ncluster} clusters")
    whc = AgglomerativeClustering(n_clusters=ncluster, linkage="ward")
    articles_predictions = whc.fit_predict(df_for_prediction)
    df_articles_predicted = df_articles.copy()
    df_articles_predicted["cluster"] = articles_predictions

    # Persist the fitted model; create the target directory first so a missing
    # folder cannot throw away the clustering result at the very last step.
    model_file = MODEL_PATH + 'whc_n' + str(ncluster) + '.joblib'
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(whc, model_file)
    return df_articles_predicted

def cluster_using_ac(df_articles, df_for_prediction, ncluster):
    """Cluster the articles with average-linkage agglomerative clustering and persist the model.

    Parameters
    ----------
    df_articles : pandas.DataFrame
        Article metadata; it is copied, never modified.
    df_for_prediction : array-like
        Feature matrix with one row per row of ``df_articles``.
    ncluster : int
        Number of clusters.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_articles`` whose ``cluster`` column holds the predicted
        labels.
    """
    print(f"Clustering using Agglomerative Clustering with {ncluster} clusters")
    ac = AgglomerativeClustering(n_clusters=ncluster, linkage="average")
    articles_predictions = ac.fit_predict(df_for_prediction)
    df_articles_predicted = df_articles.copy()
    df_articles_predicted["cluster"] = articles_predictions

    # Persist the fitted model; create the target directory first so a missing
    # folder cannot throw away the clustering result at the very last step.
    model_file = MODEL_PATH + 'ac_n' + str(ncluster) + '.joblib'
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(ac, model_file)
    return df_articles_predicted

def my_silhouette_score(estimator, X, y=None):
    """Scorer callable for clustering estimators: silhouette of ``fit_predict(X)``.

    The estimator is (re)fitted on ``X`` and the silhouette coefficient of the
    resulting labels is returned. Note that every distinct label counts as a
    cluster, including DBSCAN's noise label ``-1``.

    Parameters
    ----------
    estimator : object
        Clustering estimator exposing ``fit_predict``.
    X : array-like
        Samples to cluster and score.
    y : ignored
        Accepted so the function also works when a caller passes targets as a
        third positional argument.

    Returns
    -------
    float
        The silhouette score, or ``-1.0`` (the worst possible value) when the
        labelling is degenerate and the silhouette is undefined.
    """
    labels = estimator.fit_predict(X)
    n_labels = np.unique(labels).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
    # ValueError otherwise. DBSCAN easily produces "everything is noise" or a
    # single cluster, so rank such parameter sets last instead of raising.
    if n_labels < 2 or n_labels >= len(labels):
        return -1.0
    score = silhouette_score(X, labels)
    return score

def cluster_using_dbscan(df_articles, df_for_prediction, ncluster):
    """Cluster the articles with DBSCAN, tuning its parameters by grid search.

    ``eps``, ``min_samples`` and ``metric`` are searched with ``GridSearchCV``
    using ``my_silhouette_score`` as the scorer. The best parameter set is
    refitted on the full data by the grid search (its default ``refit``
    behaviour) and that estimator's labels are used.

    Parameters
    ----------
    df_articles : pandas.DataFrame
        Article metadata; it is copied, never modified.
    df_for_prediction : array-like
        Feature matrix with one row per row of ``df_articles``.
    ncluster : int
        Only used in the log message and the model file name; DBSCAN
        determines the number of clusters itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_articles`` whose ``cluster`` column holds the DBSCAN
        labels (``-1`` marks noise).
    """
    print(f"Clustering using DBSCAN with {ncluster} clusters")
    dbscan = DBSCAN()
    # `algorithm` and `leaf_size` were removed from the grid: they only select
    # how neighbours are searched, not which points are neighbours, so they
    # multiplied the number of fits by 12 without changing any clustering.
    param_grid = {
        'eps': [0.5, 1.0, 1.5],
        'min_samples': [5, 10, 15],
        'metric': ['euclidean', 'manhattan'],
    }
    grid = GridSearchCV(dbscan, param_grid, cv=5, verbose=0, n_jobs=-1, scoring=my_silhouette_score)
    grid.fit(df_for_prediction)
    best_estimator = grid.best_estimator_
    print("Best parameters:", grid.best_params_)
    print("Best score:", grid.best_score_)
    # DBSCAN has no predict(); the labels of the data the best estimator was
    # refitted on are exposed through labels_.
    articles_predictions = best_estimator.labels_
    df_articles_predicted = df_articles.copy()
    df_articles_predicted["cluster"] = articles_predictions

    # Persist the tuned, fitted estimator (not the unfitted template), and
    # create the target directory first so the result is not lost.
    model_file = MODEL_PATH + 'dbscan_n' + str(ncluster) + '.joblib'
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(best_estimator, model_file)
    return df_articles_predicted

In [12]:
# Load the cleaned articles and their TF-IDF features (each helper reuses its
# on-disk cache under data/ when one exists).
df_articles = load_or_create_preprocessed()
df_articles_tfidf = load_or_create_tfidf(df_articles)
# PCA reduction is currently disabled; clustering runs on the TF-IDF features.
# df_articles_pca = load_or_create_pca(df_articles_tfidf)
# cluster_using_dbscan uses ncluster only for its log message and model file name.
ncluster=10
df_for_prediction = df_articles_tfidf
# NOTE: despite its name, `dbscan` is the DataFrame returned by
# cluster_using_dbscan (a copy of the articles with a "cluster" column),
# not the fitted estimator.
dbscan = cluster_using_dbscan(df_articles, df_for_prediction, ncluster)
# Compare the predicted clusters against the true news groups.
print_metrics(df_articles, dbscan)

Loading or creating preprocessed data...
Loading or creating TF-IDF data...
Clustering using DBSCAN with 10 clusters
