In [4]:
# import pickle

# import pandas as pd
# import matplotlib.pyplot as plt

# from pathlib import Path

import pickle

import gensim
import hdbscan
import numpy as np
import pandas as pd
import seaborn as sns

from time import time
from pathlib import Path

from matplotlib import pyplot as plt
from scipy.stats import entropy
from sklearn import metrics
from sklearn.cluster import DBSCAN, KMeans
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors


from collections import defaultdict

In [5]:
from scipy.stats import entropy

# Clustering by parts

## With NLTK

In [6]:
%%time
# Location of the pickled frame (columns `id` and `fq_st_ps_tkn` per the preview below).
# NOTE(review): absolute, machine-specific mount point; only unpickle files from a trusted source.
file = Path('/Volumes') / 'tfm' / 'processed' / 'email_nltk_fq_st_ps_tkn.pkl'
df = pd.read_pickle(file)

CPU times: user 12.5 s, sys: 5.03 s, total: 17.6 s
Wall time: 34 s


In [7]:
# Row labels of three emails with very large token counts
# (e.g. '<' occurs 54316 times in the first one); they are dropped below.
extreme_vals = [
    132160,  # forney-j/sent_items/158.
    480942,  # presto-k/sent_items/1103.
    337694,  # quigley-d/sent_items/37.
]

In [8]:
# Inspect the outlier rows before removing them.
df.loc[extreme_vals]

Unnamed: 0,id,fq_st_ps_tkn
132160,forney-j/sent_items/158.,"{'<': 54316, 'omni': 6274, '>': 53267, 'omnino..."
480942,presto-k/sent_items/1103.,"{'<': 44545, 'omni': 5120, '>': 44545, 'omnino..."
337694,quigley-d/sent_items/37.,"{'<': 10760, 'omni': 1401, '>': 10519, 'omnino..."


In [9]:
# Remove the outlier rows. Rebinding `df` (instead of `inplace=True`) together
# with `errors='ignore'` keeps this cell idempotent: running it again after the
# rows are already gone no longer raises a KeyError.
df = df.drop(index=extreme_vals, errors='ignore')

In [10]:
# One token -> count dict per remaining row, in row order.
D = df['fq_st_ps_tkn'].tolist()

In [11]:
# Reproducible split (fixed random_state). `test_size=0.8` means only 20% of
# the documents end up in `train`, which is what gets vectorized below.
# NOTE(review): confirm the small training share is intentional (e.g. to limit
# memory use) and not a swapped train/test proportion.
train, test = train_test_split(D, test_size=0.8, random_state=2022)

### DictVectorizer

In [12]:
# Turn the token -> count dicts in `train` into a document-term matrix.
print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
# Fit exactly once: the previous version called fit_transform twice and threw
# the first result away, doubling the vectorization cost.
# NOTE(review): `.toarray()` densifies the matrix; with the ~250k columns
# reported in this cell's output it may not fit in memory -- confirm that the
# downstream steps really need a dense array.
X = vectorizer.fit_transform(train).toarray()
duration = time() - t0
# Report the elapsed time that `t0` was started for.
print("done in %fs" % duration)
print("Found %d unique terms" % len(vectorizer.get_feature_names_out()))

DictVectorizer
Found 249928 unique terms


In [None]:
# Grid search over a text-classification pipeline (bag of words -> tf-idf -> SGD).
#
# The names below were used without ever being imported, so the cell raised
# NameError on a fresh kernel; they are imported here so the cell is
# self-contained.
# NOTE(review): `data` is not defined anywhere in the visible part of this
# notebook. The cell expects an object exposing `.data` (raw documents) and
# `.target` (labels); define it before running, or remove this cell if the
# supervised experiment is not part of the clustering analysis.
from pprint import pprint

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier()),
    ]
)

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    "clf__max_iter": (20,),
    "clf__alpha": (0.00001, 0.000001),
    "clf__penalty": ("l2", "elasticnet"),
    # 'clf__max_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the winning configuration, restricted to the searched parameters.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
### DBSCAN

In [None]:
%%time
# Cluster the current feature matrix `X` with fixed DBSCAN hyper-parameters.
dbscan_params = {"eps": 0.3, "min_samples": 5}
m = DBSCAN(**dbscan_params)
m.fit(X)

In [None]:
### HDBSCAN
def cluster_results(data):
    """Fit HDBSCAN once per available distance metric and score each fit.

    Iterates over every metric name in ``hdbscan.dist_metrics.METRIC_MAPPING``,
    fits a fresh ``hdbscan.HDBSCAN`` with that metric and stores the entropy
    of the fitted ``probabilities_``.

    Parameters
    ----------
    data
        Feature matrix handed to ``fit`` (or to ``pairwise_distances`` for the
        ``'precomputed'`` metric).

    Returns
    -------
    dict
        Maps metric name to entropy. A metric whose construction or fit raises
        is reported on stdout and left out of the mapping.
    """
    scores = {}
    for metric in hdbscan.dist_metrics.METRIC_MAPPING:
        try:
            # Only Minkowski takes an extra parameter here.
            # NOTE(review): p=0.05 is below 1 -- confirm hdbscan accepts it;
            # otherwise this metric always lands in the except branch.
            extra_kwargs = {'p': 0.05} if metric == 'minkowski' else {}
            # 'precomputed' expects a distance matrix rather than raw features.
            fit_input = pairwise_distances(data) if metric == 'precomputed' else data

            model = hdbscan.HDBSCAN(metric=metric, **extra_kwargs)
            model.fit(fit_input)
            scores[metric] = entropy(model.probabilities_)
        except Exception as e:
            # Best effort: some metrics need arguments that are not supplied.
            print(f'{metric} metric failed: {str(e)}')

    return scores

In [None]:
# Entropy per distance metric, highest first.
dict(sorted(cluster_results(X).items(), key=lambda pair: -pair[1]))

In [None]:
# Width of the hashed feature space passed to the FeatureHasher below.
n_features = 2 ** 18


def n_nonzero_columns(X):
    """Count the distinct columns of ``X`` that contain at least one non-zero entry.

    ``X`` is expected to be a CSR matrix; only its ``nonzero()`` method is used.
    """
    column_indices = X.nonzero()[1]
    distinct_columns = np.unique(column_indices)
    return distinct_columns.size


# Hash the same training dicts into `n_features` columns and time the transform.
print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
# NOTE(review): this rebinds `X`, replacing the DictVectorizer output assigned
# earlier; any cell that reads `X` now depends on which of the two ran last.
X = hasher.transform(train)
duration = time() - t0
print("done in %fs" % (duration))
# "Unique terms" here is the number of hash columns with at least one non-zero entry.
print("Found %d unique terms" % n_nonzero_columns(X))
print()

## With spaCy