In [1]:
import pandas as pd
import json
import gzip

In [2]:
# NOTE: despite its .json extension, the file is read through gzip.
# Open the stream in text mode so decoding happens while reading, then
# let json parse straight from the file handle.
with gzip.open("Event_Context.json", "rt", encoding="utf-8") as fin:
    data = json.load(fin)

In [3]:
# Load the events table and preview its first two rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index -- confirm whether it should be read with index_col=0 instead.
df = pd.read_csv("Final_Events.csv")
df.head(2)

Unnamed: 0,Title,Desc,Start Date,When,Main Page,Address1,Address2,thumb,image
0,PGL Wallachia S3,Full information about PGL Wallachia S3 Dota 2...,Mar 8,"Sat, Mar 8",https://ggscore.com/en/dota-2/pgl-wallachia-se...,"PGL ESPORTS, Bulevardul Dimitrie Pompeiu 9-9A","Bucharest, Romania",https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...
1,BLAST Slam #3,Full information about BLAST Slam #3 Dota 2. M...,May 5,"Mon, May 5",https://ggscore.com/en/dota-2/blast-slam-3,,,https://encrypted-tbn0.gstatic.com/images?q=tb...,https://encrypted-tbn0.gstatic.com/images?q=tb...


In [4]:
# Collect every value of the loaded JSON object into a list.
# NOTE(review): the next cell reassigns `documents` from df['Desc'], so when the
# notebook runs top to bottom this list is never used -- confirm whether this
# cell is still needed or should bind a different name.
documents = list(data.values())

In [5]:
# Corpus for vectorization: the event descriptions, with missing entries
# replaced by empty strings so every row is a str.
documents = df["Desc"].fillna("")

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
import numpy as np

In [7]:
# TF-IDF features: English stop words removed, terms kept only if they occur
# in at least 5 documents (min_df) and in at most half of them (max_df).
vectorizer = TfidfVectorizer(stop_words="english", min_df=5, max_df=0.5)

# Time only the fit/transform step, not the construction above.
t0 = time()
X_tfidf = vectorizer.fit_transform(documents)
elapsed = time() - t0

n_samples, n_features = X_tfidf.shape
print(f"vectorization done in {elapsed:.3f} s")
print(f"n_samples: {n_samples}, n_features: {n_features}")

vectorization done in 0.022 s
n_samples: 489, n_features: 308


In [8]:
# Density of the TF-IDF matrix: its `nnz` count divided by rows * columns.
total_cells = np.prod(X_tfidf.shape)
print(f"{X_tfidf.nnz / total_cells:.3f}")

0.023


In [9]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# LSA pipeline, applied in this order:
#   hashed term features (50,000 buckets, English stop words removed)
#   -> TF-IDF re-weighting
#   -> truncated SVD down to 100 components (fixed seed)
#   -> Normalizer, operating in place (copy=False)
lsa_steps = [
    HashingVectorizer(stop_words="english", n_features=50_000),
    TfidfTransformer(),
    TruncatedSVD(n_components=100, random_state=0),
    Normalizer(copy=False),
]
lsa_vectorizer = make_pipeline(*lsa_steps)

# Time the full fit + transform of the pipeline on the corpus.
t0 = time()
X_hashed_lsa = lsa_vectorizer.fit_transform(documents)
elapsed = time() - t0
print(f"vectorization done in {elapsed:.3f} s")

vectorization done in 5.623 s


In [10]:
# Display the shape of the LSA output; the second dimension should equal the
# n_components=100 requested from TruncatedSVD.
X_hashed_lsa.shape

(489, 100)

In [11]:
from sklearn.cluster import KMeans

# Fit 4-cluster KMeans under five different seeds and print the cluster sizes
# for each run, to see how much the partition depends on initialisation.
# After the loop, `kmeans` stays bound to the last fit (seed 4), and that is
# the model any later cell reading `kmeans` will see.
for seed in range(5):
    kmeans = KMeans(n_clusters=4, max_iter=100, n_init=5, random_state=seed)
    kmeans.fit(X_hashed_lsa)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [140  55 122 172]
Number of elements assigned to each cluster: [ 89 245 112  43]
Number of elements assigned to each cluster: [134  71  48 236]
Number of elements assigned to each cluster: [110  47 122 210]
Number of elements assigned to each cluster: [251  99  49  90]


In [12]:
# Pair each document with the cluster label at the same position, giving a
# list of (label, document) tuples. Uses whichever model `kmeans` is currently
# bound to.
clusters = list(zip(kmeans.labels_, documents))

In [13]:
from nltk.tokenize import RegexpTokenizer

In [14]:
# Regex-based tokenizer built on the pattern \w+ (runs of word characters);
# used below to split each document into tokens.
tokenizer = RegexpTokenizer(r'\w+')

In [15]:
# Build one vocabulary per cluster: unqiue_sets[i] is the set of distinct
# tokens appearing in any document labelled i. Labels are taken to run from
# 0 through the largest label present in kmeans.labels_.
unqiue_sets = []
for idx in range(max(kmeans.labels_) + 1):
    current_docs = [text for label, text in clusters if label == idx]
    all_words = set()
    for document in current_docs:
        # Accumulate this document's tokens; the set drops duplicates.
        all_words.update(tokenizer.tokenize(document))
    unqiue_sets.append(all_words)

In [16]:
# Unpack the per-cluster vocabularies into four names. This requires
# unqiue_sets to hold exactly four sets (matching n_clusters=4 in the KMeans
# cell); any other length raises ValueError here.
s1,s2,s3,s4 = unqiue_sets

In [17]:
from itertools import combinations