In [1]:
import numpy as np

from scipy.cluster import hierarchy as hac

import matplotlib.pyplot as plt
import pickle

# TF-IDF function

In [2]:
def tfidf(count_matrix):
    ''' Re-weights a 2-D matrix of term occurrence counts and
        returns a tf-idf style matrix of the same dimensions.

        Every entry is divided by the total of its column (the
        "tf" factor) and multiplied by
        log(n_rows / number of non-zero entries in its row)
        (the "idf" factor).

        Columns whose total is zero and rows without any non-zero
        entry come out as zeros instead of NaN/inf.

        NOTE(review): this docstring used to describe the input as
        'doc-term' (N_doc x N_term), but the arithmetic normalises
        per column and weights per row, while the "number of
        documents" is taken from shape[0]. Callers in this notebook
        pass both orientations. The axis handling is deliberately
        left unchanged -- confirm which orientation is intended.

        Raises ValueError if count_matrix is not 2-D.'''

    count_matrix = np.asarray(count_matrix, dtype=float)
    # Unpacking keeps the original failure mode for non 2-D input.
    n_rows, _ = count_matrix.shape

    # "tf": each entry relative to its column total. Where the total is
    # zero the division is skipped and the prefilled 0 is kept (avoids 0/0).
    column_totals = np.sum(count_matrix, axis=0)
    tf = np.divide(count_matrix,
                   column_totals,
                   out=np.zeros_like(count_matrix),
                   where=column_totals != 0)

    # "idf": log of the inverse fraction of non-zero entries in each row.
    # Rows without non-zero entries keep idf = 0 (avoids log(1/0) = inf,
    # which would turn the row into NaN once multiplied by tf = 0).
    nonzero_per_row = np.count_nonzero(count_matrix, axis=1)
    has_entries = nonzero_per_row > 0
    idf = np.zeros(n_rows)
    idf[has_entries] = np.log(np.reciprocal(nonzero_per_row[has_entries] / n_rows))

    return tf * idf[:, np.newaxis]

# Create dictionaries and Count Matrix

In [3]:
# Read the training comments, one comment per line
# (10000 is passed as open()'s buffering argument).
with open('data.train', 'r', 10000) as comments_file:
    lines = comments_file.read().splitlines()

print('Comments:')
print(len(lines))

# Index <-> token maps built from the vocabulary file. The first three lines
# are skipped, so vocabulary line n (n >= 3) receives index n - 3.
# NOTE(review): the skipped lines are presumably special tokens -- confirm.
with open('vocab.bpe.from', 'r') as vocab_file:
    keys = vocab_file.read().splitlines()

voc_dict = {}
voc_dict_inv = {}
for index, key in enumerate(keys[3:]):
    voc_dict[index] = key
    voc_dict_inv[key] = index

# The reported size counts every line of the file, skipped ones included.
print('\nVocabulary size:')
print(len(keys))

# One subreddit label per line, aligned line by line with the comments.
with open('subreddits.train', 'r') as labels_file:
    ls = labels_file.read().splitlines()

# zip() below would silently drop the surplus lines of the longer file.
if len(ls) != len(lines):
    raise ValueError('data.train has %d lines but subreddits.train has %d'
                     % (len(lines), len(ls)))

# sorted() makes the label -> row assignment identical on every run; iterating
# a bare set of strings depends on the interpreter's per-process hash seed.
label_names = sorted(set(ls))

# print(label_names)

labels_dict = dict(enumerate(label_names))
labels_dict_inv = {label_name: index for index, label_name in labels_dict.items()}

print('\nLabels dictionary size:')
print(len(labels_dict))
# print(labels_dict_inv)

# Count matrix: one row per subreddit label, one column per vocabulary index.
# The width 15000 is hard-coded here and again in later cells.
subredditsXvocwords = np.zeros([len(labels_dict), 15000])

for comment, label in zip(lines, ls):
    label_row = labels_dict_inv[label]
    # Tokens are separated by single spaces; a token missing from the
    # vocabulary raises KeyError.
    for w in comment.split(' '):
        subredditsXvocwords[label_row, voc_dict_inv[w]] += 1

# Release the raw text, it is no longer needed once the counts exist.
del lines
del ls

Comments:
5621488

Vocabulary size:
15003

Labels dictionary size:
638


# Clustering

In [4]:
# Weight the raw subreddit-by-word counts, then build the agglomerative merge
# tree over the weighted rows (complete linkage on cosine distance).
subreddit_tf_idf = tfidf(subredditsXvocwords)
z = hac.linkage(
    subreddit_tf_idf,
    method='complete',
    metric='cosine',
)

In [5]:
# Replays the first (number of labels - N) merges recorded in z, so that
# exactly N non-empty member lists are left in m.

N = 40

n_labels = len(labels_dict)

# Columns 0 and 1 of every linkage row hold the ids of the two merged nodes.
merge_pairs = np.zeros([n_labels - 1, 2])
merge_pairs[:, :2] = z[:, :2]
merge_pairs = merge_pairs.astype(int).tolist()

# m[node_id] -> original row indices currently grouped under that node.
# Ids below n_labels are single rows; merge number s creates id n_labels + s.
m = [[leaf] for leaf in range(n_labels)] + [[] for _ in range(n_labels - 1)]

for step in range(n_labels - N):
    merged_id = step + n_labels
    for child in merge_pairs[step]:
        # A leaf contributes its own index, an inner node its gathered members.
        members = [child] if child < n_labels else m[child]
        m[merged_id].extend(members)
        # The child is absorbed by the new node, so its slot is emptied.
        m[child] = []

### Uncomment below to print dendrograms

In [6]:
# # prints N dendrograms (one per Cluster)

# subreddit_groupsXvocwords = np.zeros([N, 15000])

# counter=1
# subreddits_groups = []
# for t in m:
#     if t!=[]:
#         print('\n Cluster '+str(counter) + ' with '+ str(len(t)) +' subreddits')
#         for s in t:
#             print(labels_dict[s], end =', ')
#         if len(t) > 1 :
#             matrix = np.zeros([len(t),15000])
#             matrix_labels = []
#             for n, s in enumerate(t):
#                 matrix_labels.append(labels_dict[s])
#                 matrix[n,:] += subreddit_tf_idf[s,:]
#             sub_z = hac.linkage(matrix, method = 'complete', metric = 'cosine')
            
# #             mpl.rcParams['axes.titlesize'] = 60

#             if len(t)>150:
#                 plt.figure(figsize=(10, 60))
#                 hac.dendrogram(sub_z, labels = matrix_labels, orientation = 'left')
#                 ax = plt.gca()
#                 ax.tick_params(axis='x', which='major', labelsize=15)
#                 ax.tick_params(axis='y', which='major', labelsize=18)
#                 plt.title('Hierarchical Clustering Dendrogram of Cluster '+ str(counter), fontsize = 25)
#     #             plt.xlabel('Subreddits')
#                 plt.ylabel('Distance between Subreddits')
                
#             else:
#                 plt.figure(figsize=(25, 8))
#                 if len(t)>=10 and len(t)<30:   
#                     hac.dendrogram(sub_z, labels = matrix_labels, leaf_rotation=45.)
#                 else:
#                     hac.dendrogram(sub_z, labels = matrix_labels)
#                 ax = plt.gca()
#                 ax.tick_params(axis='x', which='major', labelsize=18)
#                 ax.tick_params(axis='y', which='major', labelsize=15)
#                 plt.title('Hierarchical Clustering Dendrogram of Cluster '+ str(counter), fontsize = 25)
#     #             plt.xlabel('Subreddits')
#                 plt.ylabel('Distance between Subreddits')

#             plt.show()
                
#             subreddits_groups.append('Cluster '+str(counter))
#         else:
#             subreddits_groups.append(t)
            
#         counter+=1               

# HAC vectors

In [7]:
# One row of pooled raw word counts per non-empty cluster in m, filled in the
# order in which the clusters appear in m.
subreddits_groups = []
subreddit_groupsXvocwords = np.zeros([N, 15000])

counter = 0
for members in m:
    if not members:
        # Slot of a node that was merged away (or never created).
        continue
    # Sum of the count rows of every subreddit belonging to this cluster.
    subreddit_groupsXvocwords[counter, :] = subredditsXvocwords[members, :].sum(axis=0)
    counter += 1

In [8]:
# Re-weight the pooled cluster counts. After the transpose the rows index
# vocabulary entries and the columns index the N clusters.
HAC_matrix = tfidf(np.transpose(subreddit_groupsXvocwords))

# HAC_matrix_ordered
HAC_matrix_ordered = np.zeros([15000, N])

# Pre-fill the first three rows with a constant value. The width follows N;
# the former literal 40 only matched these rows while N == 40.
# NOTE(review): 0.025 is presumably 1/40 -- confirm whether it should track N.
HAC_matrix_ordered[:3, :] = 0.025

# Vocabulary line i (i > 2) is accumulated into row i - 3.
# NOTE(review): this places the first three accumulated entries on the rows
# prefilled above, so those rows end up as 0.025 + weight -- confirm that
# this offset (and the 15000-row height) is what the consumer expects.
with open('vocab.bpe.from', 'r') as f:
    keys = f.read().splitlines()

for i, key in enumerate(keys):
    if i > 2:
        HAC_matrix_ordered[i - 3, :] += HAC_matrix[voc_dict_inv[key], :]
    

In [9]:
# Persist the final matrix to disk as a pickle file.
with open('HAC_vectors.pickle', 'wb') as pickle_file:
    pickle.dump(HAC_matrix_ordered, pickle_file)