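* Dependencies for the model are: sklearn, numpy, keras, networkx, pyplot, wordcloud, gensim, matplotlib, collections (the graph layout additionally goes through networkx's pydot interface with the Graphviz `dot` program)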

In [1]:
import warnings
warnings.filterwarnings("ignore")
import json
import os
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import SGD
from sklearn.cluster import KMeans
from collections import defaultdict,Counter
import networkx as nx
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import gensim, logging

Using TensorFlow backend.


* Here we start the functions in the model

In [3]:
def load_data(dir,round):
    '''Read the five JSON dumps of one round and index the articles by title.

    The files ``r<round>/1.json`` .. ``r<round>/5.json`` below ``dir`` are
    parsed and the documents found under ``['result']['docs']`` are collected.

    Returns a dict mapping each document's ``title`` to its ``content``.
    When a title occurs more than once, the content read last is kept.
    '''
    documents = []
    for file_no in range(1, 6):
        json_path = os.path.join(dir, 'r%d/%d.json' % (round, file_no))
        with open(json_path, 'r') as handle:
            payload = json.load(handle)
        documents += payload['result']['docs']

    by_title = {}
    for document in documents:
        by_title[document['title']] = document['content']
    return by_title

def convert_score(nlu_dic):
    '''Flatten the Watson NLU result of every article into feature scores.

    For each article the four NLU sections (categories, concepts, entities,
    keywords) are merged into one dict ``{feature name: score}``.  If the same
    name shows up more than once, the score seen last wins.

    Returns a pair ``(by_title, in_order)``: a dict mapping each title to its
    feature dict, and a list holding the very same feature dicts in the
    iteration order of ``nlu_dic``.
    '''
    # NLU section -> (field holding the feature name, field holding its score)
    field_map = {
        'categories': ('label', 'score'),
        'concepts': ('text', 'relevance'),
        'entities': ('text', 'relevance'),
        'keywords': ('text', 'relevance'),
    }

    by_title = {}
    in_order = []
    for title in nlu_dic:
        analysis = nlu_dic[title]
        scores = {}
        for section, (name_field, score_field) in field_map.items():
            for item in analysis[section]:
                scores[item[name_field]] = item[score_field]
        in_order.append(scores)
        by_title[title] = scores

    return by_title, in_order

def autoencoder(dims, act='relu', init='glorot_uniform'):
    '''Build a fully connected autoencoder together with its encoder half.

    dims: layer widths; dims[0] is the input width and dims[-1] the width of
          the innermost (bottleneck) layer.  The decoder mirrors the encoder.
    act:  activation passed to every hidden layer.  The bottleneck layer and
          the final reconstruction layer are created without an activation
          argument.
    init: kernel initializer passed to every Dense layer.

    Returns (autoencoder, encoder): two Keras models sharing the same input;
    the first outputs the reconstruction, the second the bottleneck features.
    '''
    depth = len(dims) - 1
    inputs = Input(shape=(dims[0],), name='input')

    # Encoder: every hidden width between the input and the bottleneck.
    encoded = inputs
    for level, width in enumerate(dims[1:-1]):
        layer = Dense(width, activation=act, kernel_initializer=init, name='encoder_%d' % level)
        encoded = layer(encoded)

    # Bottleneck: the low-dimensional features are read from this layer.
    bottleneck = Dense(dims[-1], kernel_initializer=init, name='encoder_%d' % (depth - 1))
    encoded = bottleneck(encoded)

    # Decoder: the hidden widths again, walked back towards the input.
    decoded = encoded
    for level in reversed(range(1, depth)):
        layer = Dense(dims[level], activation=act, kernel_initializer=init, name='decoder_%d' % level)
        decoded = layer(decoded)
    decoded = Dense(dims[0], kernel_initializer=init, name='decoder_0')(decoded)

    full_model = Model(inputs=inputs, outputs=decoded, name='AE')
    encoder_model = Model(inputs=inputs, outputs=encoded, name='encoder')
    return full_model, encoder_model

def give_label(labels,data):
    '''Group article titles by the cluster label they were assigned.

    labels: one cluster label per article, in the same order as the keys of
            ``data``.
    data:   dict whose keys are the article titles.

    Returns a defaultdict(list) mapping each label to the titles carrying it,
    in their original order.  An IndexError is raised if there are more
    labels than titles.
    '''
    # Materialise the titles once; rebuilding the key list for every label
    # would make the loop quadratic in the number of articles.
    titles = list(data)

    have_label = defaultdict(list)
    for position, label in enumerate(labels):
        have_label[label].append(titles[position])
    return have_label


def get_top20(dic):
    '''Rank the features of each cluster so that cluster-specific ones come first.

    dic: maps 'cluster0' .. 'cluster9' to a mapping with the keys 'concepts',
         'categories', 'entities' and 'keywords', each holding a list of
         feature names.

    Every feature name is scored with its number of occurrences over all
    clusters and all four feature types, divided by 10.  Within a cluster the
    features are sorted by ascending score, so names that appear in many
    clusters end up last and are cut off first.

    Returns (themes, entities), two lists with exactly 10 entries (one per
    cluster index 0..9):
      themes[i]   - up to 10 (name, score) pairs from concepts + categories
      entities[i] - up to 25 (name, score) pairs from entities + keywords
    A cluster missing from ``dic`` yields an empty list.
    '''
    features = ['concepts', 'categories', 'entities', 'keywords']

    # Count every name once with a Counter instead of calling list.count()
    # per word, which was quadratic in the total number of features.
    all_words = []
    for key in dic:
        for feature in features:
            all_words.extend(list(dic[key][feature]))
    score = {word: count / 10 for word, count in Counter(all_words).items()}

    theme_dic = defaultdict(dict)
    entities_dic = defaultdict(dict)
    for key in dic:
        for theme in ('concepts', 'categories'):
            for word in dic[key][theme]:
                theme_dic[key][word] = score[word]
        for entity in ('entities', 'keywords'):
            for word in dic[key][entity]:
                entities_dic[key][word] = score[word]

    themes = []
    entities = []
    for i in range(10):
        name = 'cluster%d' % i
        # sorted() is stable, so equally scored names keep their input order.
        themes.append(sorted(theme_dic[name].items(), key=lambda item: item[1])[:10])
        entities.append(sorted(entities_dic[name].items(), key=lambda item: item[1])[:25])
    return themes, entities

def get_top5_sentence(kw,n,data):
    '''Return the five sentences most similar to the keywords of one cluster.

    kw:   indexable collection; ``kw[n]`` is iterated and the first element of
          every item is taken as a keyword (e.g. (word, score) pairs).
    n:    index of the cluster whose keywords are used.
    data: dict title -> article text; sentences are the lines of the texts.

    A Word2Vec model is trained on the articles plus the keyword tokens, each
    non-empty line is scored with ``n_similarity`` against the keyword tokens,
    and the five best scoring lines are returned (best first).  An empty list
    is returned when no keyword token is available.
    '''
    # The keyword tokens are needed before training: word2vect() adds them to
    # the corpus so that every keyword is part of the vocabulary.
    kw_list = []
    for word in kw[n]:
        kw_list.extend(word[0].lower().split())
    if not kw_list:
        return []

    model = word2vect(data, kw_list)
    # Newer gensim releases expose similarity queries on model.wv only; older
    # ones still answer on the model itself.
    vectors = getattr(model, 'wv', model)

    sentence_score = {}
    for text in data.values():
        for s in text.split('\n'):
            tokens = s.lower().split()
            if tokens:
                sentence_score[s] = vectors.n_similarity(kw_list, tokens)

    ranked = sorted(sentence_score.items(), key=lambda item: item[1], reverse=True)[:5]
    return [s for s, _ in ranked]

def get_important_feature(title_list,nlu_data):
    
        '''here we obtain most frequent features of each cluster
     we have 4 types of features, and keep 20 for each which sum to 80'''
    
    features = {'concepts' : 'text','keywords':'text'}
    features_lis = ['categories' ,'concepts' ,'entities','keywords']
    
    entities_filter = ['Location','Person']
    keywords_filter = ['Mr','Ms']
    
    C = defaultdict(list)
    result = defaultdict(list)
    for title in title_list:
        for i in range (len(nlu_data[title]['concepts'])):        
            C['concepts'].append(nlu_data[title]['concepts'][i]['text'])
                
        for i in range (len(nlu_data[title]['keywords'])): 
            for fil in keywords_filter:
                if fil not in nlu_data[title]['keywords'][i]['text']:
                    C['keywords'].append(nlu_data[title]['keywords'][i]['text'])
                
        for i in range (len(nlu_data[title]['categories'])):        
            C['categories'].extend(nlu_data[title]['categories'][i]['label'].split('/')[-2:])
                
        for i in range (len(nlu_data[title]['entities'])):
            if nlu_data[title]['entities'][i]['type'] not in entities_filter:
                C['entities'].append(nlu_data[title]['entities'][i]['text'])
                
    for feature in features_lis:
        tem = Counter(C[feature]).most_common(20)
            
        tem_list = []
    
        for i in range(20):
            tem_list.append(tem[i][0])
        result[feature] = tem_list
    
    return result


def init_visualisation(themes,entities,convert_dic,have_label):
    '''Build the weighted edges of the relation graph of every cluster.

    themes, entities: per cluster, a list of (name, score) pairs; only the
                      names are used here.
    convert_dic:      dict title -> {feature name: score}.
    have_label:       dict cluster label -> list of article titles.

    For cluster i the dict ``edges['cluster%d' % i]`` contains
      (theme, entity)          -> number of articles of the cluster whose
                                  features contain both names,
      (entity, article index)  -> score of that entity in the article, where
                                  the article index is the position of the
                                  title inside ``have_label[i]``.
    A feature name like 'a/b/c' counts as containing 'a', 'b' and 'c'.  If an
    entity is only part of such a longer feature name, the score of the first
    feature name containing it is used.

    Edges with value 0 are dropped.  Afterwards every value >= 1 is divided
    by the largest value of its cluster, so all weights end up in (0, 1].
    Counts and scores are told apart by value only, hence a score of 1 or
    more is rescaled as well.

    Returns a defaultdict(dict) keyed by 'cluster<i>'.
    '''
    # title -> position of the article inside its cluster
    havelabel = defaultdict(dict)
    for label, titles in have_label.items():
        for position, title in enumerate(titles):
            havelabel[label][title] = position

    # title -> every '/'-separated part of its feature names (set: fast lookup)
    feature_bag = defaultdict(set)
    for title, scores in convert_dic.items():
        for feature in scores:
            feature_bag[title].update(feature.split('/'))

    edges = defaultdict(dict)
    for i in range(len(themes)):
        edges['cluster%d' % i] = {(theme[0], entity[0]): 0
                                  for theme in themes[i] for entity in entities[i]}

    for i in range(len(themes)):
        cluster_edges = edges['cluster%d' % i]
        members = havelabel[i]

        # theme -> entity: count the articles mentioning both.
        for title in members:
            bag = feature_bag[title]
            for relation in cluster_edges:
                if relation[0] in bag and relation[1] in bag:
                    cluster_edges[relation] += 1

        # entity -> article: weight is the entity's score in that article.
        for title, position in members.items():
            bag = feature_bag[title]
            for entity in entities[i]:
                name = entity[0]
                if name not in bag:
                    continue
                scores = convert_dic[title]
                if name in scores:
                    cluster_edges[(name, position)] = scores[name]
                else:
                    # The entity is only a fragment of a longer feature name:
                    # scan ALL names and stop at the first one containing it.
                    for key, score in scores.items():
                        if name in key:
                            cluster_edges[(name, position)] = score
                            break

    for i in range(len(themes)):
        name = 'cluster%d' % i
        kept = {k: v for k, v in edges[name].items() if v != 0}
        if kept:
            # Take the maximum once, before any value is rescaled; otherwise
            # later edges would be divided by an already normalised maximum.
            top = max(kept.values())
            for k, v in kept.items():
                if v >= 1:
                    kept[k] = v / top
        edges[name] = kept
    return edges

def draw_cluster(edges,n,path):
    '''Draw the relation graph of cluster ``n`` and save it as a PNG.

    edges: dict 'cluster<i>' -> {(source, target): weight}; the weight is
           used as the alpha (opacity) of the drawn edge.
    n:     index of the cluster to draw.
    path:  output folder; the image is written to
           ``<path>/cluster<n>/cluster<n>.png`` (folders are created).

    The figure is closed after saving so that repeated calls do not pile up
    open figures.
    '''
    cluster_edges = edges['cluster%d' % n]

    # Only the node pairs go into the graph; the weights are looked up in
    # cluster_edges while drawing, so no bulky attribute is attached to the
    # edges that are handed to the layout engine.
    G = nx.DiGraph()
    G.add_edges_from(cluster_edges)

    fig = plt.figure(figsize=(100, 15))
    try:
        p = nx.drawing.nx_pydot.graphviz_layout(G, prog='dot')
        nx.draw_networkx_nodes(G, p, node_size=3000, node_color='#F8F8FF')
        # One call per edge because every edge gets its own opacity.
        for source, target in G.edges():
            nx.draw_networkx_edges(G, p, edgelist=[(source, target)],
                                   alpha=cluster_edges[(source, target)],
                                   width=3, edge_color='#4682B4')
        nx.draw_networkx_labels(G, p, font_size=14)
        plt.axis('off')

        newpath = path + "/cluster%d/" % n
        os.makedirs(newpath, exist_ok=True)
        plt.savefig(newpath + "cluster%d.png" % n)
    finally:
        # Release the (very large) figure even if layout or saving failed.
        plt.close(fig)
    
def draw_kw(edges,k,kw,path):
    '''Draw the relation graph around one theme of cluster ``k`` and save it.

    edges: dict 'cluster<i>' -> {(source, target): weight}; the weight is
           used as the alpha (opacity) of the drawn edge.
    k:     index of the cluster.
    kw:    name of the theme; also used as the file name of the image.
    path:  output folder; the image is written to
           ``<path>/cluster<k>/keywords/<kw>.png`` (folders are created).

    The sub-graph holds every edge that starts at ``kw`` or at the target of
    an edge touching ``kw``.  The figure is closed after saving so that
    repeated calls do not pile up open figures.
    '''
    cluster_edges = edges['cluster%d' % k]

    # Nodes whose outgoing edges are shown: kw itself plus the target of
    # every edge that has kw at either end.
    sources = {relation[1] for relation in cluster_edges if kw in relation}
    sources.add(kw)
    kw_dic = {pair: weight for pair, weight in cluster_edges.items() if pair[0] in sources}

    # Only the node pairs go into the graph; weights are looked up in kw_dic
    # while drawing.
    g = nx.DiGraph()
    g.add_edges_from(kw_dic)

    fig = plt.figure(figsize=(100, 10))
    try:
        pp = nx.drawing.nx_pydot.graphviz_layout(g, prog='dot')
        nx.draw_networkx_nodes(g, pp, node_size=3000, node_color='#F8F8FF')
        # One call per edge because every edge gets its own opacity.
        for source, target in g.edges():
            nx.draw_networkx_edges(g, pp, edgelist=[(source, target)],
                                   alpha=kw_dic[(source, target)],
                                   width=3, edge_color='#4682B4')
        nx.draw_networkx_labels(g, pp, font_size=14)
        plt.axis('off')

        newpath = path + "/cluster%d/keywords/" % k
        os.makedirs(newpath, exist_ok=True)
        plt.savefig(newpath + "%s.png" % kw)
    finally:
        # Release the (very large) figure even if layout or saving failed.
        plt.close(fig)
    

def word2vect(data,kw_list):
    '''Train a Word2Vec model on the articles and return it.

    data:    dict whose values are the article texts; every text is
             lower-cased and split on whitespace into one training sentence.
    kw_list: list of keyword tokens, appended as one more training sentence
             so that the keywords are part of the vocabulary.

    Two fixed extra tokens ('surety' and 'n') are added the same way.  With
    ``min_count=1`` no word is dropped from the vocabulary.
    '''
    corpus = [text.lower().split() for text in list(data.values())]
    corpus.append(['surety', 'n'])
    corpus.append(kw_list)
    return gensim.models.Word2Vec(corpus, min_count=1)



def get_result(nlu_data,data,path):
    '''Run the whole pipeline for one round and write all outputs.

    nlu_data: dict title -> Watson NLU result of the article.
    data:     dict title -> article text.
    path:     output folder (with trailing separator); the cluster figures
              are saved below it and the summary to ``<path>result.json``.

    Steps: turn the NLU features into a score matrix, compress it with an
    autoencoder, cluster the encoded articles with KMeans, extract themes and
    entities per cluster, draw the relation graphs and pick the five most
    representative sentences per cluster.

    Returns a dict 'Cluster <i>' -> {'theme': [...], 'entities': [...],
    'Sentences': [...]}, which is also what is written to result.json.
    '''
    di, li = convert_score(nlu_data)       # convert features to scores
    DV = DictVectorizer(sparse=False)
    x = DV.fit_transform(li)               # feature matrix, one row per article

    dims = [x.shape[-1], 500, 500, 2000, 10]    # layer widths of the autoencoder
    AE, encoder = autoencoder(dims, act='relu', init='glorot_uniform')
    AE.compile(optimizer='adam', loss='mse')
    AE.fit(x, x, batch_size=30, epochs=5, verbose=0)    # train autoencoder
    x_encoder = encoder.predict(x)         # low-dimensional representation

    # get_top20() always produces exactly 10 clusters' worth of output, so
    # this value has to stay 10 unless that function is changed as well.
    n_clusters = 10
    kmeans = KMeans(n_clusters=n_clusters, max_iter=600).fit(x_encoder)
    have_label = give_label(kmeans.labels_, nlu_data)   # cluster -> titles

    important_feature = {'cluster%d' % i: get_important_feature(have_label[i], nlu_data)
                         for i in range(n_clusters)}

    themes, entities = get_top20(important_feature)
    edges = init_visualisation(themes, entities, di, have_label)

    # get_top5_sentence() indexes its keyword argument by cluster and reads
    # the first element of every item as a keyword, so it needs one list of
    # (name, score) pairs per cluster - not one flat list of names.
    cluster_keywords = [themes[i] + entities[i] for i in range(n_clusters)]

    result = defaultdict(dict)
    for i in range(n_clusters):            # outputs of every cluster
        draw_cluster(edges, i, path)

        theme_names = [m[0] for m in themes[i]]
        entity_names = [m[0] for m in entities[i]]
        result['Cluster %d' % i]['theme'] = theme_names
        result['Cluster %d' % i]['entities'] = entity_names

        for theme in theme_names:
            draw_kw(edges, i, theme, path)

        result['Cluster %d' % i]['Sentences'] = get_top5_sentence(cluster_keywords, i, data)

    with open(path+'result.json','w') as f:      # save results of each cluster
        json.dump(result, f)
    return result

* Load data

In [4]:
# Root folder of the original article dumps.  load_data() reads the files
# r<round>/1.json .. r<round>/5.json below this folder.
dir='/Users/tommy/Desktop/COMP5703/data/'         #modify original data path here

# Articles (title -> content) of rounds 3, 4 and 5.
data_r3 = load_data(dir,3)
data_r4 = load_data(dir,4)
data_r5 = load_data(dir,5)

# Watson NLU output of the same three rounds, read from the current working
# directory (presumably keyed by article title - verify against get_result).
with open('data_r3_nlu.json','r') as d:         #load watson_nlu data
    data_r3_nlu = json.load(d)
with open('data_r4_nlu.json','r') as d:
    data_r4_nlu = json.load(d)
with open('data_r5_nlu.json','r') as d:
    data_r5_nlu = json.load(d)

* Run model

In [None]:
%%time
 
# Output folder of this run: get_result() saves the cluster figures below it
# and writes result.json into it.
path = dir+'round3/'         #modify folder name 'round3' 'round4' 'round5'

# Run the pipeline on one round; pass the NLU dict and the article dict of the same round.
result = get_result(data_r3_nlu,data_r3,path)    #modify variables 'data_r?_nlu' and 'data_r?'