# Abstract

 Word2Vec을 활용하여 연도별로 GNI 코퍼스 내의 keywords들을 Clusturing한다.

In [217]:
# import nltk
import nltk
from nltk.corpus import *
from nltk import *

# libraries for word clustering
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import joblib

# import libraries for preprocessing
import re

# libraries for visualization
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

import time

In [218]:
# using sentence tokenized gni corpus.
corpus_root = 'C:/Users/Maeg/nltk_data/Genomics-Informatics-Corpus-master/Genomics-Informatics-Corpus-master/GNI Corpus 1.0/sentence_tokenized'
GNIcorpus = PlaintextCorpusReader(corpus_root,'.*\.txt',encoding='utf-8')


# 전처리

In [219]:
# preprocessing each sentence
def gni_preprocessed_sents(gni_sents):
    
    # load stopwords and initialize preprocessed sentences list
    stops = set(stopwords.words("english"))
    wnl = nltk.WordNetLemmatizer()
    gni_pre_sents = []
    
    # remove non-letter words and stop words in each sentence
    for s in gni_sents :
        sent = []
        for w in s :
            if (re.match(r'[a-zA-Z0-9]+',w) !=  None) & (not w in stops) :
                sent.append(wnl.lemmatize(w.lower()))
        gni_pre_sents.append(sent)
    
    # return preprocessed sentences list
    return gni_pre_sents

In [220]:
# preprocessing GNI corpus
print("Start preprocessing... : G&I sentences")
gni_sents = gni_preprocessed_sents(GNIcorpus.sents())
print("End preprocessing: G&I sentences")

Start preprocessing... : G&I sentences
End preprocessing: G&I sentences


# Modeling and Training Word2Vec 

In [221]:
n_features = 300       # the number of Word2Vector Dimension
min_wordcount = 40     # Words which counts at least 20 times are available
n_workers = 4          # the number of parallel work
size_boundary = 10      # contextual window size
downsampling = 1e-3    # down sampling with frequent words

# Initialize and train Word2Vec model
print("Start Training ...")
model = Word2Vec(gni_sents,
                 workers = n_workers,
                 size = n_features,
                 min_count = min_wordcount,
                 window = size_boundary,
                 sample = downsampling)
print("End Training ")

# manage memory
model.init_sims(replace=True)

# save trained model
model.save("GNI_"+str(n_features)+"features_"+str(min_wordcount)+"minwords_"+str(size_boundary)+"window")

Start Training ...
End Training 


# Word Clustering

Word2Vec으로 얻은 각 단어의 word vector를 이용하여 word cluster한다.

In [222]:
# clustering start time
start = time.time()
print("Start: Clustering")

# word vector of word2Vec model
gniwv = model.wv

# set the number of cluster
num_clusters = (int)(len(gniwv.vocab) / 5)

# Initialize a k-means object and use it to extract centroids(cluster)
kmeans_clustering = KMeans( n_clusters = num_clusters)
index = kmeans_clustering.fit_predict(gniwv.vectors)

end = time.time()
clustering_time = end - start
print("End  : Clustering")

# print Clustering information
print("\n============== Clustering Information ==============")
print("Clustering time:", clustering_time)
print("# of Word vectors:", len(gniwv.vocab))
print("# of Clusters:", num_clusters)

Start: Clustering
End  : Clustering

Clustering time: 38.91790795326233
# of Word vectors: 2191
# of Clusters: 438


In [223]:
# save the cluster models
joblib.dump(kmeans_clustering, 'GNI_words_cluster.pkl')

['GNI_words_cluster.pkl']

In [224]:
# word - cluster number matching
word_centroid_map = dict(zip(gniwv.index2word, index))

# Print whole clutering results
for cluster in range(num_clusters) :
    print('\nCluster',cluster)
    words = []
    for k, c in word_centroid_map.items():
        if c == cluster :
            words.append(k)
    print(words)


Cluster 0
['still', 'microbial', 'limited', 'advance', 'epigenome', 'effort', 'metagenomic', 'technical', 'enabled']

Cluster 1
['at', 'degree', 'similarly', 'circle', 'fish', 'black']

Cluster 2
['altered', 'oncogenic']

Cluster 3
['standard', 'initial', 'obtain', 'check', 'normalization', 'metric']

Cluster 4
['obtained', 'quality', 'count', 'after', 'call', 'removed', 'filtered']

Cluster 5
['population', 'asian', 'breed', 'cattle', 'hanwoo', 'holstein', 'african', 'pig', 'ancestry', 'east']

Cluster 6
['occurred', 'cow', 'period', 'metritis', 'prevalent', 'half']

Cluster 7
['faire', 'tss', 'upstream', 'island', 'transcribed', 'h3k4me3', 'h3k9', 'nucleosome', 'h3k4me1']

Cluster 8
['20', '30', '50', '40', '60', '80', 'mm', '70', '65']

Cluster 9
['approach', 'developed', 'knowledge', 'make', 'technique', 'computational', 'mining', 'efficient']

Cluster 10
['increase', 'negative', 'reduced', 'decrease', 'specificity']

Cluster 11
['search', 'query', 'blast']

Cluster 12
['cell', 'm

['addition', 'contrast']

Cluster 117
['repeat', 'motif', 'start', 'homologous', 'frame', 'reading', 'tandem', 'sine']

Cluster 118
['similarity', 'cluster', 'node', 'represented', 'edge']

Cluster 119
['human', 'specie', 'plant']

Cluster 120
['sample']

Cluster 121
['putative', 'zinc', 'interacting', 'duplex']

Cluster 122
['physical', 'smoking', 'alcohol', 'consumption', 'intake']

Cluster 123
['present', 'particular', 'fact', 'conclusion']

Cluster 124
['involved', 'mediated', 'activation', 'metabolism', 'formation', 'act', 'regulate', 'targeting', 'emt', 'tm4sf5', 'regulator', 'stress', 'transport', 'regulating', 'biosynthesis', 'initiation', 'transduction']

Cluster 125
['on', 'indicates', 'hand', 'tc', 'cai']

Cluster 126
['respectively', 'ci', '64', '51', 'mg', '57', '75', '85', 'dl', '83', '71', '79']

Cluster 127
['area', 'curve', 'auc', 'roc', 'operating', 'receiver']

Cluster 128
['data', 'datasets']

Cluster 129
['patient', 'normal', 'blood', 'plasma']

Cluster 130
['title

['primate', 'eukaryotic', 'throughout', 'transposable', 'numt']

Cluster 263
['see', 'summarized', 'listed']

Cluster 264
['proposed']

Cluster 265
['for']

Cluster 266
['list', 'deg']

Cluster 267
['rt', 'iii', 'transgenic']

Cluster 268
['9', '15', '16', '12', '11', '13', '14']

Cluster 269
['model', 'prediction']

Cluster 270
['sequence', 'short', 'longer', 'chromhmm']

Cluster 271
['contains', 'content', 'genus', 'occurrence', 'representing', 'respect', 'symbol', 'match']

Cluster 272
['complex']

Cluster 273
['moreover', 'generally', 'gain', 'majority', 'seems', 'determining', 'timing', 'occurring']

Cluster 274
['component']

Cluster 275
['provided', 'public', 'currently', 'biomedical', 'publication', 'repository', 'pubmed', 'guideline']

Cluster 276
['rna']

Cluster 277
['the']

Cluster 278
['wide']

Cluster 279
['system']

Cluster 280
['ethnic', 'validated', 'asia', 'replicated', 'parent', 'trend', 'pulse']

Cluster 281
['better', 'improve', 'investigate', 'evaluate', 'validate

# Multidimensional Scaling

In [None]:
# calculate distribution
dist = 1-cosine_similarity(gniwv.vectors)

# initialize MDS
MDS()

# scaling multidimensions
mds = MDS(n_components=2, dissimilarity = "precomputed", random_state=1)
pos = mds.fit_transform(dist)

xs, ys = pos[:,0], pos[:,1]

# Visualization of Word vectors

In [None]:
# set colors and names of clusters
cluster_colors = {35: '#1b9e77', 36: '#d95f02', 37: '#7570b3', 38: '#e7298a', 39: '#66a61e'}
cluster_names = {35: 'Cluster35', 
                 36: 'Cluster36', 
                 37: 'Cluster37', 
                 38: 'Cluster38', 
                 39: 'Cluster39'
                }

In [None]:
%matplotlib inline

# Create (x-position(2D), y-position(2D), word, cluster number) table
df_all = pd.DataFrame(dict(x=xs, y=ys,word=gniwv.index2word, label=index))

# Show only five Clusters, 35, 36, 37, 38, 39
condition = ( df_all['label'] >= 35 ) & ( df_all['label'] < 40 )
df = df_all[condition]

# grouping with cluster number
groups = df.groupby('label')
df

In [None]:
fig, ax = plt.subplots(figsize=(17, 9)) # set size

ax.margins(0.05)

for cluseter_n, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
                label=cluster_names[cluseter_n],
            color=cluster_colors[cluseter_n], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')

ax.legend(numpoints=1)

for i in range(len(df)):
    ax.text(df.iloc[i]['x'],
            df.iloc[i]['y'],
            df.iloc[i]['word'], size=12)  
plt.show()

# Find the cluster which include a specific word

In [None]:
# find cluster result with specipic keyword
def find_cluster_with_key(key):
    # if there is not key in vocab, do nothing
    if not key in word_centroid_map.keys() :
        print("That word is not in keys")
    # if there is key in vocab, print cluster result
    else :  
        cluster = word_centroid_map[key]
        print('\nCluster',cluster)
        words = []
        for k, c in word_centroid_map.items():
            if c == cluster :
                words.append(k)
        print(words)

In [None]:
key = input("============= Find Clustering ==============\nEnter a keyword: ")
find_cluster_with_key(key)
