# Word2Vec Model on Abstracts from Citation Network Dataset

In [1]:
from nltk.corpus import stopwords
import pandas as pd
import string

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') #ignoring the gensim warning related to the OS

from gensim.models import Word2Vec

In [2]:
# Clean text function
def clean_text(text):
    def replace_punctuation(sentence):
        if(pd.isnull(sentence)):
            return ""
        punctuation = string.punctuation
        for p in punctuation:
            sentence = sentence.replace(p, '')
        return sentence
    # Remove punctuation
    text = text.apply(replace_punctuation)
    # Remove numbers
    text = text.apply(lambda x: ''.join([c for c in x if not c.isdigit()]))
    # Remove words of length <= 1
    text = text.apply(lambda x: ' '.join([word for word in x.split() if len(word) > 1]))
    # Convert to lower case
    text = text.apply(lambda x: x.lower() if pd.notnull(x) else '')
    # Remove stopwords
    text = text.apply(lambda x: ' '.join([xi for xi in x.split() if xi not in stopw]))
    
    return text

In [3]:
json_cit_parts = range(4)
df_json_cit = pd.DataFrame()
for i in json_cit_parts:
    f_json_cit = path + 'dblp-ref/dblp-ref-%d.json' % i
    df_json_cit = pd.concat([df_json_cit,
                             pd.read_json(f_json_cit, lines=True)])

In [4]:
df_json_cit.shape

(3079007, 7)

In [5]:
df_json_cit.head()

Unnamed: 0,abstract,authors,id,references,title,venue,year
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",00127ee2-cb05-48ce-bc49-9de556b93346,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",001c58d3-26ad-46b3-ab3a-c1e557d16821,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011
2,This article applied GARCH model instead AR or...,"[Altaf Hossain, Faisal Zaman, Mohammed Nasser,...",001c8744-73c4-4b04-9364-22d31a10dbf1,"[2d84c0f2-e656-4ce7-b018-90eda1c132fe, a083a1b...","Comparison of GARCH, Neural Network and Suppor...",pattern recognition and machine intelligence,2009
3,,"[Jea-Bum Park, Byungmok Kim, Jian Shen, Sun-Yo...",00338203-9eb3-40c5-9f31-cbac73a519ec,"[8c78e4b0-632b-4293-b491-85b1976675e6, 9cdc54f...",Development of Remote Monitoring and Control D...,,2011
4,,"[Giovanna Guerrini, Isabella Merlo]",0040b022-1472-4f70-a753-74832df65266,,Reasonig about Set-Oriented Methods in Object ...,,1998


In [6]:
df_json_cit_sample = df_json_cit.sample(frac=1)

In [7]:
words_train = clean_text(df_json_cit_sample['abstract']) \
                .apply(lambda line: line.split() if pd.notnull(line) else [])

In [8]:
vec_len = 200

In [9]:
model = Word2Vec(words_train.values, workers=8, size=vec_len)

In [10]:
model.wv.most_similar('network', topn=10)

[(u'networks', 0.8393702507019043),
 (u'subnetwork', 0.5993532538414001),
 (u'subnet', 0.5914520025253296),
 (u'internetwork', 0.5871062874794006),
 (u'network\u2019s', 0.5801593065261841),
 (u'endhosts', 0.5662047863006592),
 (u'subnets', 0.5600781440734863),
 (u'network\u201d', 0.5502232313156128),
 (u'networking', 0.5386593341827393),
 (u'backbone', 0.5377504825592041)]

In [11]:
model.wv.most_similar('algorithm', topn=10)

[(u'method', 0.8145387172698975),
 (u'algorithms', 0.7467148303985596),
 (u'technique', 0.744985818862915),
 (u'scheme', 0.736602783203125),
 (u'procedure', 0.7336603403091431),
 (u'approach', 0.6784576177597046),
 (u'strategy', 0.6749469041824341),
 (u'algorithmthe', 0.6109606623649597),
 (u'algorithmwe', 0.5741111636161804),
 (u'algorithm\u201d', 0.5737768411636353)]

In [12]:
model.wv.most_similar('operating', topn=10)

[(u'operate', 0.5807948708534241),
 (u'unixlike', 0.5556555986404419),
 (u'operated', 0.5468459129333496),
 (u'oses', 0.5337443351745605),
 (u'unixbased', 0.5104660987854004),
 (u'operation', 0.5023017525672913),
 (u'solaris', 0.5005874633789062),
 (u'operational', 0.4986445903778076),
 (u'os', 0.4885430932044983),
 (u'rtai', 0.48448365926742554)]

In [13]:
model.wv.most_similar('science', topn=10)

[(u'sciences', 0.8066760897636414),
 (u'science\u201d', 0.7324774265289307),
 (u'scientists', 0.6917772889137268),
 (u'informatics', 0.6285247802734375),
 (u'scienceengineering', 0.6193199157714844),
 (u'scientist', 0.6017591953277588),
 (u'scientific', 0.5988705158233643),
 (u'anthropology', 0.593280553817749),
 (u'clubhouse', 0.5872084498405457),
 (u'olympiad', 0.5853327512741089)]

In [14]:
model.wv.most_similar('article')

[(u'paper', 0.9212037324905396),
 (u'chapter', 0.7979187965393066),
 (u'thispaper', 0.7398337721824646),
 (u'manuscript', 0.7245320677757263),
 (u'poster', 0.6871957182884216),
 (u'thesis', 0.686647891998291),
 (u'dissertation', 0.6739978790283203),
 (u'springerbrief', 0.6531702876091003),
 (u'work', 0.6530567407608032),
 (u'essay', 0.6500864028930664)]

Serialize trained model

In [15]:
from time import time

In [16]:
dir_models = path + 'word2vec_models/'
f_last_model = dir_models + 'abstract_model_%d' % time()
model.save(f_last_model)