In [1]:
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from collections import defaultdict
from multiprocessing import  Pool
import numpy as np
import swifter
import pandas
from pandarallel import pandarallel

In [2]:
FILE_ORIGIN = '../dblpv13.json'
FILE_PREPROCESSED = '../data.json'

In [3]:
CHUNKSIZE = 100000
MAX_OBSERVATIONS = 1000000 #Set to 55000000 if want all rows
df = pd.DataFrame()
with pd.read_json(FILE_PREPROCESSED, orient='records', lines=True, chunksize=CHUNKSIZE, nrows=MAX_OBSERVATIONS) as reader:
    for chunk_i, chunk in tqdm_notebook(enumerate(reader), total=MAX_OBSERVATIONS // CHUNKSIZE):
        df_new = pd.DataFrame(chunk)
        df_new['chunk_idx'] = chunk_i
        df = pd.concat([df, df_new], ignore_index=True)

  0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
df.head()

Unnamed: 0,_id,title,venue,year,keywords,n_citation,lang,authors,fos,page_start,...,volume,issue,issn,isbn,doi,pdf,url,abstract,references,chunk_idx
0,53e99784b7602d9701f3e3f5,3GIO.,{'type': 0},2011.0,[],0.0,en,,,,...,,,,,,,,,,0
1,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,"{'_id': '53a7297d20f7420be8bd4ae7', 'name_d': ...",2011.0,"[canopy parameters, canopy spectrum, different...",0.0,en,"[{'_id': '53f45728dabfaec09f209538', 'name': '...","[Agronomy, Moisture, Hydrology, Environmental ...",1930.0,...,,,,,10.1109/IGARSS.2011.6049503,,[http://dx.doi.org/10.1109/IGARSS.2011.6049503],Drought is the first place in all the natural ...,,0
2,53e99784b7602d9701f3e151,A solution to the problem of touching and brok...,"{'_id': '53a72a4920f7420be8bfa51b', 'name_d': ...",1993.0,"[handwriting recognition, prototypes, image se...",17.0,en,"[{'_id': '53f46797dabfaeb22f542630', 'name': '...","[Intelligent character recognition, Pattern re...",602.0,...,,,,,10.1109/ICDAR.1993.395663,,[http://dx.doi.org/10.1109/ICDAR.1993.395663],,"[53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990c...",0
3,53e99784b7602d9701f3e15d,Timing yield estimation using statistical stat...,"{'_id': '53a72e2020f7420be8c80142', 'name_d': ...",2005.0,"[sequential circuits, statistical distribution...",28.0,en,"[{'_id': '53f43b03dabfaedce555bf2a', 'name': '...","[Delay calculation, Timing failure, Monte Carl...",2461.0,...,,,,0-7803-8834-8,10.1109/ISCAS.2005.1465124,//static.aminer.org/pdf/PDF/000/423/329/timing...,"[http://dx.doi.org/10.1109/ISCAS.2005.1465124,...",As process variations become a significant pro...,"[53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27...",0
4,53e99784b7602d9701f3e161,360°,"{'_id': '5390a74a20f70186a0e8b40b', 'raw': 'AC...",2009.0,"[global high technology, daily short-distance ...",,en,"[{'_id': '53f46946dabfaec09f24b4ed', 'name': '...",,39.0,...,,,,,10.1145/1665137.1665166,,,360° represents the concerns that are addresse...,,0


In [5]:
articles_cited = defaultdict(int)
for val in df.references:
    if isinstance(val, list):
        for art in val:
            articles_cited[art] += 1

In [6]:
df_cited = pd.DataFrame([item for item in articles_cited.items()], columns=['_id', 'n_cited'])
df_cited.head()

Unnamed: 0,_id,n_cited
0,53e99cf5b7602d97025ace63,10
1,557e8a7a6fee0fe990caa63d,35
2,53e9a96cb7602d97032c459a,156
3,53e9b929b7602d9704515791,24
4,557e59ebf6678c77ea222447,30


In [7]:
df = pd.merge(df, df_cited, on='_id', how='left')

In [8]:
df.head()

Unnamed: 0,_id,title,venue,year,keywords,n_citation,lang,authors,fos,page_start,...,issue,issn,isbn,doi,pdf,url,abstract,references,chunk_idx,n_cited
0,53e99784b7602d9701f3e3f5,3GIO.,{'type': 0},2011.0,[],0.0,en,,,,...,,,,,,,,,0,
1,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,"{'_id': '53a7297d20f7420be8bd4ae7', 'name_d': ...",2011.0,"[canopy parameters, canopy spectrum, different...",0.0,en,"[{'_id': '53f45728dabfaec09f209538', 'name': '...","[Agronomy, Moisture, Hydrology, Environmental ...",1930.0,...,,,,10.1109/IGARSS.2011.6049503,,[http://dx.doi.org/10.1109/IGARSS.2011.6049503],Drought is the first place in all the natural ...,,0,
2,53e99784b7602d9701f3e151,A solution to the problem of touching and brok...,"{'_id': '53a72a4920f7420be8bfa51b', 'name_d': ...",1993.0,"[handwriting recognition, prototypes, image se...",17.0,en,"[{'_id': '53f46797dabfaeb22f542630', 'name': '...","[Intelligent character recognition, Pattern re...",602.0,...,,,,10.1109/ICDAR.1993.395663,,[http://dx.doi.org/10.1109/ICDAR.1993.395663],,"[53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990c...",0,1.0
3,53e99784b7602d9701f3e15d,Timing yield estimation using statistical stat...,"{'_id': '53a72e2020f7420be8c80142', 'name_d': ...",2005.0,"[sequential circuits, statistical distribution...",28.0,en,"[{'_id': '53f43b03dabfaedce555bf2a', 'name': '...","[Delay calculation, Timing failure, Monte Carl...",2461.0,...,,,0-7803-8834-8,10.1109/ISCAS.2005.1465124,//static.aminer.org/pdf/PDF/000/423/329/timing...,"[http://dx.doi.org/10.1109/ISCAS.2005.1465124,...",As process variations become a significant pro...,"[53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27...",0,2.0
4,53e99784b7602d9701f3e161,360°,"{'_id': '5390a74a20f70186a0e8b40b', 'raw': 'AC...",2009.0,"[global high technology, daily short-distance ...",,en,"[{'_id': '53f46946dabfaec09f24b4ed', 'name': '...",,39.0,...,,,,10.1145/1665137.1665166,,,360° represents the concerns that are addresse...,,0,


In [9]:
import pickle
from model import Model
from coauthors_model import CoAuthors

with open('model', 'rb') as picklefile:
    clustering = pickle.load(picklefile)

with open('coauthors_model', 'rb') as picklefile:
    co_authors = pickle.load(picklefile)

In [10]:
df['title_abst'] = df['title'] + ' ' + df['abstract']

In [18]:
def get_topic(x):
    if x is None or not isinstance(x, str) or len(x) < 2:
        return -1
    else:
        res = clustering.predict([x])
        return res[0][0]

In [19]:
pandarallel.initialize(progress_bar=True)
df['topic'] = df.title.parallel_apply(get_topic)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=100000), Label(value='0 / 100000')…

In [20]:
df.head()

Unnamed: 0,_id,title,venue,year,keywords,n_citation,lang,authors,fos,page_start,...,isbn,doi,pdf,url,abstract,references,chunk_idx,n_cited,title_abst,topic
0,53e99784b7602d9701f3e3f5,3GIO.,{'type': 0},2011.0,[],0.0,en,,,,...,,,,,,,0,,,0
1,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,"{'_id': '53a7297d20f7420be8bd4ae7', 'name_d': ...",2011.0,"[canopy parameters, canopy spectrum, different...",0.0,en,"[{'_id': '53f45728dabfaec09f209538', 'name': '...","[Agronomy, Moisture, Hydrology, Environmental ...",1930.0,...,,10.1109/IGARSS.2011.6049503,,[http://dx.doi.org/10.1109/IGARSS.2011.6049503],Drought is the first place in all the natural ...,,0,,The relationship between canopy parameters and...,6
2,53e99784b7602d9701f3e151,A solution to the problem of touching and brok...,"{'_id': '53a72a4920f7420be8bfa51b', 'name_d': ...",1993.0,"[handwriting recognition, prototypes, image se...",17.0,en,"[{'_id': '53f46797dabfaeb22f542630', 'name': '...","[Intelligent character recognition, Pattern re...",602.0,...,,10.1109/ICDAR.1993.395663,,[http://dx.doi.org/10.1109/ICDAR.1993.395663],,"[53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990c...",0,1.0,A solution to the problem of touching and brok...,8
3,53e99784b7602d9701f3e15d,Timing yield estimation using statistical stat...,"{'_id': '53a72e2020f7420be8c80142', 'name_d': ...",2005.0,"[sequential circuits, statistical distribution...",28.0,en,"[{'_id': '53f43b03dabfaedce555bf2a', 'name': '...","[Delay calculation, Timing failure, Monte Carl...",2461.0,...,0-7803-8834-8,10.1109/ISCAS.2005.1465124,//static.aminer.org/pdf/PDF/000/423/329/timing...,"[http://dx.doi.org/10.1109/ISCAS.2005.1465124,...",As process variations become a significant pro...,"[53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27...",0,2.0,Timing yield estimation using statistical stat...,22
4,53e99784b7602d9701f3e161,360°,"{'_id': '5390a74a20f70186a0e8b40b', 'raw': 'AC...",2009.0,"[global high technology, daily short-distance ...",,en,"[{'_id': '53f46946dabfaec09f24b4ed', 'name': '...",,39.0,...,,10.1145/1665137.1665166,,,360° represents the concerns that are addresse...,,0,,360° 360° represents the concerns that are add...,0


In [21]:
df[['_id', 'topic']].to_csv('topics.csv', index=False)

In [22]:
df.topic.value_counts()

 19    48949
 37    44810
 1     39064
 34    38606
 32    38111
 39    36549
 41    34617
 33    32959
 16    32771
 6     32391
 26    31413
 2     30446
 17    30079
 21    30075
 4     29607
 5     29500
 3     29236
 30    26567
 25    26357
 9     26194
 20    25060
 27    24917
 35    22353
 12    22250
 8     19621
 7     19190
 11    17604
 10    17275
 14    16674
 36    16483
 18    16231
 22    15215
 15    14464
 23    11769
 31    11600
 13    11288
 40    10940
 0      9979
 29     9251
 28     8979
 38     7494
 24     2958
-1       104
Name: topic, dtype: int64

In [58]:
authors_articles = defaultdict(set)
for row in df[['_id', 'authors']].itertuples():
    if isinstance(row.authors, list):
        for author in row.authors:
            if 'name' in author and author['name'] in authors:
                authors_articles[author['name']].add(row._1)

In [110]:
class RecommendationArticles:
    def __init__(self, clustering, co_authors, db, authors_articles, top_n):
        self.clustering = clustering
        self.co_authors = co_authors
        self.db = db
        self.authors_articles = authors_articles
        self.top_n = top_n
        self.columns = ['title', 'year', 'doi']

    def predict(self, text=None, author=None):
        if text and author:
            result = self._get_prediction_text_author(text, author, self.top_n, self.db)
            if len(result) < self.top_n:
                if len(result) == 0:
                    new_db = self.db
                else:
                    new_db = self.db[~self.db._id.isin(result._id)]
                result_text = self._get_prediction_text(text, self.top_n - len(result), new_db)
                result = pd.concat([result, result_text], ignore_index=True)
        elif text:
            result = self._get_prediction_text(text, self.top_n, self.db)
        else:
            result = self._get_prediction_author(text, self.top_n, self.db)
        if result.empty:
            return result.to_json(orient='records')
        return result[self.columns].to_json(orient='records')
                
    def _get_prediction_text_author(self, text, author, top_n, db):
        topic = self.clustering.predict([text])[0][0]
        co_authors = self.co_authors.predict(author)
        result = pd.DataFrame()
        if co_authors and topic:
            for co_author in co_authors:
                if co_author in self.authors_articles:
                    temp_df = db.loc[db._id.isin(self.authors_articles[co_author]) & db.topic.eq(topic)]
                    result = pd.concat([result, temp_df], ignore_index=True)
        if result.empty:
            return result
        return result.nlargest(top_n, 'n_cited')

    def _get_prediction_text(self, text, top_n, db):
        topic = self.clustering.predict([text])[0][0]
        result = db.loc[db.topic.eq(topic)]
        if result.empty:
            return result
        return result.nlargest(top_n, 'n_cited')

    def _get_prediction_author(self, author, top_n, db):
        co_authors = self.co_authors.predict(author)
        result = pd.DataFrame()
        for co_author in co_authors:
            if co_author in self.authors_articles:
                temp_df = db.loc[db._id.isin(self.authors_articles[co_author])]
                result = pd.concat([result, temp_df], ignore_index=True)
        if result.empty:
            return result
        return result.nlargest(top_n, 'n_cited')

In [111]:
recommendation_articles = RecommendationArticles(clustering, co_authors, df[['_id', 'title', 'year', 'doi', 'n_cited', 'topic']], authors_articles, 10)

In [112]:
recommendation_articles.predict('Machine Learning, Deep LEarning', 'Yunquan Zhang')

'[{"title":"The Application of AE Signal in Early Cracked Rotor Fault Diagnosis with PWVD and SVM.","year":2011.0,"doi":"10.4304\\/jsw.6.10.1969-1976"},{"title":"Learning, indexing, and diagnosing network faults","year":2009.0,"doi":"10.1145\\/1557019.1557113"},{"title":"Reinforcement Learning: An Introduction","year":1998.0,"doi":"10.1109\\/TNN.1998.712192"},{"title":"Reinforcement learning: a survey","year":1996.0,"doi":"10.1613\\/jair.301"},{"title":"Image retrieval: Ideas, influences, and trends of the new age","year":2008.0,"doi":"10.1145\\/1348246.1348248"},{"title":"Technical Note Q-Learning","year":1992.0,"doi":"10.1023\\/A:1022676722315"},{"title":"Goal-directed requirements acquisition","year":1993.0,"doi":"10.1016\\/0167-6423(93)90021-G"},{"title":"Reflections on NoteCards: seven issues for the next generation of hypermedia systems","year":2001.0,"doi":"10.1145\\/507317.507321"},{"title":"On the Difference between Updating a Knowledge Base and Revising It","year":1991.0,"doi

In [113]:
import pickle
with open('articles_model', 'wb') as picklefile:
    pickle.dump(recommendation_articles, picklefile)