## Load arXiv data into dataframes

In [None]:
import logging

import pandas as pd
import numpy as np

from keypaper.arxiv_loader import ArxivLoader
from keypaper.arxiv_analyzer import ArxivAnalyzer
from keypaper.config import PubtrendsConfig
import html

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
%matplotlib inline

In [None]:
# Build the analyzer on top of an arXiv loader using the non-test configuration.
config = PubtrendsConfig(test=False)
analyzer = ArxivAnalyzer(ArxivLoader(config))
# No search terms are given, so launch() is called without positional arguments.
# NOTE(review): presumably this loads the whole collection - confirm against ArxivAnalyzer.launch.
SEARCH_TERMS = []
log = analyzer.launch(*SEARCH_TERMS)

In [None]:
# Histogram of the 'total' column (presumably total citations per paper) with uneven, hand-picked bin edges.
analyzer.df['total'].hist(bins=[0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50])

In [None]:
# Keep every row with total > 0 but only the first 10000 rows with total == 0,
# then drop the 'crc32id' and 'aux' columns from the combined frame.
arxiv_df_not_null = analyzer.df[analyzer.df.total > 0]
arxiv_df_null = analyzer.df[analyzer.df.total == 0].head(n=10000)
arxiv_df = pd.concat([arxiv_df_not_null, arxiv_df_null]).drop(columns=['crc32id', 'aux'])

In [None]:
# Histogram of 'total' for the subsampled frame, using coarser bins than the first plot.
arxiv_df['total'].hist(bins=[0, 5, 10, 20, 30, 40, 50])

Keep only papers published in 1975-2015

In [None]:
# Combine both year bounds into a single mask built on the same frame it filters
# (the chained form applied a mask from the unfiltered frame to the filtered one,
# which pandas has to re-align), and take an explicit copy so that columns can be
# added to the result later without writing to a slice of arxiv_df.
arxiv_df_years = arxiv_df[(1975 <= arxiv_df.year) & (arxiv_df.year <= 2015)].copy()

### Check authors 

In [None]:
from keypaper.utils import split_df_list
def popular_authors(df, n=20, current=0, task=None):
    """Return the ``n`` authors that appear on the most rows of ``df``.

    ``df['authors']`` is split on ', ' via ``split_df_list``; rows whose
    value is '', -1 or NaN are ignored.

    :param df: DataFrame with an 'authors' column.
    :param n: number of top authors to return.
    :param current: accepted for interface compatibility; not used here.
    :param task: accepted for interface compatibility; not used here.
    :return: DataFrame with columns 'author' and 'sum' (number of rows the
        author appears on), sorted by 'sum' in descending order.
    """
    author_stats = df[['authors']].copy()
    # Reassign instead of calling replace/dropna with inplace=True on a column
    # selection: the chained in-place form is not guaranteed to write back to
    # the frame (it is a no-op under pandas copy-on-write).
    author_stats['authors'] = author_stats['authors'].replace({'': np.nan, -1: np.nan})
    author_stats = author_stats.dropna(subset=['authors'])

    author_stats = split_df_list(author_stats, target_column='authors', separator=', ')
    author_stats = author_stats.rename(columns={'authors': 'author'})

    # One row per author after the split, so a single size() is the paper count;
    # the previous second groupby/sum over already-unique authors changed nothing.
    author_stats = author_stats.groupby('author').size().reset_index(name='sum')
    author_stats = author_stats.sort_values(by=['sum'], ascending=False)

    return author_stats.head(n=n)

def popular_journals(df, n=20):
    """Return the ``n`` journals that appear on the most rows of ``df``.

    Rows whose 'journal' value is missing or the empty string are ignored.

    :param df: DataFrame with a 'journal' column.
    :param n: number of top journals to return.
    :return: DataFrame with columns 'journal' and 'sum' (number of rows per
        journal), sorted by 'sum' in descending order.
    """
    # Filter out unnamed journals up front with a boolean mask; the previous
    # chained ``replace(..., inplace=True)`` on a column selection is not
    # guaranteed to modify the frame, which let the '' journal leak through.
    has_name = df['journal'].notna() & (df['journal'] != '')
    named = df.loc[has_name, ['journal']]

    # Journals are unique after one groupby, so no second aggregation is needed.
    journal_stats = named.groupby('journal').size().reset_index(name='sum')
    journal_stats = journal_stats.sort_values(by=['sum'], ascending=False)

    return journal_stats.head(n=n)

In [None]:
# Up to 30000 most frequent authors over the whole subsampled frame.
authors = popular_authors(arxiv_df, n=30000)[['author', 'sum']]

In [None]:
# Inspect the least frequent authors that made the cut.
authors.tail()

In [None]:
# Normalise names (lower-case, dots removed) so they can be compared with the names
# read from authorsNames.tsv - presumably that file uses the same form; verify.
authors['author'] = authors['author'].apply(lambda author: author.lower().replace('.', ''))
authors.head()

In [None]:
# Tab-separated inputs read from the current working directory.
# NOTE(review): later cells assume the two files are row-aligned - confirm.
authors_names = pd.read_csv('authorsNames.tsv', sep='\t')
authors_features = pd.read_csv('authorsFeatures.tsv', sep='\t')

In [None]:
# Preview the first rows of the per-author feature table.
authors_features.head()

In [None]:
# Same ranking as `authors`, restricted to the 1975-2015 frame, with names
# normalised (lower-case, dots removed) for the merge below.
authors_year = popular_authors(arxiv_df_years, n=30000)[['author', 'sum']]
authors_year['author'] = authors_year['author'].apply(lambda author: author.lower().replace('.', ''))

In [None]:
# Inner join: keeps only the popular authors whose normalised name also occurs in authors_names.
merged_authors_year = pd.merge(authors_year, authors_names, on='author', how='inner')

In [None]:
# Report how many of the popular authors were matched by the join above.
print(f"There are {merged_authors_year.shape[0]} authors in file out of {authors_year.shape[0]} most popular")

### Check how many papers from arXiv (1975-2015) have at least one author who is listed in the authors file

In [None]:
# Rebuild the 1975-2015 subset with one combined mask (instead of applying a mask
# from the unfiltered frame to an already filtered one) and copy it explicitly,
# because the following cells add columns to arxiv_df_years.
arxiv_df_years = arxiv_df[(1975 <= arxiv_df.year) & (arxiv_df.year <= 2015)].copy()

In [None]:
# Set of author names from the file, for O(1) membership tests in test_author_detection.
set_authors = set(authors_names['author'])

In [None]:
def test_author_detection(row):
    """Flag which authors of a paper are present in ``set_authors``.

    ``row['authors']`` is split on ', '. For papers with more than ten
    authors only the first ten and the last one are checked. Each name is
    lower-cased and stripped of dots before the lookup.

    :return: list of booleans, one per checked author, in author order.
    """
    names = row['authors'].split(', ')
    if len(names) > 10:
        # Keep the first ten authors plus the last one.
        names = names[:10] + names[-1:]

    return [name.lower().replace('.', '') in set_authors for name in names]

In [None]:
def count_any(row):
    """Return True when at least one entry of ``row.detected_authors`` is truthy."""
    flags = row.detected_authors
    return any(flags)

In [None]:
# Row-wise: per-author detection flags, then a single "any author detected" flag.
# The second apply reads the 'detected_authors' column created by the first.
arxiv_df_years['detected_authors'] = arxiv_df_years.apply(lambda row: test_author_detection(row), axis=1)
arxiv_df_years['any_detected'] = arxiv_df_years.apply(lambda row: count_any(row), axis=1)

In [None]:
# Count the papers with at least one detected author against all papers in the 1975-2015 frame.
n_papers_with_detected_authors = arxiv_df_years[arxiv_df_years['any_detected']].shape[0]
n_papers = arxiv_df_years.shape[0]
print(f'Found {n_papers_with_detected_authors} papers with detected authors (at least one of them) out of {n_papers} papers')

In [None]:
# Summary statistics of the per-author feature columns.
authors_features.describe()

In [None]:
# Index the feature table by author name. Assign the 'author' column explicitly:
# assigning the whole authors_names DataFrame to one column only works while that
# frame has exactly one column.
# NOTE(review): assumes authorsNames.tsv and authorsFeatures.tsv are row-aligned - confirm.
authors_features['name'] = authors_names['author']
authors_features.set_index('name', inplace=True)

In [None]:
# Keep only the feature rows of authors that were matched among the popular 1975-2015 authors.
features_of_needed_authors = authors_features[authors_features.index.isin(merged_authors_year.author)]

In [None]:
# {author name -> {feature name -> value}} for fast per-author lookups in add_all_authors_features.
features_dict = features_of_needed_authors.to_dict('index')

### Add topics to dataframe

In [None]:
from keypaper.utils import lda_subtopics, explain_lda_subtopics
# Number of LDA topics and the matching column names 'topic0' .. 'topic99'.
n_topics = 100
topic_names = [f'topic{i}' for i in range(n_topics)]
# NOTE(review): `topics` is presumably a (papers x n_topics) weight matrix - confirm against lda_subtopics.
topics, lda, vectorizer = lda_subtopics(arxiv_df_years, n_words=1000, n_topics=n_topics)

In [None]:
# One column per topic; the frame gets a default RangeIndex.
topics_df = pd.DataFrame(data=topics, columns=topic_names) 

In [None]:
# Attach the topic columns positionally. The paper frame is re-indexed to 0..n-1
# (its old index is kept as an 'index' column); topics_df already has a default
# RangeIndex, so it is used as is - resetting it too would only add a second,
# duplicate 'index' column to the result.
arxiv_df_years = pd.concat([arxiv_df_years.reset_index(), topics_df], axis=1)

In [None]:
# Ask for 20 top words per topic to make the topics interpretable.
explanations = explain_lda_subtopics(lda, vectorizer, n_top_words=20)

In [None]:
# Display the topic explanations.
explanations

In [None]:
# For every topic: dot product of its per-paper weight column with the 'total' column.
topic_cits = [
    np.dot(arxiv_df_years[f'topic{i}'], arxiv_df_years['total'])
    for i in range(n_topics)
]

### Preprocessing

In [None]:
from statistics import mean

In [None]:
def citations_after_n_years(row, n):
    """Sum the per-year values of ``row`` over the first ``n`` years of a paper.

    The window starts at ``row['year']`` (inclusive) and covers ``n``
    consecutive years. Years that are not keys of ``row`` contribute nothing.

    :param row: mapping-like object with a 'year' entry and year-keyed entries.
    :param n: number of years in the window.
    :return: the accumulated value, 0 when no year of the window is present.
    """
    first_year = row['year']
    window = range(first_year, first_year + n)
    return sum(row[year] for year in window if year in row)

In [None]:
def journal_preprocess(df):
    """Build the per-journal lookup tables used as features.

    :param df: DataFrame with 'journal' and 'total' columns.
    :return: tuple ``(journals, journals_citations)`` where ``journals`` has
        columns 'journal' and 'sum' (from ``popular_journals`` with n=10000)
        and ``journals_citations`` has columns 'journal' and
        'journal_citations' (sum of 'total' per journal).
    """
    journals = popular_journals(df, n=10000)[['journal', 'sum']]

    totals_per_journal = df[['journal', 'total']].groupby(['journal']).agg({'total': 'sum'})
    journals_citations = totals_per_journal.reset_index()
    journals_citations = journals_citations.rename(columns={'total':'journal_citations'})

    return journals, journals_citations

In [None]:
# Module-level lookup tables that preprocess() merges into the paper frame.
journals, journals_citations = journal_preprocess(arxiv_df_years)

In [None]:
def authors_preprocess(df):
    """Build the per-author lookup dictionaries used as features.

    :param df: DataFrame with 'authors' (', '-separated names) and 'total' columns.
    :return: tuple ``(authors_papers_dict, authors_dict)``:
        ``authors_papers_dict`` maps author -> 'sum' as reported by
        ``popular_authors`` (limited to the top 10000 authors), and
        ``authors_dict`` maps author -> sum of 'total' over the author's rows.
    """
    authors = popular_authors(df, n=10000)[['author', 'sum']]
    authors_papers_dict = authors.set_index('author')['sum'].to_dict()

    # Work on an explicit copy and reassign: in-place replace/dropna on a
    # selection of the caller's frame is not guaranteed to take effect and
    # triggers chained-assignment warnings.
    author_total = df[['authors', 'total']].copy()
    author_total['authors'] = author_total['authors'].replace({'': np.nan, -1: np.nan})
    author_total = author_total.dropna(subset=['authors'])

    # One row per (paper, author), so 'total' can be summed per author.
    author_total = split_df_list(author_total, target_column='authors', separator=', ')
    authors_citations = author_total.groupby(['authors']).agg({'total': 'sum'}).reset_index()
    authors_citations = authors_citations.loc[authors_citations['authors'] != '']
    authors_citations = authors_citations.rename(columns={'authors': 'author'})
    authors_dict = authors_citations.set_index('author')['total'].to_dict()

    return authors_papers_dict, authors_dict

In [None]:
# Module-level lookups read by get_authors_papers() and get_authors_citations().
authors_papers_dict, authors_dict = authors_preprocess(arxiv_df_years)

In [None]:
def get_authors_papers(authors_str):
    """Return mean and max paper counts over the authors of one paper.

    ``authors_str`` is split on ', '; with more than ten authors only the
    first ten and the last one are used. Counts come from
    ``authors_papers_dict``; an author missing from it counts as 1.

    :return: ``pd.Series([mean, max])``.
    """
    names = authors_str.split(', ')
    if len(names) > 10:
        # Keep the first ten authors plus the last one.
        names = names[:10] + names[-1:]

    paper_counts = [authors_papers_dict.get(name, 1) for name in names]
    return pd.Series([mean(paper_counts), max(paper_counts)])

In [None]:
def get_authors_citations(authors_str):
    """Return mean and max citation totals over the authors of one paper.

    ``authors_str`` is split on ', '; with more than ten authors only the
    first ten and the last one are used. Totals come from ``authors_dict``;
    an author missing from it counts as 1.

    :return: ``pd.Series([mean, max])``.
    """
    names = authors_str.split(', ')
    if len(names) > 10:
        # Keep the first ten authors plus the last one.
        names = names[:10] + names[-1:]

    citation_totals = [authors_dict.get(name, 1) for name in names]
    return pd.Series([mean(citation_totals), max(citation_totals)])

In [None]:
def count_diversity(row):
    """Return sum(p * log(p)) over the topic weights of one paper.

    The weights are read from the columns 'topic0' .. 'topic{n_topics - 1}'
    of ``row``. Note the sign: this is the negative of the Shannon entropy,
    so the value is <= 0 for weights in (0, 1].

    Zero weights are skipped (the 0 * log(0) term is taken as 0) instead of
    turning the whole result into NaN.
    """
    topic_columns = [f'topic{i}' for i in range(n_topics)]
    weights = np.asarray(row[topic_columns], dtype=float)
    positive = weights[weights > 0]
    return np.dot(positive, np.log(positive))

In [None]:
def count_citations_on_topics(row):
    """Return the dot product of a paper's topic weights with ``topic_cits``.

    The weights are read from the columns 'topic0' .. 'topic{n_topics - 1}'
    of ``row``.
    """
    topic_columns = [f'topic{i}' for i in range(n_topics)]
    weights = list(row[topic_columns])
    return np.dot(weights, topic_cits)

In [None]:
def add_all_authors_features(row):
    """Add mean/min/max of the known authors' features to one paper row.

    Rows with a falsy 'any_detected' are returned unchanged. Otherwise
    ``row['authors']`` is split on ', ' (first ten plus the last author when
    there are more than ten), names are lower-cased and stripped of dots, and
    every author found in ``features_dict`` contributes its feature record.
    For each feature, '<feature>_mean', '<feature>_min' and '<feature>_max'
    are written into ``row``.

    :return: the (possibly extended) row.
    """
    if not row['any_detected']:
        return row

    names = row['authors'].split(', ')
    if len(names) > 10:
        names = names[:10] + names[-1:]
    normalized = [name.lower().replace('.', '') for name in names]

    # Ideally this would look authors up in authors_features.to_dict() rather than
    # in the reduced features_dict, but even this version is already too slow.
    known_records = [features_dict[name] for name in normalized if name in features_dict]
    features_of_paper_authors = pd.DataFrame(known_records)

    for feature in features_of_paper_authors:
        values = features_of_paper_authors[feature]
        row[feature + '_mean'] = values.mean()
        row[feature + '_min'] = values.min()
        row[feature + '_max'] = values.max()
    return row

In [None]:
def preprocess(df2, step=5, current_year=2019):
    """Return a copy of ``df2`` extended with the model features.

    Added columns:
      * 'n_authors', 'recency', 'title_len', 'abstract_len';
      * 'c1' .. 'c{step + 1}': values from ``citations_after_n_years`` for
        windows of 1 .. step + 1 years;
      * 'd1' .. 'd{step}': differences between consecutive 'c' columns;
      * 'journal_papers', 'journal_citations': merged from the module-level
        ``journals`` and ``journals_citations`` tables;
      * 'author_mean_papers', 'author_max_papers', 'author_mean_citations',
        'author_max_citations': from ``get_authors_papers`` and
        ``get_authors_citations``;
      * 'diversity', 'citations_on_topics': from the topic columns.

    :param df2: paper frame; not modified. Needs 'authors', 'year', 'title',
        'abstract', 'journal', the year-keyed columns and the topic columns.
    :param step: number of delta features; step + 1 cumulative features are built.
    :param current_year: reference year used to compute 'recency'.
    :return: new DataFrame. The merges reset its index to 0..n-1.
    """
    df = df2.copy()
    df['n_authors'] = df['authors'].apply(lambda authors: len(authors.split(', ')))
    df['year'] = df['year'].astype(int)
    df['recency'] = current_year - df['year']
    df['title_len'] = df['title'].apply(lambda title: 0 if pd.isnull(title) else len(title))
    df['abstract_len'] = df['abstract'].apply(lambda abstract: 0 if pd.isnull(abstract) else len(abstract))

    # Early citations (cumulative). `n=i` binds the current window size explicitly.
    for i in range(1, step + 2):
        df[f'c{i}'] = df.apply(lambda row, n=i: citations_after_n_years(row, n=n), axis=1)

    # Early citations (year-over-year deltas).
    for i in range(1, step + 1):
        df[f'd{i}'] = df[f'c{i + 1}'] - df[f'c{i}']

    logging.info("Done counting early citations")

    # How many papers were published by the journal that published this paper.
    df = pd.merge(df, journals, on='journal', how='left').rename(columns={'sum':'journal_papers'})
    logging.info("Done counting how many papers were published by the journal")

    # Series.apply with a function returning a Series yields a DataFrame with
    # columns 0 (mean) and 1 (max). Tuple-unpacking that DataFrame iterates over
    # its column LABELS, which used to fill the two feature columns with the
    # constants 0 and 1, so the columns are picked explicitly instead.
    authors_papers = df['authors'].apply(get_authors_papers)
    df['author_mean_papers'] = authors_papers[0]
    df['author_max_papers'] = authors_papers[1]

    logging.info("Done counting how many papers were published by each of the authors")

    # How many citations the journal that published this paper has.
    df = pd.merge(df, journals_citations, on='journal', how='left')

    logging.info("Done counting how cited each journal was")

    # How many citations the authors of this paper got (mean/max); same
    # column-selection fix as for the paper counts above.
    authors_citations = df['authors'].apply(get_authors_citations)
    df['author_mean_citations'] = authors_citations[0]
    df['author_max_citations'] = authors_citations[1]
    logging.info("Done counting how cited each author was")

    # Topic-based features.
    df['diversity'] = df.apply(count_diversity, axis=1)
    df['citations_on_topics'] = df.apply(count_citations_on_topics, axis=1)
    logging.info("Done adding features based on topics")

    return df

In [None]:
# Build the feature frame with the default step (5) and current_year (2019).
preprocessed_arxiv_df_years = preprocess(arxiv_df_years)

Add ~250 features about authors and venues, based on the information from the authors file

In [None]:
# `author_columns` was not defined anywhere in this notebook. It is taken to be the
# feature columns of authors_features, which are the keys add_all_authors_features()
# expands into '<feature>_mean', '<feature>_min' and '<feature>_max'.
author_columns = list(authors_features.columns)
final_author_features = [
    feature + suffix
    for feature in author_columns
    for suffix in ('_mean', '_min', '_max')
]

In [None]:
# Very slow: add_all_authors_features() builds a small DataFrame for every paper row.
# Only the two columns the function reads are passed in; it returns each row
# extended with the '<feature>_mean/_min/_max' entries.
authors_features_to_add = arxiv_df_years[['any_detected', 'authors']]\
                            .apply(lambda row: add_all_authors_features(row), axis=1)

In [None]:
# Display the per-paper author feature frame.
authors_features_to_add

In [None]:
# Column-wise concat aligned on the index; the two helper columns are dropped first
# so they are not duplicated in the result.
# NOTE(review): relies on both frames sharing the same 0..n-1 index - confirm.
preprocessed_arxiv_df_years = pd.concat([authors_features_to_add.drop(columns=['any_detected', 'authors']), 
                                         preprocessed_arxiv_df_years], axis=1)

In [None]:
# Summary statistics of the final feature frame.
preprocessed_arxiv_df_years.describe()

## Models with different features and target

In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error as mse
from math import sqrt
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV

In [None]:
# Module-level value read by predict(): only papers with recency > step + 1 are used.
step = 5

In [None]:
def predict(df, features, target, model=None, dropna=False):
    """Fit ``model`` on older papers, validate on newer ones, print the scores.

    Only papers with ``recency > step + 1`` (``step`` is the module-level
    value) are used. Of those, papers with ``recency > 11`` form the training
    set and the rest the validation set. Features are standardised with a
    scaler fitted on the training set only.

    :param df: feature frame; it is NOT modified. Must contain 'recency',
        every name in ``features`` and ``target``.
    :param features: list of feature column names.
    :param target: name of the target column.
    :param model: estimator with ``fit``/``predict``/``score``; a fresh
        ``LinearRegression`` is created when omitted, so separate calls never
        share (and overwrite) one fitted instance.
    :param dropna: if True, drop rows with a missing value in any used column;
        otherwise missing values in the used columns are replaced by 0.
    :return: the fitted model.
    """
    if model is None:
        model = LinearRegression()

    # Handle missing values on a private selection of the used columns. Doing it
    # in place on the caller's frame meant that one fillna(0) call removed every
    # NaN for all later calls, turning a later dropna=True into a no-op.
    data = df[features + [target]]
    recency = df['recency']
    if dropna:
        complete = data.notna().all(axis=1)
        data, recency = data[complete], recency[complete]
    else:
        data = data.fillna(0)

    # Split on the separately kept recency so 'recency' need not be a feature.
    old_enough = recency > step + 1
    train = data[old_enough & (recency > 11)]
    validate = data[old_enough & (recency <= 11)]

    # The target is the last selected column.
    X = train.iloc[:, :-1]
    y = train.iloc[:, -1]
    X_validate = validate.iloc[:, :-1]
    y_validate = validate.iloc[:, -1]

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    X_validate = scaler.transform(X_validate)

    reg = model.fit(X, y)

    print(f"R^2 train: {reg.score(X, y)} validate: {reg.score(X_validate, y_validate)}")
    print(f"RMSE train: {sqrt(mse(reg.predict(X), y))} validate: {sqrt(mse(reg.predict(X_validate), y_validate))}")

    return reg

In [None]:
def print_top_influencers(df, features, reg, n=10):
    """Print the features with the largest absolute coefficients.

    :param df: frame containing the ``features`` columns (used for the names only).
    :param features: feature names in the order the model was fitted with.
    :param reg: fitted model exposing a 1-D ``coef_``.
    :param n: maximum number of features to print; fewer are printed when the
        model has fewer coefficients (previously that raised IndexError).
    """
    coefficients = np.asarray(reg.coef_)
    # Indices ordered by decreasing absolute coefficient.
    ranking = np.argsort(-np.abs(coefficients))
    names = df[features].columns
    for idx in ranking[:n]:
        print("{:.<060} {:< 010.4e}".format(names[idx], coefficients[idx]))

### 1. Predict c6 given c1-c5

 'citations_on_topics' is a linear combination of the topic distribution (the 'topic_names' columns), so the topic columns themselves are not added as features

In [None]:
# Paper-level features; the raw topic columns are deliberately left out.
basic_features = ['diversity', 'n_authors', 'recency', 'title_len', 'abstract_len', 'citations_on_topics']  # + topic_names
# Features calculated from the journal and author lookup tables.
authors_journals = ['journal_papers','author_mean_papers', 'author_max_papers', 
                      'journal_citations', 'author_mean_citations', 'author_max_citations']

# In-place extension: from here on basic_features already contains authors_journals.
basic_features += authors_journals
features = (basic_features +['c1', 'c2', 'c3', 'c4', 'c5'])
target = 'c6'
reg = predict(preprocessed_arxiv_df_years, features, target)

In [None]:
# Show the features with the largest absolute coefficients for the c6 model.
print_top_influencers(preprocessed_arxiv_df_years, features, reg)

In [None]:
# x = []
# for a, b in zip(list(y_validate), list(reg.predict(X_validate))):
#     if a != 0:
#         x.append(b - a)
# x = temp
# plt.hist(x, bins=50)

### 2. Predict d5 given c1, d1-d4

In [None]:
# basic_features was already extended in place with authors_journals where it is
# defined, so appending authors_journals again would feed each of those columns
# to the model twice (perfectly collinear duplicates that distort the coefficients).
features = basic_features + ['c1', 'd1', 'd2', 'd3', 'd4']
target = 'd5'
reg = predict(preprocessed_arxiv_df_years, features, target)

In [None]:
# Show the features with the largest absolute coefficients for the d5 model.
print_top_influencers(preprocessed_arxiv_df_years, features, reg)

### 3. Predict c_i without any early citations info


In [None]:
for i in range(1, 6):
    print('\n' + f"predict citations after {i} years after paper was published")
    # basic_features already contains authors_journals (it was extended in place
    # where it is defined); adding authors_journals again would duplicate them.
    features = list(basic_features)
    target = f'c{i}'
    reg = predict(preprocessed_arxiv_df_years, features, target)
    print_top_influencers(preprocessed_arxiv_df_years, features, reg)

Or with regularisation (L1 or L2)

In [None]:
# `features` and `target` are whatever the preceding loop left behind (its last iteration: 'c5').
print(f"Lasso regularisation; target {target}")
Ls = predict(preprocessed_arxiv_df_years, features, target, model=LassoCV(cv=5))
print_top_influencers(preprocessed_arxiv_df_years, features, Ls)

In [None]:
# Same features and target as the Lasso cell, with Ridge regression using cv=5.
print(f"Ridge regularisation; target {target}")
Rr = predict(preprocessed_arxiv_df_years, features, target, model=RidgeCV(cv=5))
print_top_influencers(preprocessed_arxiv_df_years, features, Rr)

Or using all the author and venue features extracted from the authors file

In [None]:
# Same targets c1..c5, now with the '<feature>_mean/_min/_max' columns added.
for i in range(1, 6):
    print('\n' + f"predict citations after {i} years after paper was published")
    features = basic_features + final_author_features
    target = f'c{i}'
    # LassoCV is passed positionally as `model`, with a loose tolerance (tol=0.1);
    # dropna=True asks predict() to drop rows with missing values instead of filling them.
    reg = predict(preprocessed_arxiv_df_years, features, target, LassoCV(cv=5, tol=0.1), dropna=True)
    print_top_influencers(preprocessed_arxiv_df_years, features, reg)