# Dummy pipeline for classifying citations

In [1]:
## Imports
import pandas as pd
import numpy as np
from collections import Counter
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Immutables
file = 'unarXive_sample/context_centered_sample/citation_context_sample.csv'
punctuations = list(string.punctuation)
keywords = ['CIT', 'MAINCIT', 'REF', 'FORMULA']
filtr = punctuations + keywords

## Functions

In [4]:
def readCitations(file, ret='all'):
    ''' Open csv with citations, choose what to return:
            - a dict with all citation contexts for every cited paper
            - the corresponding arxiv and mag ids
            - the csv as dataframe
            - all three of the above
        Based on MAG ID since arXiv ID seems to be less complete
    '''
    columns = ['cited_paper_mag_id',
               'adjacent_citations_mag_ids',
               'citing_paper_mag_id',
               'cited_paper_arxiv_id',
               'adjacent_citations_arxiv_ids',
               'citing_paper_arxiv_id', 
               'citation_context']
    
    cit = pd.read_csv(file, sep='\u241E', encoding='utf-8', engine='python', names=columns)
    
    keys = list(set(cit['cited_paper_mag_id']))
    context_dict = {k:[] for k in keys}
    mag2arx_dict = {k:None for k in keys}

    for i in range(len(cit)):
        # Main vars -- Later add adjacent citations?
        mag = cit['cited_paper_mag_id'][i]
        arx = cit['cited_paper_arxiv_id'][i]
        citcon = cit['citation_context'][i]

        # Add context to dictionary
        temp_list = context_dict[mag]
        temp_list.append(citcon)
        context_dict[mag] = temp_list

        # Add arXiv ID if available
        mag2arx_dict[mag] = arx
    
    if ret == 'cit':
        return cit
    elif ret == 'con':
        return context_dict
    elif ret == 'm2a':
        return mag2arx_dict
    else:
        return cit, context_dict, mag2arx_dict

## Create bag of words model with CountVectorizer ##
def bowVector(vocab, corpus):
    vectorizer = CountVectorizer(vocabulary=vocab)
    X = vectorizer.fit_transform(corpus)
    wordvec = X.toarray()
    vec_vocab = vectorizer.get_feature_names_out()
    
    return X, wordvec, vec_vocab

## Data

In [5]:
cit, context_dict, mag2arx_dict = readCitations(file, 'all')

In [10]:
## Filter out punctuation and weird words
citcon_ftd = [[word.lower() for word in nltk.word_tokenize(ct) if word not in filtr] for ct in cit['citation_context']]
## Corpus with all sentences
citcon_sent = [' '.join(citcon) for citcon in citcon_ftd] 

# Filtered bag with all words
# bag = [word.lower() for word in nltk.word_tokenize(ct) for ct in citcon_sent if word not in filtr]
# for ct in cit['citation_context']:
#     bag += [word.lower() for word in nltk.word_tokenize(ct) if word not in filtr]
# bag_counter = Counter(bag)

## Create subset of data
sub_citcon = citcon_sent[0:2]
sub_bag = set([word.lower() for word in nltk.word_tokenize(ct) if word not in filtr for ct in sub_citcon])


NameError: name 'ct' is not defined

In [7]:

# X, wordvec, vec_vocab = bowVector(set(bag), citcon_sent)

In [None]:
X
# [print(w, w in bag) for w in list(scikit_words)]
# [print(w, w in scikit_words) for w in bag]
# checkbag = []
# for s in citcon_sent[0:5]:
#     words = nltk.word_tokenize(s)
#     for w in words:
#         if w not in checkbag:
#             checkbag.append(w)

In [None]:
list(scikit_words)

In [None]:
columns = ['cited_paper_mag_id','adjacent_citations_mag_ids','citing_paper_mag_id','cited_paper_arxiv_id','adjacent_citations_arxiv_ids','citing_paper_arxiv_id', 'citation_context']
sample = pd.read_csv('unarXive_sample/context_centered_sample/citation_context_sample.csv', sep='\u241E', encoding='utf-8', engine='python', names=columns)

cited_papers = [(sample['cited_paper_arxiv_id'][i], sample['cited_paper_mag_id'][i]) for i in range(len(sample))]

## Dictionary creation

keys = list(set(sample['cited_paper_mag_id']))
context_dict = {k:[] for k in keys}
mag2arx_dict = {k:None for k in keys}

for i in range(len(sample)):
    # Main vars -- Later add adjacent citations?
    mag = sample['cited_paper_mag_id'][i]
    arx = sample['cited_paper_arxiv_id'][i]
    citcon = sample['citation_context'][i]
    
    # Add context to dictionary
    temp_list = context_dict[mag]
    temp_list.append(citcon)
    context_dict[mag] = temp_list
    
    # Add arXiv ID if available
    mag2arx_dict[mag] = arx

In [None]:
# [print(k, v) for k, v in context_dict.items()]