In [None]:
import pandas as pd
import numpy as np
import os
import re
import datetime
from datetime import datetime
from datetime import date
import pickle

### Read Metadata

In [None]:
# Load the processed metadata table and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv -- TODO confirm).
metadata = pd.read_csv(r'../data/processed/metadata.csv').drop(columns='Unnamed: 0')
metadata.head(2)

In [None]:
# Keep only the second '/'-separated segment of each paper id and store it
# as an integer.
metadata['paper_id'] = (
    metadata['paper_id']
    .map(lambda pid: pid.split('/')[1])
    .astype('int')
)

### Submitter Related Features

In [None]:
# Lower-case the submitter text and strip ':', '%' and '-' characters.
metadata['submitter'] = (
    metadata['submitter']
    .str.lower()
    .str.replace(r'[:%-]', '', regex=True)
)

def find_email(text):
    """Return every e-mail-like substring of ``text`` joined by commas.

    ``text`` is converted with ``str()`` first, so non-string input (for
    example ``None`` or NaN) is accepted. The local part of the pattern
    accepts word characters, '.', '+' and spaces, so words separated from
    the address only by spaces are captured along with it. Returns an empty
    string when nothing matches.
    """
    pattern = re.compile(r'[\w.+-+ ]+@[\w.-]*')
    return ",".join(hit.group(0) for hit in pattern.finditer(str(text)))

# Extract e-mail-like substrings from each submitter entry, then remove the
# spaces that the extraction pattern lets through.
metadata['submitter_email'] = (
    metadata['submitter']
    .apply(find_email)
    .str.replace(r' ', '', regex=True)
)

In [None]:
# The submitter name is what remains after deleting the e-mail address and
# any bracket / quote characters around it.
metadata['submitter_name'] = (
    metadata['submitter']
    .str.replace(r'[\w.+-+%]+@[\w-]+\.[\w.-]+', "", regex=True)
    .str.replace(r'[\<([{})>\]"]', "", regex=True)
)

In [None]:
# Fill missing e-mail / name values with "", then turn every blank or
# whitespace-only cell in the whole frame into NaN.
metadata = (
    metadata
    .fillna({'submitter_email': "", 'submitter_name': ""})
    .replace(r'^\s*$', np.nan, regex=True)
)

# Prefer the e-mail; fall back to the name where the e-mail is missing.
metadata['submitter_details'] = metadata['submitter_email'].where(
    metadata['submitter_email'].notna(),
    metadata['submitter_name'],
)

In [None]:
# Preview the first five rows after the submitter features were added.
metadata.head(5)

### Submission Date Related Features

In [None]:
# Lower-case the submission text so the 'revised' marker can be matched.
metadata['submission_date'] = metadata['submission_date'].str.lower()
# 1 when the text mentions 'revised' at least once, otherwise 0.
metadata['is_revised'] = metadata['submission_date'].map(
    lambda text: int("revised" in text)
)
# Number of occurrences of 'revised' in the text.
metadata['times_revised'] = metadata['submission_date'].str.count('revised')

In [None]:
# Take the text that precedes the first 'date (revised)' marker, then pull
# out a "<day> <month-name> <year> hh:mm:ss" style timestamp from it.
metadata['first_submission_datetime'] = (
    metadata['submission_date']
    .map(lambda text: text.partition('date (revised)')[0])
    .str.extract(r'((?:\d{,2}\s)?(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*(?:-|\.|\s|,)\s?\d{1,4}(?:-|\.|\s|,)\s?(?:\d{2}:\d{2}:\d{2}))')
)

In [None]:
# Date-only patterns: a month-name form and a numeric d/m/y (or d-m-y) form.
_TEXT_DATE_PATTERN = r'((?:\d{,2}\s)?(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*(?:-|\.|\s|,)\s?\d{1,4})'
_NUMERIC_DATE_PATTERN = r'((?:\d{1,2})(?:(?:\/|-)\d{1,2})(?:(?:\/|-)\d{2,4}))'

one = metadata['first_submission_datetime'].str.extract(_TEXT_DATE_PATTERN)
two = metadata['first_submission_datetime'].str.extract(_NUMERIC_DATE_PATTERN)

# Use the month-name match when present, otherwise the numeric match.
final = one.fillna(two)
metadata['first_submission_date'] = final

In [None]:
# Parse the extracted date strings row by row.
#
# Each row is parsed on its own so that one unparseable value does not
# affect the others. Rows whose values cannot be parsed keep their original
# contents and have their position recorded in ``diff_date_types``.
#
# The results are collected in lists and written back with one whole-column
# assignment per column. Chained indexing (``metadata[col][i] = ...``) is
# avoided because it raises SettingWithCopyWarning and does not modify the
# frame when pandas copy-on-write is active.
diff_date_types = []
_parsed_datetimes = []
_parsed_dates = []
for i, (_raw_datetime, _raw_date) in enumerate(
    zip(metadata['first_submission_datetime'], metadata['first_submission_date'])
):
    # Start from the raw values so a failure leaves the row unchanged.
    _row_datetime, _row_date = _raw_datetime, _raw_date
    try:
        _row_datetime = pd.to_datetime(_raw_datetime)
        # .date() raises ValueError for NaT, so rows with no extracted date
        # are recorded in diff_date_types as well.
        _row_date = pd.to_datetime(_raw_date).date()
    except (ValueError, TypeError, OverflowError):
        diff_date_types.append(i)
    _parsed_datetimes.append(_row_datetime)
    _parsed_dates.append(_row_date)

# dtype=object keeps the mix of Timestamp / date / unparsed values as-is.
metadata['first_submission_datetime'] = pd.Series(
    _parsed_datetimes, index=metadata.index, dtype=object
)
metadata['first_submission_date'] = pd.Series(
    _parsed_dates, index=metadata.index, dtype=object
)


### Comment Related Features

In [None]:
# Lower-case the comments so the keyword patterns below match regardless of case.
metadata['comments'] = metadata['comments'].str.lower()

def page_num_extract(line):
    """Return the total page count mentioned in a comment string.

    ``line`` is converted with ``str()``. Every fragment that looks like a
    page count ("10 pages", "12pp.", "7 pp", ...) is located, the digits in
    each fragment are read as an integer, and the integers are summed.

    Returns:
        int: sum of all page numbers found, or
        float: ``np.nan`` when no fragment with digits is present.

    NOTE(review): the last two alternatives (``pp\\. \\d*`` and
    ``pp\\.\\d*``) can never win, because ``\\d*pp\\.`` earlier in the
    alternation already matches a bare "pp." at the same position. Numbers
    written after "pp." are therefore not counted. The order is kept as in
    the original pattern; confirm whether that is intended.
    """
    # "\d+ pp" (not "\d pp") so that multi-digit counts such as "12 pp"
    # are read in full rather than as their last digit.
    page_pattern = (
        r"(\d* pages)|(\d* pp\.)|(\d*pp\.)|(\d*p\.)|(\d+pp)|(\d+ pp)"
        r"|(\d*pages)|(pp\. \d*)|(pp\.\d*)"
    )

    counts = []
    for groups in re.findall(page_pattern, str(line)):
        # Only one group per match is non-empty; collect its digit runs.
        counts.extend(int(digits) for digits in re.findall(r"\d+", "".join(groups)))

    if not counts:
        return np.nan
    return sum(counts)

# Page count per paper, derived from the free-text comments.
metadata['num_of_pages'] = metadata['comments'].apply(page_num_extract)

In [None]:
# Create the 'format' column with an empty-string placeholder.
metadata["format"] = ""
def find_format(text):
    """Return the typesetting-format keywords found in ``text``.

    ``text`` is converted with ``str()``; every occurrence of one of the
    known keywords is collected in order of appearance and the hits are
    joined with commas. Returns an empty string when none is present.
    """
    keyword_hits = re.findall("latex|harvmac|revtex|plaintex|plain tex|levtex", str(text))
    return ",".join(keyword_hits)
# Format keywords mentioned in each paper's comments.
metadata['format'] = metadata['comments'].apply(find_format)

In [None]:
# Preview two rows after the comment-derived features were added.
metadata.head(n=2)

### Author Related Features

In [None]:
# Normalise the author list:
#   1. drop non-nested parenthesised fragments,
#   2. drop every character other than letters, digits, space, '.', ',' and '-',
#   3. turn the connective " and" into a comma separator.
# The trailing \b in step 3 restricts the match to the whole word "and";
# without it a lower-case name that starts with "and" (e.g. " andrade")
# would be rewritten to ",rade".
metadata['authors'] = (
    metadata['authors']
    .str.replace(r"\([^()]*\)", "", regex=True)
    .str.replace(r"[^ a-zA-Z0-9.,-]+", '', regex=True)
    .str.replace(r' and\b', ",", regex=True)
)

### Journal Related Features

In [None]:
# Shortest leading span of a reference that ends with a parenthesised run
# of digits, e.g. "... (1994)".
_JOURNAL_REF_PREFIX = re.compile(r'[\w\d\W]*?\(\d*\)')


def _journal_ref_prefix(ref):
    """Return ``ref`` cut after its first "(digits)" group.

    Non-string values (such as NaN) and strings without such a group are
    returned unchanged.
    """
    if not isinstance(ref, str):
        return ref
    found = _JOURNAL_REF_PREFIX.search(ref)
    return found.group(0) if found else ref


# Whole-column assignment instead of ``metadata['journal_ref'][i] = ...``:
# chained indexing raises SettingWithCopyWarning and does not modify the
# frame when pandas copy-on-write is active.
metadata['journal_ref'] = metadata['journal_ref'].map(_journal_ref_prefix)
  

In [None]:
# For every row, the number of rows that share its journal_ref value.
journal_groups = metadata.groupby('journal_ref')['journal_ref']
metadata['journal_counts'] = journal_groups.transform('count')
metadata.head(5)

In [None]:
# Persist the engineered features; the frame's index is written as well
# (index=True is the to_csv default, spelled out here).
metadata.to_csv(r'../data/processed/metadata_features.csv', index=True)