In [15]:
import ast
import os
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests, json
from tqdm import tqdm

## Statistics count of scraped papers

In [None]:
# Load the scraped paper table. The first CSV column is discarded
# (by position, whatever its label is) before anything else is done.
allpapers = pd.read_csv('all_words.csv')
allpapers = allpapers.drop(columns=allpapers.columns[0])
print('N of papers in total:', len(allpapers))
allpapers.head(3)

In [None]:
# Histogram of citationCount, restricted to rows flagged as open access.
# Only the 0-700 range is binned and shown.

filtered_access = allpapers.loc[allpapers['isOpenAccess'].eq(True)]

fig, ax = plt.subplots()
ax.hist(
    filtered_access['citationCount'],
    bins=50,
    range=(0, 700),
    color='skyblue',
    edgecolor='black',
)
ax.set_title('Distribution of citationCount')
ax.set_xlabel('citationCount')
ax.set_ylabel('Frequency')
ax.set_xlim(0, 700)
plt.show()

In [None]:
# Among papers cited more than 50 times, how many have an accessible PDF?
# (author's note: 101k)
filtered_count = allpapers.loc[allpapers['citationCount'].gt(50)]
filtered_count['isOpenAccess'].value_counts()

## Preparing data for PDF versions of papers

In [None]:
# filtered_df: papers with more than 50 citations that are flagged open access
# AND actually carry a PDF entry (author's note: ~90k rows).
#
# Both conditions are evaluated on `filtered_count` itself, so the boolean mask
# has exactly the same index as the frame it selects from (no implicit
# re-alignment against the larger `allpapers` table).
has_open_pdf = filtered_count['isOpenAccess'].eq(True) & filtered_count['openAccessPdf'].notna()

# .copy() makes this an independent frame, so columns can be added to it later
# without pandas warning about writing to a slice of `allpapers`.
filtered_df = filtered_count.loc[has_open_pdf].copy()
print('N paper with citations >50 and url access:', len(filtered_df))

In [None]:
# `openAccessPdf` holds the text form of a dict; its 'url' entry is the PDF link.
# SECURITY: the original used eval() on this CSV text, which would execute any
# expression found in the file. ast.literal_eval only accepts plain literals
# (dicts, strings, numbers, None, ...), which is all that is needed here.
#
# assign() returns a new frame, so nothing is written into a slice of another
# DataFrame. `downloaded` starts as False for every paper.
filtered_df = filtered_df.assign(
    pdf_url=filtered_df['openAccessPdf'].apply(lambda raw: ast.literal_eval(raw)['url']),
    downloaded=False,
)

# The row index is written as the first CSV column (pandas default).
filtered_df.to_csv('papers_filtered_citation_access.csv')

In [None]:
# Quick sanity check: show the first ten extracted PDF links.
filtered_df['pdf_url'].values[:10]

In [None]:
# List the column labels of the filtered table.
filtered_df.columns

## Downloading PDFs

In [None]:
# Read the filtered paper table back from disk and discard its first column
# (selected by position), then show how many rows were loaded.
df = pd.read_csv('papers_filtered_citation_access.csv')
df = df.drop(columns=df.columns[0])
len(df)

In [None]:
def split_fields(raw):
    """Convert the text form of a field list into a real list of names.

    A string such as "['Biology', 'Medicine']" becomes ['Biology', 'Medicine'].
    Non-string values (for example NaN, or a list that was already parsed when
    this cell is run a second time) are returned unchanged.
    Empty names are dropped, so the text "[]" yields [] instead of [''].
    """
    if not isinstance(raw, str):
        return raw
    names = raw.strip("[]").replace("'", "").split(", ")
    return [name for name in names if name]


df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(split_fields)

# Collect every distinct field name that occurs in any row.
unique_fields = set()

for field_entry in df['fieldsOfStudy']:
    if isinstance(field_entry, list):
        unique_fields.update(field_entry)

# Sorted, because plain set iteration order for strings can change between
# kernel sessions; a fixed order keeps everything derived from it reproducible.
unique_fields_list = sorted(unique_fields)

print('How many unique fields are there:', len(unique_fields_list))
unique_fields_list

In [None]:
# Show the first ten entries of the fieldsOfStudy column.
df['fieldsOfStudy'].values[:10]

## Creating table w/ papers for manual evaluation and extraction

In [None]:
# Number of most-cited papers kept for each field of study.
TOP_N_PER_FIELD = 700

papers_manual_list = []

for field in unique_fields_list:
    # Rows whose field list contains the current field of study.
    in_field = df['fieldsOfStudy'].apply(lambda x: isinstance(x, list) and field in x)

    # Most-cited papers of this field. A dedicated name is used so the
    # notebook-level `filtered_df` built in the earlier section is not overwritten.
    field_top = (
        df[in_field]
        .sort_values(by='citationCount', ascending=False)
        .head(TOP_N_PER_FIELD)
    )
    papers_manual_list.append(field_top)

# A paper listed under several fields appears once per field in the result.
# ignore_index=True renumbers the rows 0..n-1. With no fields at all,
# an empty frame with the same columns is produced instead of an error.
if papers_manual_list:
    papers_manual = pd.concat(papers_manual_list, ignore_index=True)
else:
    papers_manual = df.iloc[0:0].copy()

In [None]:
# Save the per-field selection for manual evaluation. With pandas' default
# settings the row index is written as the first CSV column.
papers_manual.to_csv('papers_manual.csv')

## Scraping all the PDFs

In [None]:
# Start again from the saved filtered table: read it, discard the first
# column (selected by position), and report the number of rows.
df = pd.read_csv('papers_filtered_citation_access.csv')
df = df.drop(columns=df.columns[0])
len(df)

In [None]:
def read_id_file(path):
    """Return the lower-cased, whitespace-stripped ids stored one per line in `path`.

    Blank lines are skipped. If the file does not exist, a notice is printed
    and an empty list is returned, so a first run without any id files works.
    """
    if not os.path.exists(path):
        print(f"ID file {path} not found; treating it as empty.")
        return []
    with open(path, "r") as file:
        return [line.strip().lower() for line in file if line.strip()]


# Ids of papers that must not be downloaded again: the two id files combined,
# manual ids first.
scraped_ids = read_id_file("manual_ids.txt") + read_id_file("downloaded_ids.txt")

len(scraped_ids)

In [22]:
# Flag every paper whose id is already in `scraped_ids`. One vectorised isin()
# call replaces the row-by-row loop, which rescanned the id list for each row.
# NOTE(review): the ids in `scraped_ids` are lower-cased when they are loaded,
# while `paperId` is compared as-is — confirm paperId is always lower-case.
df.loc[df['paperId'].isin(scraped_ids), 'downloaded'] = True

In [None]:
# Folder that receives the downloaded PDF files. exist_ok=True makes the call
# a no-op when the folder is already there, without a separate existence check.
download_dir = 'downloaded_pdfs'
os.makedirs(download_dir, exist_ok=True)

In [None]:
# Browser-like User-Agent sent with every request
# (presumably because some hosts reject the default client string — TODO confirm).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Set copy of the already-scraped ids, so the per-row membership test does not
# rescan a list on every iteration.
already_scraped = set(scraped_ids)

# Download each PDF that has not been scraped yet and record the outcome in
# df['downloaded'] (True only when the file was actually written).
for index, row in tqdm(df[['pdf_url', 'paperId']].iterrows(), total=len(df), desc="Downloading PDFs"):
    url = row['pdf_url']
    paper_id = row['paperId']

    if paper_id in already_scraped:
        continue

    try:
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code == 200:
            # One file per paper, named after its id.
            filename = os.path.join(download_dir, f"paper_{paper_id}.pdf")
            with open(filename, 'wb') as f:
                f.write(response.content)
            df.at[index, 'downloaded'] = True
        else:
            # Any other HTTP status is a failed download. The original stored
            # bool('Error'), which is True, and so marked failures as downloaded.
            df.at[index, 'downloaded'] = False
    except requests.Timeout:
        print(f"Request to {url} timed out.")
    except Exception as e:
        # Best effort: report the problem and move on to the next paper.
        # There is deliberately no `finally: continue` here: it would discard
        # every in-flight exception, including KeyboardInterrupt, making the
        # loop impossible to stop.
        print(f"An error occurred on row {index} with URL {url}: {e}")