In [2]:
import requests
from bs4 import BeautifulSoup

In [50]:
from googlesearch import search, get_random_user_agent
from difflib import SequenceMatcher
from tqdm import tqdm
import time

def similarity(a, b):
    """Return difflib's similarity ratio (between 0.0 and 1.0) of ``a`` and ``b``."""
    return SequenceMatcher(a=a, b=b).ratio()


def parse_url(url, title, timeout=30):
    """Try to turn a paper landing-page URL into a direct PDF link.

    Handlers are tried in order: direct ``.pdf`` links, NeurIPS proceedings,
    Sci-Hub (only for DOI / IEEE Xplore URLs), CVF Open Access, arXiv abstract
    pages and AAAI pages. Handlers that scrape a landing page accept it only
    when the scraped title is similar enough to ``title``.

    Args:
        url: Candidate URL (publisher link or search result).
        title: Expected paper title; compared case-insensitively.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the whole run.

    Returns:
        The PDF URL as a string, or ``None`` when no handler produced one.
        Network and parsing errors are printed and swallowed, followed by a
        5-second pause.
    """
    # A URL whose last path segment already names a PDF needs no resolution.
    if ".pdf" in url.split('/')[-1]:
        return url

    # NeurIPS: drop the 13-character suffix (presumably "Abstract.html") and
    # switch the "hash" path component to "file".
    if "proceedings.neurips.cc" in url and url.endswith(".html"):
        return url[:-13].replace('hash', 'file') + "Paper.pdf"

    # Scraped titles are lower-cased below, so normalise the expected title the
    # same way; otherwise capitalisation alone lowers the similarity score.
    wanted_title = title.strip().lower()

    # Each substring must be tested against `url` separately: a bare
    # `"doi.org" or ...` is always truthy and would send every URL to Sci-Hub.
    if "doi.org" in url or 'ieeexplore.ieee.org' in url:
        try:
            thepage = requests.get("https://sci-hub.ee/" + url, timeout=timeout)
            soup = BeautifulSoup(thepage.text, "html.parser")
            # The PDF is embedded either in an element with id="pdf" or in a
            # Chrome PDF-viewer element.
            viewer = soup.find(id='pdf')
            if viewer is None:
                viewer = soup.find(type="application/x-google-chrome-pdf")
            pdf_link = viewer.get("src") if viewer is not None else None
            if not pdf_link:
                raise LookupError("no embedded PDF found on the Sci-Hub page")
            if "http" not in pdf_link:
                # Scheme-less link: prepend the scheme.
                return "https:" + pdf_link
            return pdf_link
        except Exception as e:
            print("Sci-Hub Exception:", e)
            print(url)
            time.sleep(5)

    if "openaccess.thecvf.com" in url and url.endswith(".html"):
        try:
            soup = BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
            searched_title = soup.find(id="papertitle").text.strip().lower()
            if similarity(wanted_title, searched_title) > 0.6:
                return "https://openaccess.thecvf.com/" + soup.find('a', string='pdf').get('href').replace("../", "")
            else:
                print(f"OPENACCESS NOT MATCHED: {title} -- {searched_title}")
        except Exception as e:
            print("OPENACCESS Exception:", e)
            time.sleep(5)

    if 'arxiv.org/abs' in url:
        try:
            soup = BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
            # Drop the first whitespace-separated token of the page <title>
            # before comparing (presumably the "[arXiv id]" prefix).
            searched_title = ' '.join(soup.title.text.lower().split()[1:])
            if similarity(wanted_title, searched_title) > 0.8:
                return url.replace('abs', 'pdf') + '.pdf'
            else:
                print(f"ARXIV NOT MATCHED: {title} -- {searched_title}")
        except Exception as e:
            print("ARXIV Exception:", e)
            time.sleep(5)

    if 'aaai.org' in url or 'index.php/AAAI' in url:
        try:
            try:
                soup = BeautifulSoup(requests.get(url, timeout=timeout).text, "html.parser")
                searched_title = soup.find(class_="page_title").text.strip().lower()
                if similarity(wanted_title, searched_title) > 0.6:
                    return soup.find(class_="obj_galley_link pdf").get("href")
                else:
                    print(f"AAAI NOT MATCHED: {title} -- {searched_title}")
            except AttributeError:
                # The expected elements were missing: retry with the
                # "viewPaper" page variant. Failures here now reach the outer
                # handler instead of escaping the function.
                thepage = requests.get(url.replace('view', 'viewPaper'), timeout=timeout)
                soup = BeautifulSoup(thepage.text, "html.parser")
                searched_title = soup.find(id="title").text.strip().lower()
                if similarity(wanted_title, searched_title) > 0.6:
                    print("AAAI: find it again")
                    return soup.find("meta", attrs={'name': 'citation_pdf_url'}).get("content")
                else:
                    print(f"AAAI NOT MATCHED: {title} -- {searched_title}")
        except Exception as e:
            print("AAAI Exception:", e)
            time.sleep(5)

    return None


def search_pdf_link(title, conf, url):
    """Resolve a PDF link for a paper, falling back to a Google search.

    The given ``url`` is tried first via ``parse_url``. If that yields nothing,
    the top three results for "<title> <conf> pdf" are tried in turn.

    Returns the first truthy link found; otherwise prints a notice and returns
    the last (falsy) value produced by ``parse_url``.
    """
    # Fixed pause before every lookup.
    time.sleep(1)
    agent = get_random_user_agent()

    resolved = parse_url(url, title)
    if not resolved:
        query = ' '.join((title, conf, "pdf"))
        for candidate in search(query, num=3, stop=3, pause=2.0, user_agent=agent):
            resolved = parse_url(candidate, title)
            if resolved:
                break
        else:
            # Loop ended without a usable result.
            print(f"NOT SEARCHED: {title}")
            print(conf, url)
    return resolved

In [7]:
import fitz

# Singapore universities (NTU, NUS, SUTD, SMU, SIT, SUSS) mapped to the
# strings that are searched for on a paper's first page.
sg_universities = {
    'NTU': [
        "Nanyang Technological University",
        "ntu.edu.sg",
        "NTU",
    ],
    'NUS': [
        "National University of Singapore",
        "nus.edu.sg",
        "NUS",
    ],
    'SUTD': [
        "Singapore University of Technology and Design",
        "sutd.edu.sg",
        "SUTD",
    ],
    'SMU': [
        "Singapore Management University",
        "smu.edu.sg",
        "SMU",
    ],
    'SIT': [
        "Singapore Institute of Technology",
        "SingaporeTech.edu.sg",
        "singaporetech.edu.sg",
    ],
    'SUSS': [
        "Singapore University of Social Sciences",
        "suss.edu.sg",
        "SUSS",
    ],
}

# Flat list of every keyword, in dictionary order.
search_list = [kw for kws in sg_universities.values() for kw in kws]

def get_key(dict, value):
    """Return every key of ``dict`` whose associated container holds ``value``."""
    matching_keys = []
    for key, members in dict.items():
        if value in members:
            matching_keys.append(key)
    return matching_keys

#path = 'https://arxiv.org/pdf/1807.01440.pdf'
def find_sg_university(path, timeout=60):
    """Download a PDF and look for a Singapore university on its first page.

    Args:
        path: URL of the PDF to download.
        timeout: Seconds passed to ``requests.get`` so a stalled download
            cannot hang the run.

    Returns:
        The ``sg_universities`` key (e.g. ``'NTU'``) of the first keyword found
        on page 0, or ``None`` when no keyword matches.

    Raises:
        RuntimeError: if the PDF could not be downloaded/opened after three
            attempts. (Previously this surfaced as an accidental ``NameError``
            on ``doc``; callers that catch ``Exception`` behave the same.)
    """
    import os  # local import: only needed for the on-disk fallback below

    print("path:", path)
    doc = None
    last_error = None
    for _ in range(3):
        data = None
        try:
            data = requests.get(path, timeout=timeout).content
            doc = fitz.open(stream=data, filetype="pdf")
            break
        except RuntimeError as e:
            if data is None:
                # Nothing was downloaded, so there is nothing to dump to disk.
                last_error = e
                print(repr(e), "the pdf can not be parsed.")
                continue
            # Opening from memory failed: write the bytes out and retry from
            # disk. An error here propagates to the caller, as before.
            os.makedirs("pdfs", exist_ok=True)
            with open("pdfs/outlier.pdf", 'wb') as f:
                f.write(data)
            doc = fitz.open("pdfs/outlier.pdf")
            break
        except Exception as e:
            last_error = e
            print(repr(e), "the pdf can not be parsed.")

    if doc is None:
        raise RuntimeError(f"could not download or open PDF: {path}") from last_error

    try:
        # snake_case API, consistent with search_for / get_textbox below.
        page = doc.load_page(0)
        for word in search_list:
            hit = ''
            # One rectangle per occurrence; the texts are accumulated and the
            # running string is checked against the keyword list.
            for rect in page.search_for(word):
                hit_item = page.get_textbox(rect)
                print("hit_item:", hit_item)
                hit += hit_item
                if hit in search_list:
                    print("hit:", hit)
                    uni = get_key(sg_universities, hit)[0]
                    print("uni:", uni)
                    return uni
        return None
    finally:
        # Release the document whether or not a match was found.
        doc.close()

In [53]:
import csv
# Search topic; interpolated into the results file name f'sg_{keyword}_papers.csv'.
keyword = 'person re-id'
# Column layout of an input row.
# NOTE(review): `header` is rebound to the first row of outliers.csv when the
# processing code below runs, so this list is effectively documentation only.
header = ['conference', 'year', 'author', 'title', 'booktitle', 'bibtex', 'electronic_edition']

def outliers_length():
    """Count the rows in ``outliers.csv`` (header included), print and return the count."""
    with open("outliers.csv", 'r') as handle:
        row_count = sum(1 for _ in csv.reader(handle))
    print("outliers_length:", row_count)
    return row_count

# One retry pass over outliers.csv:
#   * rows whose PDF mentions a Singapore university are appended to the results CSV;
#   * rows for which no PDF link was found, or whose processing raised, are
#     staged in outliers_bak.csv;
#   * rows whose PDF was found but matched no university are written nowhere.
# Afterwards outliers_bak.csv is copied back over outliers.csv.
# Disabled outer loop that would repeat the pass until no outliers remain:
# while(outliers_length()>0):
with open(f'sg_{keyword}_papers.csv', 'a+', encoding='UTF8', newline='') as f1, open('outliers.csv', 'r', encoding='UTF8', newline='') as f2, open('outliers_bak.csv', 'w', encoding='UTF8', newline='') as f3:
    reader = csv.reader(f2)
    # The first row is the header; it is carried over to the staging file.
    # NOTE(review): next() raises StopIteration if outliers.csv is empty.
    header = next(reader)
    writer1 = csv.writer(f1)
    writer2 = csv.writer(f3)
    writer2.writerow(header)
    for paper in reader:
        # Each row must have exactly seven columns, otherwise this unpacking raises
        # outside the try block below.
        conf, year, author, title, booktitle, biburl, ee_link = paper
        try:
            arxiv_pdf_link = search_pdf_link(title, conf, ee_link)
            if arxiv_pdf_link: 
                uni = find_sg_university(arxiv_pdf_link)
                if uni:
                    # Author, title and booktitle are concatenated without a separator.
                    info = author + title + booktitle 
                    writer1.writerow([uni, year, conf] + [info] + [biburl, ee_link])
            else:
                # No PDF link could be resolved: keep the row for a later pass.
                writer2.writerow(paper)
        except Exception as e:
            # Any failure keeps the row in the outlier list.
            print("search pdf failed:", e)
            writer2.writerow(paper)
# Replace outliers.csv with the staged rows (header plus still-unresolved papers).
with open('outliers.csv', 'w', encoding='UTF8', newline='') as f1, open('outliers_bak.csv', 'r', encoding='UTF8', newline='') as f2:
    reader = csv.reader(f2)
    writer = csv.writer(f1)
    for row in reader:
        writer.writerow(row)
# Pause that belonged to the disabled while-loop above:
#     time.sleep(600)

path: https://sci-hub.se/downloads/2021-06-16/c8/peng2020.pdf#navpanes=0&view=FitH
path: https://sci-hub.se/downloads/2021-06-10/cf/gao2020.pdf#navpanes=0&view=FitH


In [49]:
# Scratch check: fetch an AAAI OJS "article/view" URL and try to open the raw
# response bytes as a PDF with PyMuPDF.
data = requests.get("https://ojs.aaai.org/index.php/AAAI/article/view/3762/3640").content
# Disabled alternative: parse the response as HTML and read the
# citation_pdf_url meta tag instead.
# soup = BeautifulSoup(data, 'html.parser')
# soup.find("meta", attrs= {'name': 'citation_pdf_url'}).get("content")
doc = fitz.open(stream=data, filetype="pdf")

In [6]:
# Scratch check: request a DOI through the sci-hub.ee mirror and inspect what
# BeautifulSoup returns for the Chrome PDF-viewer element. The recorded output
# is NoneType, i.e. no such element was found on that page.
thepage = requests.get("https://sci-hub.ee/https://doi.org/10.1609/aaai.v33i01.33018738")
soup = BeautifulSoup(thepage.text, "html.parser")
type(soup.find(type="application/x-google-chrome-pdf"))

NoneType

In [108]:
from googlesearch import search
# Scratch check: print the first three Google results (co.id TLD, random user
# agent) for one paper title. `get_random_user_agent` is not imported in this
# cell; it comes from the earlier `from googlesearch import ...` cell.
for j in search("SHaPE: A Novel Graph Theoretic Algorithm for Making Consensus-Based Decisions in Person Re-identification Systems", tld='co.id', num=3, stop=3, pause=2.0, user_agent=get_random_user_agent()):
    print(j)

https://ieeexplore.ieee.org/document/8237389
https://www.researchgate.net/publication/322060407_SHaPE_A_Novel_Graph_Theoretic_Algorithm_for_Making_Consensus-Based_Decisions_in_Person_Re-identification_Systems
https://www.semanticscholar.org/paper/SHaPE%3A-A-Novel-Graph-Theoretic-Algorithm-for-Making-Barman-Shah/18ab9be9af94f2bf4d3828161ffb232d1462526a
