# Import and define functions

Imports libraries

In [31]:
import requests
import pandas as pd
import re
import os
import string
from bs4 import BeautifulSoup as bs

<b>Function:</b> Extract the download link from the Sci-Hub page using the save button's onclick event

In [32]:
# Defining the necessary functions from sci_hub.py to integrate into our main function

def get_links(target):
    """Extract the download link from the Sci-Hub page using the save button's onclick event."""
    response = requests.get(target)
    soup = bs(response.content, "lxml")
    
    try:
        # Find the button with the download link
        button = soup.find("button", onclick=True)
        onclick_content = button['onclick']
        mirror = re.search(r"location.href='(.*?)'", onclick_content).group(1)
        if mirror.startswith('//'):
            mirror = 'https:' + mirror
    except Exception:
        print("Mirror not found")
        mirror = ""
        
    try:
        doi = soup.title.text.split("|")[-1].strip()
    except Exception:
        print("DOI not found")
        doi = ""
        
    return doi, mirror


<b>Function:</b> Download paper from the mirror URL and save it with the title as filename.

In [33]:
def download_paper(mirror_url, title):
    """
    Download paper from the mirror URL and save it with the title as filename.
    """
    try:
        # Ensure the URL starts with https://
        if not mirror_url.startswith('http'):
            mirror_url = 'https:' + mirror_url
            
        response = requests.get(mirror_url, stream=True)
        # Create a valid filename from the title
        file_name = valid_file_name(title) + ".pdf"
        with open(file_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded {file_name}")
        return file_name
    except Exception as e:
        print(f"Failed to download. Error: {e}")
        return None

<b>Function:</b> Remove any invalid characters from the title to make it a valid filename.

In [34]:
def valid_file_name(title):
    """
    Remove any invalid characters from the title to make it a valid filename.
    """
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    return ''.join(c for c in title if c in valid_chars)


<b>Function:</b> Downloads papers from SciHub based on DOI column in supplied df

In [38]:
# Updated main function to include the renaming process
def download_papers_from_df(df, doi_column, title_column):
    """
    Downloads papers from SciHub based on DOI column in supplied df
    """
    base_url = "https://sci-hub.se/"
    downloaded_files = []
    
    for index, row in df.iterrows():
        # Check if the DOI value is a string
        if isinstance(row[doi_column], str):
            target_url = base_url + row[doi_column]
            doi, mirror = get_links(target_url)
            if mirror:
                file_name = download_paper(mirror, row[title_column])
                if file_name:
                    downloaded_files.append(file_name)
        else:
            print(f"Skipping row {index} due to missing or invalid DOI/Title.")
                
    return downloaded_files

# Imports title and DOI lists from CSV

Import dataframes containing doi and title of papters of interest from CSV

In [36]:
publication_info = pd.read_csv('publication_info.csv')
publication_info.head()

Unnamed: 0,title,authors,doi,pubmed_id,abstract
0,Potassium efflux induced by a new lactoferrin-...,"Viejo-Diaz, M. et al.",10.1023/a:1022657630698,12693969.0,A 31-amino acid synthetic peptide (NH(2)-FFSAS...
1,Isolation and characterization of opioid antag...,"Tani, F. et al.",10.1271/bbb1961.54.1803,1369293.0,Peptides with affinity for opioid receptors we...
2,Isolation and characterization of opioid antag...,"Tani, F. et al.",10.1271/bbb1961.54.1803,1369293.0,Peptides with affinity for opioid receptors we...
3,Isolation and characterization of opioid antag...,"Tani, F. et al.",10.1271/bbb1961.54.1803,1369293.0,Peptides with affinity for opioid receptors we...
4,"Effects of 'casoparan', a peptide isolated fro...","Lebrun, I. et al.",10.1080/09629350400003068,15545057.0,"Casein, a protein found in milk of several spe..."


# Downloads pdfs from SciHub

Downloads the PDFs to your parent directory based on DOI # or Title of the publications.

In [37]:
downloaded_files = download_papers_from_df(publication_info, 'doi', 'title')

Downloaded Potassium efflux induced by a new lactoferrin-derived peptide mimicking the effect of native human lactoferrin on the bacterial cytoplasmic membrane..pdf
Downloaded Isolation and characterization of opioid antagonist peptides derived from human lactoferrin..pdf
Downloaded Isolation and characterization of opioid antagonist peptides derived from human lactoferrin..pdf
Downloaded Isolation and characterization of opioid antagonist peptides derived from human lactoferrin..pdf
Failed to download. Error: Invalid URL 'https:/downloads/2019-01-29//be/lebrun2004.pdf?download=true': No host supplied
Downloaded Isolation and characterization of a new bradykinin potentiating octapeptide from gamma-casein..pdf
Downloaded Biochemical and pharmacological aspects of two bradykinin-potentiating peptides obtained from tryptic hydrolysis of casein..pdf
Downloaded Casocidin-I a casein-alpha s2 derived peptide exhibits antibacterial activity..pdf
Mirror not found
Downloaded Identification of th