#### Notes
Need to run `pip install PyPaperBot` first, if the Python module is not yet installed.

See https://github.com/ferru97/PyPaperBot for more information about the PyPaperBot module.

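This script works in Windows. To run in Linux, change the line `process=subprocess.Popen(["powershell.exe","-Command",command],stdout=subprocess.PIPE)` and replace `powershell.exe` (and its `-Command` argument) with the Linux equivalent, e.g. a shell such as `bash -c`. Note that the command string it runs also calls `python.exe` and uses a Windows-style `.\` path, so these need to be adapted as well.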

#### Instruction

- Create an empty folder and put this Jupyter Notebook in it. Make sure the folder path is not too long (if working on a network drive with very long path: mount the path as net drive first to shorten the path).

- Put the keywords you want to search for in Keywords\_\*.txt files. You can create multiple Keywords\_\*.txt files. The python script will generate all possible combinations of keywords and store them in a text file "Search_Terms.txt"

- Change the variables `download_number_per_keyword` and `year_from` according to your preference.
- Run the "Download papers" section. The script will create a new folder for each search term and store the downloaded papers inside.

#### Get a tor session
This part does not work for me, as Tor is apparently blocked by my ISP. If you can get a Tor proxy, you can add the proxy settings to the PyPaperBot arguments.

In [None]:
import requests
def get_tor_session():
    """Return a ``requests`` session that sends its traffic through a local Tor proxy.

    ``tor_proxy()`` from the third-party ``tor_proxy`` package is called to
    obtain the local proxy port; the session's "http" and "https" proxies are
    then both set to ``socks5://127.0.0.1:<port>``.

    Returns:
        The configured session object created by ``requests.session()``.
    """
    # Imported inside the function, so tor_proxy is only required when a
    # Tor session is actually requested.
    from tor_proxy import tor_proxy
    port=tor_proxy()
    session = requests.session()
    # Tor uses the 9050 port as the default socks port
    # str() keeps this working when the port is returned as an integer
    # (plain "+" concatenation would raise TypeError in that case).
    # NOTE(review): confirm that the port returned by tor_proxy() is a SOCKS
    # port and not an HTTP proxy port; adjust the URL scheme if necessary.
    proxy_url='socks5://127.0.0.1:'+str(port)
    session.proxies = {'http':  proxy_url,
                       'https': proxy_url}
    return session
# Smoke test: open a Tor-routed session and print the response body of
# httpbin's /ip endpoint.
session = get_tor_session()
ip_response = session.get("http://httpbin.org/ip")
print(ip_response.text)

#### Download papers

In [None]:
import os
import itertools
import glob
import pandas
# ---- User settings ----
download_number_per_keyword=80 # Try to download the first so many search results. Each page of Google Scholar contains 10 search results, so 80 will mean downloading all papers shown in the first 8 pages.
year_from=1956 # Limitation of how old the paper can be

# ---- Read the keyword lists ----
# glob returns matches in arbitrary order; sorting makes the order of the
# keywords inside each search term (and therefore the folder names) stable
# from run to run.
KeywordsListFiles=sorted(glob.glob("Keywords_*.txt"))
Keywords=[]
for keyword_file in KeywordsListFiles:
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise become part of the first keyword.
    with open(keyword_file,'r',encoding='utf-8-sig') as f:
        # One keyword per line; blank lines and surrounding whitespace are dropped
        # so that they cannot produce empty or space-padded search terms.
        Keywords.append([line.strip() for line in f.read().splitlines() if line.strip()])

# ---- Build every combination: one keyword from each file ----
# itertools.product() called without any list yields a single empty tuple,
# which would become an empty search term (and an empty folder name) later on.
# With no keyword files there is simply nothing to search for.
if Keywords:
    Keywords_Combinations=list(itertools.product(*Keywords))
else:
    Keywords_Combinations=[]
Keywords_Combinations.reverse() # Make sure the latest added keywords are searched first

# Start with an empty "Search_Terms.txt"; the search terms are appended to it later.
with open("Search_Terms.txt",'w',encoding='utf-8') as f:
    pass
import subprocess
import sys
# ________Get a tor proxy_________
from datetime import datetime
from datetime import timedelta
# from dateutil.relativedelta import relativedelta
# last_time_proxy_change = datetime.now() - relativedelta(years=1) # Put the first time stamp one year ago
import codecs # needed for decoding the output of PyPaperBot piece by piece

# Number of Google Scholar pages (10 results each) to request. Rounded up, and
# at least 1: round() would give 0 pages for fewer than 5 requested results.
scholar_pages=max(1,-(-download_number_per_keyword//10))
for keywordcombi in Keywords_Combinations:
    # if datetime.now() - last_time_proxy_change > timedelta(hours=8):
    #     tor_port=tor_proxy() # Get a new proxy 
    #     last_time_proxy_change=datetime.now()
    combi=" ".join(keywordcombi)
    # Log every search term that is processed.
    with open("Search_Terms.txt",'a+',encoding='utf-8') as f:
        f.write(combi+"\n")
    # A search term counts as done when its folder already exists and its
    # result.csv reports more than half of the requested number as downloaded.
    already_downloaded=False
    try:
        os.mkdir(combi)
    except FileExistsError:
        try:
            Downloaded_Papers=pandas.read_csv(os.path.join(combi,"result.csv"),header=0)
        except (FileNotFoundError,pandas.errors.EmptyDataError):
            # No result.csv (or a zero-byte one): treat the folder as empty and download again.
            print("Folder \""+combi+"\" is created but empty")
        else:
            if Downloaded_Papers["Downloaded"].sum() > download_number_per_keyword * 0.5:
                already_downloaded=True
    # if already_downloaded:
    #     print("There's already some documents downloaded for the title: "+combi)
    if not already_downloaded:
        # NOTE(review): combi is pasted into a PowerShell command line unescaped;
        # keywords containing double quotes or "$" will break the command.
        command="python.exe -m PyPaperBot --query=\""+combi+"\" --scholar-pages="+str(scholar_pages)+"  --min-year="+str(year_from)+" --dwn-dir=\".\\"+combi+"\""
        # command="for($i=1;$i -lt 3; $i++) {Write-Host \"$($i) "+combi+"\"; Start-Sleep -s 1}" # This is a test command to see if the following subprocess call works.
        print("Searching for \""+combi+"\" and downloading...")
        process=subprocess.Popen(["powershell.exe","-Command",command],stdout=subprocess.PIPE)
        # The incremental decoder remembers incomplete multi-byte characters between
        # chunks. Decoding single bytes on their own silently dropped every non-ASCII character.
        decoder=codecs.getincrementaldecoder('utf-8')(errors='ignore')
        # "with" closes the pipe and waits for the process when the output ends.
        with process:
            # read1() returns as soon as some output is available, so the
            # progress is still shown live.
            for chunk in iter(lambda: process.stdout.read1(4096), b''):
                sys.stdout.write(decoder.decode(chunk))
                sys.stdout.flush()
import glob
# Delete re-downloaded copies: every PDF below the current folder whose
# file name starts with "(2)".
Second_Copies=glob.glob('./**/(2)*.pdf',recursive=True)
for f in Second_Copies:
    os.remove(f)
    print("\"",f,"\"removed.",sep=" ")

#### Check repeated files and remove duplicated files

This part will create a file called Duplication.json

If the "Download papers" section was interrupted while downloading and then run again, the already downloaded papers will be downloaded again with "(2)" added to the beginning of the file name. The following script assumes that every PDF file whose name starts with "(2)" is a duplicate and deletes it.

In [None]:
import glob
import os
# Delete re-downloaded copies: every PDF below the current folder whose
# file name starts with "(2)".
Second_Copies=glob.glob('./**/(2)*.pdf',recursive=True)
for f in Second_Copies:
    os.remove(f)
    print("\"",f,"\"removed.",sep=" ")
import pandas
import json
from collections import Counter

def list_pdfs_by_folder(root):
    """Map every folder below *root* that holds PDF files to the names of those PDFs.

    Args:
        root: directory that is walked recursively.

    Returns:
        dict: folder path relative to *root* -> list of PDF file names in it.
        Only files whose name ends in ".pdf" (case-insensitive) are listed, so
        bibtex.bib, result.csv, Thumbs.db and any other file are left out.
        Folders without a PDF are not included.
    """
    pdfs_by_folder={}
    for dirpath, _dirnames, filenames in os.walk(root):
        pdf_names=[name for name in filenames if name.lower().endswith(".pdf")]
        if pdf_names:
            # The relative path equals the plain folder name for folders directly
            # below root, but unlike the base name it stays unique for nested
            # folders and can be used directly as a path to the files.
            pdfs_by_folder[os.path.relpath(dirpath,root)]=pdf_names
    return pdfs_by_folder

def find_repeated_pdfs(pdfs_by_folder):
    """Return the PDF names that occur in more than one folder.

    Args:
        pdfs_by_folder: dict folder -> list of PDF file names.

    Returns:
        dict: PDF file name -> list of all folders containing it, in the order
        of *pdfs_by_folder*. Names found in only one folder are left out.
    """
    folders_by_pdf={}
    for folder, pdf_names in pdfs_by_folder.items():
        for pdf_name in pdf_names:
            folders_by_pdf.setdefault(pdf_name,[]).append(folder)
    return {pdf_name: folders for pdf_name, folders in folders_by_pdf.items() if len(folders) > 1}

Downloaded_PDFs=list_pdfs_by_folder(os.getcwd()) # This is a dictionary
Downloaded_Keywords=Downloaded_PDFs.keys() # this is essentially the same as the variable "combi" above, but may have different sorting. So here another variable. 
Downloaded_PDFs_List=Downloaded_PDFs.values()
Downloaded_PDFs_List_Flat=[
    x
    for xs in Downloaded_PDFs_List
    for x in xs
]
Downloaded_PDF_List_Full_NoDuplicate=list(dict.fromkeys(Downloaded_PDFs_List_Flat))
PDF_Repitition_List=find_repeated_pdfs(Downloaded_PDFs)

# Per-file bookkeeping, kept for anyone inspecting these lists afterwards:
# how often each listed file name occurs, and whether it was seen before.
Occurrences=Counter(Downloaded_PDFs_List_Flat)
PDF_Repitition_Count=[Occurrences[x] for x in Downloaded_PDFs_List_Flat]
PDF_File_Already_Appeared=[]
Seen_PDFs=set()
for x in Downloaded_PDFs_List_Flat:
    PDF_File_Already_Appeared.append(x in Seen_PDFs)
    Seen_PDFs.add(x)
print(str(len(Downloaded_PDFs_List_Flat)),"PDFs downloaded. Unique PDF files:",str(len(Downloaded_PDF_List_Full_NoDuplicate)))
# PDF_Repitition_Table=pandas.DataFrame(dict([(key, pandas.Series(value)) for key, value in PDF_Repitition_List.items()]))
import json
# Store the duplication list: file name -> folders in which the file was found.
with open("Duplication.json","w",encoding="utf-8") as fp:
    json.dump(PDF_Repitition_List,fp)
# Read the list back from the file, so that the deletion below works on
# exactly what is stored in Duplication.json.
with open("Duplication.json",'r',encoding="utf-8") as f:
    PDF_Repitition_List=json.load(f)
for File_Name, Folder_Names in PDF_Repitition_List.items():
    # Only PDF files are ever deleted, whatever else the list may contain.
    if not File_Name.lower().endswith(".pdf"):
        continue
    # Keep the copy in the first folder and delete the copies in all others.
    # The file name is used as stored: cutting it at the first ".pdf" pointed
    # to the wrong file when the name contained ".pdf" before its extension.
    for Folder_Name in Folder_Names[1:]:
        Duplicate_Path=os.path.join(Folder_Name,File_Name)
        try:
            os.remove(Duplicate_Path)
        except FileNotFoundError:
            print('Duplicated PDF file "'+Duplicate_Path+'" was already removed.')
        else:
            print('Duplicated PDF file "'+Duplicate_Path+'" removed.')

#### List papers by DOI

This part will check the `result.csv` file from each subfolder and make a list (pandas DataFrame) of all downloaded papers.

If paper download of one specific search term is interrupted, the download will be incomplete and no `result.csv` or `bibtex.bib` file will be created. The folder will be skipped.

The list (DataFrame) will be stored in `List_Papers.json` and will look like this:
```
{
        "Name": "[Title of the paper]",
        "DOI": "[DOI of the paper]",
        "Year": [Year in int],
        "Journal": "[Journal Name]",
        "Search terms": [
            "List",
            "of",
            "Search",
            "Terms",
            "where",
            "this",
            "paper",
            "appeared"
        ]
}
```

In [2]:
import os.path
import pandas
Columns_in_CSV=['Name','DOI','Year','Journal','PDF Name'] #'Scholar Link','Authors'

def select_downloaded(search_results, search_term):
    """Return the downloaded papers of one ``result.csv`` table.

    Args:
        search_results: DataFrame read from a ``result.csv``. It must have a
            "Downloaded" column.
        search_term: value written to the "Search term" column of every
            returned row.

    Returns:
        A new DataFrame holding only the rows whose "Downloaded" value is
        true, restricted to the columns listed in ``Columns_in_CSV`` (those
        that are present, in their original order) plus "Search term".

    Raises:
        KeyError: if *search_results* has no "Downloaded" column.
    """
    wanted=[column for column in search_results.columns if column in Columns_in_CSV]
    # Comparing with True instead of indexing with the column itself makes a
    # missing value count as "not downloaded" instead of raising an error.
    downloaded=search_results["Downloaded"].eq(True)
    # copy(): the result is modified below and must not be a view of the input.
    selected=search_results.loc[downloaded,wanted].copy()
    selected["Search term"]=search_term
    return selected

def merge_duplicate_papers(papers):
    """Collapse rows that refer to the same PDF file into one row per paper.

    Rows are the same paper when their "PDF Name" is equal. The first such row
    is kept and receives the search terms of all of them. Rows without a PDF
    name are never merged with each other.

    Args:
        papers: DataFrame with at least the columns "PDF Name" and
            "Search term".

    Returns:
        A new DataFrame with missing values replaced by "Unknown", without the
        columns "PDF Name" and "Search term", and with a new column
        "Search terms" holding the list of search terms of each paper.
    """
    papers=papers.reset_index(drop=True)
    # Must be remembered before filling: afterwards all missing names are the
    # same text "Unknown" and would look like copies of one another.
    has_pdf_name=papers["PDF Name"].notna()
    papers=papers.fillna("Unknown")
    first_row_of={}  # PDF name -> position of the first row with this name
    terms_by_row={}  # position of a kept row -> search terms of the paper
    rows=zip(papers["PDF Name"],papers["Search term"],has_pdf_name)
    for position,(pdf_name,search_term,named) in enumerate(rows):
        if named:
            first_row=first_row_of.setdefault(pdf_name,position)
        else:
            first_row=position
        # The terms are collected as a list right away. Joining them with ","
        # and splitting again would cut a term that itself contains a comma.
        terms_by_row.setdefault(first_row,[]).append(search_term)
    unique_papers=papers.iloc[list(terms_by_row)].drop(columns=["PDF Name","Search term"])
    unique_papers["Search terms"]=pandas.Series(list(terms_by_row.values()),index=unique_papers.index,dtype=object)
    return unique_papers.reset_index(drop=True)

# Collect the downloaded papers of every result.csv found below the current folder.
Paper_Tables=[]
for dirpath, dirnames, filenames in os.walk("."):
    if "result.csv" in filenames:
        # The folder path relative to the current folder serves as search term.
        search_term=os.path.relpath(dirpath,".")
        search_results=pandas.read_csv(os.path.join(dirpath,"result.csv"),header=0)
        Paper_Tables.append(select_downloaded(search_results,search_term))

# Join all tables in one step; columns missing from a result.csv stay empty.
All_Columns=Columns_in_CSV+['Search term']
if Paper_Tables:
    List_Papers=pandas.concat(Paper_Tables,ignore_index=True).reindex(columns=All_Columns)
else:
    List_Papers=pandas.DataFrame(columns=All_Columns)
List_Papers=merge_duplicate_papers(List_Papers)
List_Papers.to_json("List_Papers.json",orient="records")
import json
# Load List_Papers.json and write it back with an indentation of 4 spaces,
# so that the file is easy to read.
with open("List_Papers.json",'r',encoding="utf-8") as json_in:
    JSON_Data=json.loads(json_in.read())
JSON_Data_Formatted=json.dumps(JSON_Data,indent=4)
with open("List_Papers.json",'w',encoding="utf-8") as json_out:
    json_out.write(JSON_Data_Formatted)