# Research paper clustering challenge

## Data processing

Import the required libraries, locate the dataset files, and count how many papers contain the word "introduction".

In [50]:
from pypdf import PdfReader
from pdfminer.high_level import extract_text
import pymupdf
import spacy
import os
import glob
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stop-word corpus (nltk reports it as up-to-date when it is
# already installed).
nltk.download('stopwords')

# Every file in the dataset folder. glob returns matches in arbitrary,
# OS-dependent order, so sort them to keep paper indices stable across runs.
paper_paths = sorted(glob.glob(os.path.join('dataset', '*')))
# Single sample paper, built the same way as the glob pattern above.
paper_path = os.path.join('dataset', 'science.ade1499.pdf')

def counting_intros(paths):
    """Count the PDFs in which at least one page mentions "introduction".

    Each page's text is lower-cased and stripped of spaces before the search,
    so letter-spaced headings such as "I N T R O D U C T I O N" also match.

    Args:
        paths: iterable of PDF file paths readable by ``PdfReader``.

    Returns:
        int: number of files with at least one matching page.
    """
    def _mentions_intro(pdf_path):
        # any() stops at the first matching page, so a file counts at most once.
        return any(
            "introduction" in page.extract_text().lower().replace(" ", "")
            for page in PdfReader(pdf_path).pages
        )

    return sum(1 for pdf_path in paths if _mentions_intro(pdf_path))

# Report the corpus size first, then how many papers mention an introduction.
n_papers = len(paper_paths)
print(n_papers)
n_with_intro = counting_intros(paper_paths)
print(n_with_intro)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/henryng101/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


79
76


In [51]:
# Trying PyPdf
reader = PdfReader(paper_path)
first_page = reader.pages[0]
# Lower-case the first page's text, split it on newlines, keep at most 50 lines.
text = first_page.extract_text().lower().split('\n')
text = text[:50]

# Wrap every line in double quotes so empty lines remain visible in the output.
for line in text:
    print(f'"{line}"')

"electrochemistry"
"la- and mn-doped cobalt spinel oxygen evolution"
"catalyst for proton exchange membrane electrolysis"
"lina chong1, guoping gao2, jianguo wen3, haixia li2, haiping xu1, zach green4, joshua d. sugar5,"
"a. jeremy kropf1, wenqian xu6, xiao-min lin3, hui xu4, lin-wang wang2, di-jia liu1,7*"
"discovery of earth-abundant electrocatalysts to replace iridium for the oxygen evolution reaction (oer)"
"in a proton exchange membrane water electrolyzer (pemwe) represents a critical step in reducing the"
"cost for green hydrogen production. we report a nanofibrous cobalt spinel catalyst codoped with"
"lanthanum (la) and manganese (mn) prepared from a zeolitic imidazolate framework embedded in"
"electrospun polymer fiber. the catalyst demonstrated a low overpotential of 353 millivolts at 10"
"milliamperes per square centimeter and a low degradation for oer over 360 hours in acidic electrolyte."
"a pemwe containing this catalyst at the anode demonstrated a current density of 2000 

In [52]:
# Trying PdfMiner
# extract_text reads the whole document; keep at most its first 100 lines.
text = extract_text(paper_path).lower().split('\n')
text = text[:100]

# Wrap every line in double quotes so empty lines remain visible in the output.
for line in text:
    print(f'"{line}"')

"res earch"
""
"electrochemistry"
""
"la- and mn-doped cobalt spinel oxygen evolution"
"catalyst for proton exchange membrane electrolysis"
""
"lina chong1, guoping gao2, jianguo wen3, haixia li2, haiping xu1, zach green4, joshua d. sugar5,"
"a. jeremy kropf1, wenqian xu6, xiao-min lin3, hui xu4, lin-wang wang2, di-jia liu1,7*"
""
"discovery of earth-abundant electrocatalysts to replace iridium for the oxygen evolution reaction (oer)"
"in a proton exchange membrane water electrolyzer (pemwe) represents a critical step in reducing the"
"cost for green hydrogen production. we report a nanofibrous cobalt spinel catalyst codoped with"
"lanthanum (la) and manganese (mn) prepared from a zeolitic imidazolate framework embedded in"
"electrospun polymer fiber. the catalyst demonstrated a low overpotential of 353 millivolts at 10"
"milliamperes per square centimeter and a low degradation for oer over 360 hours in acidic electrolyte."
"a pemwe containing this catalyst at the anode demonstrated a 

In [53]:
# Trying PyMuPDF
# Open the document in a context manager so it is closed as soon as the
# first page's text has been read (the original never closed it).
with pymupdf.open(paper_path) as doc:
    first_page = doc.load_page(0).get_text()
# Lower-case the page text, split it on newlines, keep at most 50 lines.
text = first_page.lower().split('\n')[:50]

# Wrap every line in double quotes so empty lines remain visible in the output.
for line in text:
    print("\"" + line + "\"")

"electrochemistry"
"la- and mn-doped cobalt spinel oxygen evolution"
"catalyst for proton exchange membrane electrolysis"
"lina chong1, guoping gao2, jianguo wen3, haixia li2, haiping xu1, zach green4, joshua d. sugar5,"
"a. jeremy kropf1, wenqian xu6, xiao-min lin3, hui xu4, lin-wang wang2, di-jia liu1,7*"
"discovery of earth-abundant electrocatalysts to replace iridium for the oxygen evolution reaction (oer)"
"in a proton exchange membrane water electrolyzer (pemwe) represents a critical step in reducing the"
"cost for green hydrogen production. we report a nanofibrous cobalt spinel catalyst codoped with"
"lanthanum (la) and manganese (mn) prepared from a zeolitic imidazolate framework embedded in"
"electrospun polymer fiber. the catalyst demonstrated a low overpotential of 353 millivolts at 10"
"milliamperes per square centimeter and a low degradation for oer over 360 hours in acidic electrolyte."
"a pemwe containing this catalyst at the anode demonstrated a current density of 2000 

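After testing the three libraries (PyPDF, PyMuPDF, and PDFMiner), I decided that PDFMiner is the best fit for this purpose: its output keeps the blank lines between paragraphs, which the abstract extraction below uses as paragraph boundaries.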

In [62]:
def _get_nlp():
    """Return the shared spaCy pipeline, loading it on first use only."""
    # Loading a spaCy model is expensive; cache it on the function object so
    # extract_abstract does not reload it for every paper.
    if not hasattr(_get_nlp, "_pipeline"):
        # Disable unnecessary components for faster processing
        _get_nlp._pipeline = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return _get_nlp._pipeline


def extract_abstract(path, min_tokens=100, max_tokens=500):
    """Return the lemmas of the first abstract-sized paragraph of a PDF.

    The PDF text is extracted with PDFMiner, lower-cased and split into
    paragraphs at empty lines. The first paragraph whose spaCy token count
    lies strictly between ``min_tokens`` and ``max_tokens`` is taken as the
    abstract.

    Args:
        path: path of the PDF file to read.
        min_tokens: exclusive lower bound on the paragraph's token count.
        max_tokens: exclusive upper bound on the paragraph's token count.

    Returns:
        list[str]: lemmas of the selected paragraph's non-stop-word tokens,
        or an empty list when no paragraph falls inside the bounds.

    Side effects:
        Prints the number of lemmas returned.
    """
    lines = extract_text(path).lower().split('\n')   # Extract content using PDFMiner
    nlp = _get_nlp()

    abstract_tokens = []   # stays empty when no paragraph qualifies
    paragraph = ""

    # The extra '' acts as a final paragraph break, so text that is not
    # followed by a blank line is still examined.
    for line in lines + ['']:
        if len(line) != 0:
            paragraph += line + " "
            continue
        if not paragraph:
            continue   # consecutive blank lines: nothing to tokenize

        candidate = nlp(paragraph)
        paragraph = ""
        if min_tokens < len(candidate) < max_tokens:
            abstract_tokens = candidate
            break

    lemmas = [token.lemma_ for token in abstract_tokens if not token.is_stop]
    print(len(lemmas))
    return lemmas

# One lemma list per paper, in the same order as paper_paths.
papers_tokens = [extract_abstract(path) for path in paper_paths]

160
106
260


KeyboardInterrupt: 