#  PDF to CSV pipeline

This notebook lets you experiment with and test the individual functions when the usual Python pipeline does not work, or when you want to change something about the CSV output (for example, add columns).
This pipeline assumes you have the following dependencies installed:
- Python 3
- Java 8+

And the following python libraries :
- pandas
- tabula-py
- pdfplumber

You can install them by executing the following cell:

In [None]:
#pip install pdfplumber tabula-py pandas

The pipeline proceeds to extract tables and element names in the following order : 
- [Title extraction](#title-extraction) with pdfplumber
- Table extraction and associations with tabula
- Additional metadata parsing (ex: Downgrades, name of the elements) using pandas

## Initial setup

In [None]:
import pdfplumber
import pandas as pd
import tabula
import re
import logging

The following parameters are used throughout the program to decide whether a table is valid, and to build the header.

In [None]:
# Tables narrower than this are discarded by CleanNonElementsTable.
minColumns = 13

# Grade-of-execution column labels: "-5" .. "-1", "BASE", "1" .. "5".
GOE = [str(grade) for grade in range(-5, 0)]
GOE.append("BASE")
GOE.extend(str(grade) for grade in range(1, 6))

# Column names applied to every full-width table by SetColumnName.
Header = ["Levels", "ElmtNot", "AFNot", *GOE]

# Number of columns a table must have before it receives `Header`.
nbrCol = 14

# Title Extraction



This class wraps `pdfplumber` and manages access to the pages of the PDF.

In [None]:
class PDFLoader:
    """Wrapper around a PDF document opened with ``pdfplumber``.

    Page numbers passed to the methods are 1-based. The loader can be used
    as a context manager, in which case the PDF is closed on exit.
    """

    def __init__(self, filename):
        """Open *filename* with pdfplumber.

        Raises:
            RuntimeError: if the file cannot be opened. The underlying
                exception is chained as ``__cause__``.
        """
        try:
            self.pdf = pdfplumber.open(filename)
        except Exception as e:
            raise RuntimeError(f"Unable to open PDF {filename}: {e}") from e
        self.filename = filename

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _page(self, pagenumber):
        """Return the page object for the 1-based *pagenumber*.

        Raises:
            IndexError: if the page does not exist. Numbers below 1 are
                rejected explicitly, because ``pages[pagenumber - 1]`` would
                otherwise wrap around and return a page counted from the end.
        """
        if pagenumber < 1:
            raise IndexError(f"page number {pagenumber} is out of range")
        return self.pdf.pages[pagenumber - 1]

    def get_page_lines(self, pagenumber):
        """Return the text lines of one page, or None if they cannot be read.

        Failures are logged with ``logging.error`` instead of being raised.
        """
        try:
            page = self._page(pagenumber)
            return page.extract_text_lines(return_chars=False)
        except IndexError:
            logging.error(f"Page {pagenumber} does not exist")
            return None
        except Exception as e:
            logging.error(f"Error on page {pagenumber}: {e}")
            return None

    def get_all_pages_lines(self, beginpage, endpage):
        """Return the text lines of pages *beginpage*..*endpage* (inclusive).

        Unlike ``get_page_lines`` this method does not swallow errors.

        Raises:
            IndexError: if any page in the range does not exist.
        """
        lines = []
        for pagenumber in range(beginpage, endpage + 1):
            page = self._page(pagenumber)
            lines.extend(page.extract_text_lines(return_chars=False))
        return lines

    def close(self):
        """Close the underlying pdfplumber document."""
        self.pdf.close()

We define the `Element` class to store the element information. It will be used in the next part.

In [None]:
class Element:
    """Record describing one element title.

    Attributes are stored exactly as given:
    ``Category`` (group the element belongs to), ``Element`` (element name)
    and ``Symbol`` (short symbol of the element).
    """

    def __init__(self, Category, Element, Symbol):
        # Note: the `Element` parameter shadows the class name inside this
        # method only; it is simply the element's name.
        self.Category, self.Element, self.Symbol = Category, Element, Symbol

In [None]:
# Heading matcher. A matching line yields three groups:
# (category, sub-element, symbol); the last two are None when absent.
# The sub-element capture is greedy, so it keeps any blank that precedes "(".
pattern = re.compile(
    r"""
    ^(?:\d+\.\s*)?              # optional Number (ex: "1. ")
    ([A-Z ]+?)                  # Main category (ex: "ARTISTIC ELEMENT")
    (?:\s*-\s*([A-Za-z ]+))?    # sub-element after "-" (ex: "Artistic Block")
    (?:\s*\(([A-Za-z]+)\))?     # symbol between parenthesis (ex: "AB")
    $""",
    flags=re.X,
)

def FindElementName(lines):
    """Build the list of elements announced by heading lines.

    Each item of *lines* must be a mapping with a ``"text"`` entry. The text
    is stripped and matched against the module-level ``pattern``:

    * a heading without a symbol sets the current group;
    * a heading with a sub-element and a symbol creates an element in the
      current group (or in its own category when no group was seen yet);
    * a heading with only a category and a symbol creates an element whose
      category and name are both that category.

    Returns:
        list of ``Element`` objects, in the order the headings appear.
    """
    ListeElem = []
    Groupe = ""
    for line in lines:
        m = pattern.match(line["text"].strip())
        if not m:
            continue

        cat, sub, sym = m.groups()
        if not sym:
            logging.debug(f"Group pattern Found : {cat}")
            Groupe = cat
            continue

        # The sub-element capture is greedy and swallows the blank that
        # precedes "(" (e.g. "Artistic Block "); drop that padding.
        if sub:
            sub = sub.strip()

        if not sub:
            # Placeholders added: passing an argument without "%s" makes the
            # logging module report a formatting error instead of the message.
            logging.debug("Simple element found %s", cat)
            ListeElem.append(Element(cat, cat, sym))
        elif Groupe:
            logging.debug("\t Sub Pattern found : %s", sub)
            ListeElem.append(Element(Groupe, sub, sym))
        else:
            logging.debug("\t Sub patttern whitout group announced : %s", sub)
            ListeElem.append(Element(cat, sub, sym))
    return ListeElem

# Table extraction and association

In [None]:
def return_all_tables(filename,beginpage,endpage):
    """Extract the tables of pages *beginpage*..*endpage* (inclusive).

    The file is read with ``tabula.read_pdf`` using no header row,
    ``guess=True`` and a fixed list of column boundary x-coordinates.

    Returns:
        whatever ``tabula.read_pdf`` returns; the rest of this file treats
        it as a list of DataFrames.
    """
    page_numbers = list(range(beginpage, endpage + 1))
    column_edges = [250,450,640,830,940,1050,1160,1270,1380,1490,1600,1710,1820,1930,2040]
    return tabula.read_pdf(
        filename,
        pages=page_numbers,
        pandas_options={"header":None},
        guess=True,
        columns=column_edges,
    )

In [None]:
def CleanNonElementsTable(pagedf, min_columns=None):
    """Remove, in place, every table that is too narrow to be an element table.

    Args:
        pagedf: list of DataFrames; it is mutated, nothing is returned.
        min_columns: minimum number of columns a table must have to be kept.
            Defaults to the module-level ``minColumns``.
    """
    if min_columns is None:
        min_columns = minColumns

    logging.info("Verifing if all df are the right size")
    i = 0
    while i < len(pagedf):
        if pagedf[i].shape[1] < min_columns:
            pagedf.pop(i)
            logging.debug(f"table number {i} removed")
            logging.debug(f"Element number {i+1} and after are falling in {i}")
            # Do not advance: the next table has just moved into slot i.
        else:
            i += 1
    logging.info("Verification done")

In [None]:
def TitleAsManyTable(dfs,TitleList):
    """Tell whether there are exactly as many tables as titles.

    Returns True when ``len(dfs) == len(TitleList)``; otherwise logs two
    warnings with both counts and returns False.
    """
    table_count, title_count = len(dfs), len(TitleList)
    if table_count == title_count:
        return True

    logging.warning("Error, not as many title as tables")
    logging.warning(f"{table_count} dataframes while  having {title_count} Titles")
    return False

In [None]:
def CleanNaNLines(dfs):
    """Drop the rows that are entirely NaN from each table of *dfs*.

    The list is updated in place: a table containing such rows is replaced
    by a copy without them (the row index is not reset); other tables are
    left untouched.
    """
    for position, table in enumerate(dfs):
        has_empty_row = table.isna().all(axis=1).any()
        if not has_empty_row:
            logging.debug(f"no nan found in number {position} dataframe")
            continue
        logging.debug(f"nan line found  in number {position} dataframe ")
        dfs[position] = table.dropna(how="all")

In [None]:
def SetColumns(dfs,Columns=nbrCol):
    """Pad, in place, every table that has fewer than *Columns* columns.

    A single all-NA column named ``'new'`` is inserted at position 2 of each
    such table; tables that are already wide enough are left alone.
    """
    for table in dfs:
        if table.shape[1] >= Columns:
            continue
        # Position 2 is the slot of the Additional Feature column.
        table.insert(loc=2, column='new', value=pd.NA)
        logging.debug(f"one table size {table.shape[1]} resized to {Columns}")

In [None]:
def SetColumnName(dfs):
    """Rename, in place, the columns of every table to ``Header``.

    Only tables with exactly ``nbrCol`` columns are renamed; for any other
    table a warning is logged and the table keeps its current column names.
    """
    for table in dfs:
        # The number of columns may vary from one table to another.
        if table.shape[1] == nbrCol:
            table.columns = Header
            continue
        logging.warning("One dataframe isn't the right size, return")

In [None]:
def TableAsso(dfs,ListElem):
    """Attach to each table the element found at the same position.

    For the i-th table, three constant columns are inserted in place from
    ``ListElem[i]``: ``"Element"`` (its symbol) at position 2, then
    ``"ElmtName"`` and ``"Category"`` at the front.

    Raises:
        IndexError: if *ListElem* is shorter than *dfs*.
    """
    for position, table in enumerate(dfs):
        elem = ListElem[position]
        table.insert(loc=2, column="Element", value=elem.Symbol)
        table.insert(loc=0, column="ElmtName", value=elem.Element)
        table.insert(loc=0, column="Category", value=elem.Category)

We only verify the first row of each element dataframe. This way, even if the element has downgrades, they will never appear in the first row, which is assumed to be `Element Symbol` + `Element Level`.  
We compare `ElmtNot`, which comes from the table extracted by tabula, with `Element`, which comes from the text extraction.

In [None]:
def VerifyAsso(dfs):
    """Check that every table was associated with the right element.

    For each table, the first row's ``ElmtNot`` without its last character
    must equal that row's ``Element``. Every mismatch (and every empty
    table) is logged as an error.

    Returns:
        True when all tables pass the check, False otherwise.
    """
    Associated = True
    for position, df in enumerate(dfs):
        if df.empty:
            logging.error(f"Table number {position} is empty, association cannot be verified")
            Associated = False
            continue

        # Positional access: the row labelled 0 may have been dropped earlier
        # (dropping all-NaN rows does not reset the index).
        first_row = df.iloc[0]
        # The first row is always the element at level B, so stripping the
        # last character of the notation leaves the bare symbol.
        notation_symbol = str(first_row["ElmtNot"])[:-1]
        expected_symbol = first_row["Element"]
        if notation_symbol != expected_symbol:
            logging.error(
                f"Something whent wrong with association : {notation_symbol} is not {expected_symbol}"
            )
            Associated = False

    # Report success only when no mismatch was found.
    if Associated:
        logging.info("Association went right")
    return Associated

# Adding and completing dataframe Info

In [None]:
def AddDowngrades(df):
    """Add, in place, an integer ``DGrade`` column to *df*.

    ``DGrade`` is the number of ``"<"`` characters found in the string form
    of ``ElmtNot``.
    """
    notation = df["ElmtNot"].astype(str)
    marker_count = notation.str.count("<")
    df["DGrade"] = marker_count.fillna(0).astype(int)
    logging.info("Downgrades column added")

In [None]:
def LevelComplete(df):
    """Fill, in place, the missing values of the ``Levels`` column.

    Missing entries are forward-filled from the previous row. A warning is
    logged when some entries are still missing afterwards (nothing to fill
    from at the top of the column).
    """
    logging.info("Verifying if all Levels are There")

    if not df["Levels"].isna().any():
        logging.info("Levels There")
        return

    logging.info("NaN entries in Levels found")
    df["Levels"] = df["Levels"].ffill()

    if df["Levels"].isna().any():
        logging.warning("Something went wrong with completion")
    else:
        logging.info("Completion done")


Separate the element level from `ElmtNot`. This part uses the `DGrade` column, because the downgrade markers (`<`) are appended at the end of `ElmtNot`.  
Ex : `ElmtNot`: `ME3<<` --> `ElmntLvl` : `3`

In [None]:

def ExtractElementLvl(df):
    """Return the level character of one row's element notation.

    Args:
        df: a single row (anything indexable by ``"ElmtNot"`` and
            ``"DGrade"``), despite the parameter name.

    Returns:
        The character that precedes the last ``DGrade`` characters of
        ``ElmtNot`` (the last character when ``DGrade`` is 0), e.g. ``"3"``
        for ``"ME3<<"`` with ``DGrade == 2``. ``pd.NA`` is returned when
        ``ElmtNot`` is not a string (missing value), so that a missing
        notation does not abort the whole ``apply``.
    """
    s = df["ElmtNot"]
    if not isinstance(s, str):
        return pd.NA

    n = df["DGrade"]
    if n == 0:
        # s[-1:-0] would be empty, hence the dedicated branch.
        return s[-1:]
    return s[-n - 1:-n]

def ElementLvl(df):
    """Add, in place, the ``ElmntLvl`` column to *df*.

    The value of each row is computed by ``ExtractElementLvl`` applied to
    that row.
    """
    levels = df.apply(ExtractElementLvl, axis=1)
    df["ElmntLvl"] = levels
    logging.info("Element Level Added")

Creates columns analogous to "Element" and "ElmntLvl" for the Additional Feature.   

Ex : `pi3 --> "AddFeat"=pi,"AFLvl"=3`

In [None]:
def ExtractFeat(val):
    """Split an additional-feature notation into ``(feature, level)``.

    The level is the last character and the feature is everything before
    it (``"pi3"`` gives ``("pi", "3")``). A missing value or ``"-"`` is
    returned unchanged with ``pd.NA`` as level.
    """
    no_feature = pd.isna(val) or val == "-"
    return (val, pd.NA) if no_feature else (val[:-1], val[-1:])

def AddFeat(df):
    """Add, in place, the ``AddFeat`` and ``AFLvl`` columns to *df*.

    Each ``AFNot`` value is split by ``ExtractFeat``; the resulting pairs
    are expanded into the two new columns.
    """
    pairs = df["AFNot"].apply(ExtractFeat)
    expanded = pairs.apply(pd.Series)
    df[["AddFeat", "AFLvl"]] = expanded
    logging.info("Additional Feature Added")

# Dataframe Build
This part is where all the functions are put together into a single one. It is also a way to test the pipeline.

In [None]:
def all_pages_into_df(pdfLoader: PDFLoader, beginpage, endpage):
    """Run the extraction pipeline on pages *beginpage*..*endpage* (inclusive).

    The page text is read through *pdfLoader* to find the element titles,
    the tables are read from ``pdfLoader.filename`` with tabula, narrow
    tables are discarded and, when there are as many tables as titles, the
    tables are cleaned, given their column names and associated with their
    element.

    Note: *pdfLoader* is closed by this function, whether or not the text
    extraction succeeds.

    Returns:
        The list of prepared DataFrames, or None when the number of tables
        does not match the number of titles.
    """
    # Close the PDF even if text extraction raises, so the handle never leaks.
    try:
        lines = pdfLoader.get_all_pages_lines(beginpage, endpage)
    finally:
        pdfLoader.close()

    ListElem = FindElementName(lines)
    dfs = return_all_tables(pdfLoader.filename, beginpage, endpage)

    CleanNonElementsTable(dfs)
    if not TitleAsManyTable(dfs, ListElem):
        logging.warning("Not as Many Element as tables")
        return None

    CleanNaNLines(dfs)
    SetColumns(dfs)
    SetColumnName(dfs)
    TableAsso(dfs, ListElem)
    return dfs