In [10]:
import os 
import sys
import pandas as pd
from typing import List
from beautifultable import BeautifulTable
import camelot
from dotenv import load_dotenv

# Load environment variables from a .env file
load_dotenv()
pdf_csv_path = os.getenv("pdf_csv_path")
base_url = os.getenv("base_url")


In [11]:
from ctypes.util import find_library
find_library("gs")

'libgs.so.10'

In [12]:
# use camelot to parse tables   
def get_tables(path: str, pages: List[int]):    
    for page in pages:
        table_list = camelot.read_pdf(path, pages=str(page))
        if table_list.n>0:
            for tab in range(table_list.n):
                
                # Conversion of the the tables into the dataframes.
                table_df = table_list[tab].df 
                
                table_df = (
                    table_df.rename(columns=table_df.iloc[0])
                    .drop(table_df.index[0])
                    .reset_index(drop=True)
                )        
                     
                table_df = table_df.apply(lambda x: x.str.replace('\n',''))
                
                # Change column names to be valid as XML tags
                table_df.columns = [col.replace('\n', ' ').replace(' ', '') for col in table_df.columns]
                table_df.columns = [col.replace('(', '').replace(')', '') for col in table_df.columns]
    
    return table_df
# extract data table from page number
df = get_tables(pdf_csv_path, pages=[3])

In [13]:
# prepare test set
eval_df = pd.DataFrame(columns=["Data Format", "Data raw"]) # , "Question", "Answer"

# Save the data in JSON format
data_json = df.to_json(orient='records')
eval_df.loc[len(eval_df)] = ["JSON", data_json]

# Save the data as a list of dictionaries
data_list_dict = df.to_dict(orient='records')
eval_df.loc[len(eval_df)] = ["DICT", data_list_dict]

# Save the data in CSV format
csv_data = df.to_csv(index=False)
eval_df.loc[len(eval_df)] = ["CSV", csv_data]

# Save the data in tab-separated format
tsv_data = df.to_csv(index=False, sep='\t')
eval_df.loc[len(eval_df)] = ["TSV (tab-separated)", tsv_data]

# Save the data in HTML format
html_data = df.to_html(index=False)
eval_df.loc[len(eval_df)] = ["HTML", html_data]

# Save the data in LaTeX format
latex_data = df.to_latex(index=False)
eval_df.loc[len(eval_df)] = ["LaTeX", latex_data]

# Save the data in Markdown format
markdown_data = df.to_markdown(index=False)
eval_df.loc[len(eval_df)] = ["Markdown", markdown_data]

# Save the data as a string
string_data = df.to_string(index=False)
eval_df.loc[len(eval_df)] = ["STRING", string_data]

# Save the data as a NumPy array
numpy_data = df.to_numpy()
eval_df.loc[len(eval_df)] = ["NumPy", numpy_data]

# Save the data in XML format
xml_data = df.to_xml(index=False)
eval_df.loc[len(eval_df)] = ["XML", xml_data]

In [14]:
from pandas import option_context
with option_context('display.max_colwidth', 150):
    display(eval_df.head(10))

Unnamed: 0,Data Format,Data raw
0,JSON,"[{""No."":""1"",""Name"":""Bernard Arnault &family"",""NetworthUSD"":""$233 billion"",""Age"":""75"",""Nationality"":""France"",""Primarysourcesofwealth"":""LVMH""},{""No...."
1,DICT,"[{'No.': '1', 'Name': 'Bernard Arnault &family', 'NetworthUSD': '$233 billion', 'Age': '75', 'Nationality': 'France', 'Primarysourcesofwealth': 'L..."
2,CSV,"No.,Name,NetworthUSD,Age,Nationality,Primarysourcesofwealth\n1,Bernard Arnault &family,$233 billion,75,France,LVMH\n2,Elon Musk,$195 billion,52,So..."
3,TSV (tab-separated),No.\tName\tNetworthUSD\tAge\tNationality\tPrimarysourcesofwealth\n1\tBernard Arnault &family\t$233 billion\t75\tFrance\tLVMH\n2\tElon Musk\t$195 b...
4,HTML,"<table border=""1"" class=""dataframe"">\n <thead>\n <tr style=""text-align: right;"">\n <th>No.</th>\n <th>Name</th>\n <th>NetworthU..."
5,LaTeX,\begin{tabular}{llllll}\n\toprule\nNo. & Name & NetworthUSD & Age & Nationality & Primarysourcesofwealth \\\n\midrule\n1 & Bernard Arnault &family...
6,Markdown,| No. | Name | NetworthUSD | Age | Nationality | Primarysourcesofwealth |\n|------:|:-------------...
7,STRING,No. Name NetworthUSD Age Nationality Primarysourcesofwealth\n 1 Bernard Arnault &family $233 billion 75...
8,NumPy,"[[1, Bernard Arnault &family, $233 billion, 75, France, LVMH], [2, Elon Musk, $195 billion, 52, South Africa Canada UnitedStates, Tesla, SpaceX], ..."
9,XML,<?xml version='1.0' encoding='utf-8'?>\n<data>\n <row>\n <No.>1</No.>\n <Name>Bernard Arnault &amp;family</Name>\n <NetworthUSD>$233 bil...


In [15]:
MESSAGE_SYSTEM_CONTENT = """You are a customer service agent that helps a customer with answering questions. 
Please answer the question based on the provided context below. 
Make sure not to make any changes to the context, if possible, when preparing answers to provide accurate responses. 
If the answer cannot be found in context, just politely say that you do not know, do not try to make up an answer."""

In [16]:
from openai import OpenAI

# client = AzureOpenAI(
#     api_key=OAI_API_Key, 
#     api_version=OAI_API_Version, 
#     azure_endpoint=OAI_API_Base)
client = OpenAI(base_url=base_url, api_key="lm-studio")

def response_test(question:str, context:str, model:str = "gpt-4"):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": MESSAGE_SYSTEM_CONTENT,
            },
            {"role": "user", "content": question},
            {"role": "assistant", "content": context},
        ],
    )
    
    return response.choices[0].message.content

In [17]:
def run_question_test(query: str, eval_df:str):

    questions = []
    answers = []

    for index, row in eval_df.iterrows():
        questions.append(query)
        response = response_test(query, str(row['Data raw']))
        answers.append(response)
        
    eval_df['Question'] = questions
    eval_df['Answer'] = answers
    
    return eval_df

def BeautifulTableformat(query:str, results:pd.DataFrame, MaxWidth:int = 250):
    table = BeautifulTable(maxwidth=MaxWidth, default_alignment=BeautifulTable.ALIGN_LEFT)
    table.columns.header = ["Data Format", "Query", "Answer"]
    for index, row in results.iterrows():
        table.rows.append([row['Data Format'], query, row['Answer']])
    
    return table

In [18]:
query = "What's the Elon Musk's net worth?"
result_df1 = run_question_test(query, eval_df.copy())
table = BeautifulTableformat(query, result_df1, 150)
print(table)

ValueError: Unsupported Literal '\t' in string '11\tSergey Brin\t$113 billion\t49\tRussia/ UnitedStates Google'