In [12]:

import sys, os, re, argparse
import unicodedata
from codecs import open
from math import isnan, isinf
from abc import ABCMeta, abstractmethod


################ String Normalization ################

def normalize(x):
    if not isinstance(x, str):
        x = x.decode('utf8', errors='ignore')
    # Remove diacritics
    x = ''.join(c for c in unicodedata.normalize('NFKD', x)
                if unicodedata.category(c) != 'Mn')
    # Normalize quotes and dashes
    x = re.sub(r"[‘’´`]", "'", x)
    x = re.sub(r"[“”]", "\"", x)
    x = re.sub(r"[‐‑‒–—−]", "-", x)
    while True:
        old_x = x
        # Remove citations
        x = re.sub(r"((?<!^)\[[^\]]*\]|\[\d+\]|[•♦†‡*#+])*$", "", x.strip())
        # Remove details in parenthesis
        x = re.sub(r"(?<!^)( \([^)]*\))*$", "", x.strip())
        # Remove outermost quotation mark
        x = re.sub(r'^"([^"]*)"$', r'\1', x.strip())
        if x == old_x:
            break
    # Remove final '.'
    if x and x[-1] == '.':
        x = x[:-1]
    # Collapse whitespaces and convert to lower case
    x = re.sub(r'\s+', ' ', x, flags=re.U).lower().strip()
    return x


################ Value Types ################

class Value(object):
    __metaclass__ = ABCMeta

    # Should be populated with the normalized string
    _normalized = None

    @abstractmethod
    def match(self, other):
        """Return True if the value matches the other value.
        Args:
            other (Value)
        Returns:
            a boolean
        """
        pass

    @property
    def normalized(self):
        return self._normalized


class StringValue(Value):

    def __init__(self, content):
        assert isinstance(content, str)
        self._normalized = normalize(content)
        self._hash = hash(self._normalized)

    def __eq__(self, other):
        return isinstance(other, StringValue) and self.normalized == other.normalized

    def __hash__(self):
        return self._hash

    def __str__(self):
        return 'S' + str([self.normalized])

    __repr__ = __str__

    def match(self, other):
        assert isinstance(other, Value)
        return self.normalized == other.normalized


class NumberValue(Value):

    def __init__(self, amount, original_string=None):
        assert isinstance(amount, (int, float))
        if abs(amount - round(amount)) < 1e-6:
            self._amount = int(amount)
        else:
            self._amount = float(amount)
        if not original_string:
            self._normalized = str(self._amount)
        else:
            self._normalized = normalize(original_string)
        self._hash = hash(self._amount)

    @property
    def amount(self):
        return self._amount

    def __eq__(self, other):
        return isinstance(other, NumberValue) and self.amount == other.amount

    def __hash__(self):
        return self._hash

    def __str__(self):
        return ('N(%f)' % self.amount) + str([self.normalized])

    __repr__ = __str__

    def match(self, other):
        assert isinstance(other, Value)
        if self.normalized == other.normalized:
            return True
        if isinstance(other, NumberValue):
            return abs(self.amount - other.amount) < 1e-6
        return False

    @staticmethod
    def parse(text):
        """Try to parse into a number.
        Return:
            the number (int or float) if successful; otherwise None.
        """
        try:
            return int(text)
        except:
            try:
                amount = float(text)
                assert not isnan(amount) and not isinf(amount)
                return amount
            except:
                return None


class DateValue(Value):

    def __init__(self, year, month, day, original_string=None):
        """Create a new DateValue. Placeholders are marked as -1."""
        assert isinstance(year, int)
        assert isinstance(month, int) and (month == -1 or 1 <= month <= 12)
        assert isinstance(day, int) and (day == -1 or 1 <= day <= 31)
        assert not (year == month == day == -1)
        self._year = year
        self._month = month
        self._day = day
        if not original_string:
            self._normalized = '{}-{}-{}'.format(
                year if year != -1 else 'xx',
                month if month != -1 else 'xx',
                day if day != '-1' else 'xx')
        else:
            self._normalized = normalize(original_string)
        self._hash = hash((self._year, self._month, self._day))

    @property
    def ymd(self):
        return (self._year, self._month, self._day)

    def __eq__(self, other):
        return isinstance(other, DateValue) and self.ymd == other.ymd

    def __hash__(self):
        return self._hash

    def __str__(self):
        return (('D(%d,%d,%d)' % (self._year, self._month, self._day))
                + str([self._normalized]))

    __repr__ = __str__

    def match(self, other):
        assert isinstance(other, Value)
        if self.normalized == other.normalized:
            return True
        if isinstance(other, DateValue):
            return self.ymd == other.ymd
        return False

    @staticmethod
    def parse(text):
        """Try to parse into a date.
        Return:
            tuple (year, month, date) if successful; otherwise None.
        """
        try:
            ymd = text.lower().split('-')
            assert len(ymd) == 3
            year = -1 if ymd[0] in ('xx', 'xxxx') else int(ymd[0])
            month = -1 if ymd[1] == 'xx' else int(ymd[1])
            day = -1 if ymd[2] == 'xx' else int(ymd[2])
            assert not (year == month == day == -1)
            assert month == -1 or 1 <= month <= 12
            assert day == -1 or 1 <= day <= 31
            return (year, month, day)
        except:
            return None


################ Value Instantiation ################

def to_value(original_string, corenlp_value=None):
    """Convert the string to Value object.
    Args:
        original_string (basestring): Original string
        corenlp_value (basestring): Optional value returned from CoreNLP
    Returns:
        Value
    """
    if isinstance(original_string, Value):
        # Already a Value
        return original_string
    if not corenlp_value:
        corenlp_value = original_string
    # Number?
    amount = NumberValue.parse(corenlp_value)
    if amount is not None:
        return NumberValue(amount, original_string)
    # Date?
    ymd = DateValue.parse(corenlp_value)
    if ymd is not None:
        if ymd[1] == ymd[2] == -1:
            return NumberValue(ymd[0], original_string)
        else:
            return DateValue(ymd[0], ymd[1], ymd[2], original_string)
    # String.
    return StringValue(original_string)


def to_value_list(original_strings, corenlp_values=None):
    """Convert a list of strings to a list of Values
    Args:
        original_strings (list[basestring])
        corenlp_values (list[basestring or None])
    Returns:
        list[Value]
    """
    assert isinstance(original_strings, (list, tuple, set))
    if corenlp_values is not None:
        assert isinstance(corenlp_values, (list, tuple, set))
        assert len(original_strings) == len(corenlp_values)
        return list(set(to_value(x, y) for (x, y)
                        in zip(original_strings, corenlp_values)))
    else:
        return list(set(to_value(x) for x in original_strings))


################ Check the Predicted Denotations ################

def check_denotation(target_values, predicted_values):
    """Return True if the predicted denotation is correct.

    Args:
        target_values (list[Value])
        predicted_values (list[Value])
    Returns:
        bool
    """
    # Check size
    if len(target_values) != len(predicted_values):
        return False
    # Check items
    for target in target_values:
        if not any(target.match(pred) for pred in predicted_values):
            return False
    return True
def tsv_unescape(x):
    """Unescape strings in the TSV file.
    Escaped characters include:
        newline (0x10) -> backslash + n
        vertical bar (0x7C) -> backslash + p
        backslash (0x5C) -> backslash + backslash

    Args:
        x (str or unicode)
    Returns:
        a unicode
    """
    return x.replace(r'\n', '\n').replace(r'\p', '|').replace('\\\\', '\\')

def tsv_unescape_list(x):
    """Unescape a list in the TSV file.
    List items are joined with vertical bars (0x5C)

    Args:
        x (str or unicode)
    Returns:
        a list of unicodes
    """
    return [tsv_unescape(y) for y in x.split('|')]

## query 改写

In [6]:
from data_loader import TableLoader
table_loader = TableLoader(table_name='tabfact', split='validation', use_sample=True, small_test=False)
from data_loader import TableFormat
import pandas as pd
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.messages import HumanMessage
from langchain.prompts.prompt import PromptTemplate
from utils import normalize_schema
model = ChatOpenAI(model_name="gpt-3.5-turbo-0125", openai_api_base="https://api.chatanywhere.com.cn/v1", openai_api_key="sk-kxgtm71G6zwC44lglIF5CfiEVVzjjc39TOtppkNAwrVA2fUW")
sample = table_loader.normalize_table(
                    table_loader.dataset[130])
print(sample['query'])
formatter = TableFormat(format='none', data=sample)
# schema_information = pd.read_csv(f"result/aug/tabfact_test_schema.csv", index_col='table_id')
# formatter.data = normalize_schema(formatter.data, schema_information.loc[sample['id']]['schema'])
q = """making question in cloze style and decompse question into continuity smaller question."""
amb_instruction = PromptTemplate(input_variables=['query'], template= q + 
"""
Q : {query}
"""
)
["Which country uses the US dollar as its currency and has the Federal Reserve as its central bank?"]

step_back_instruction = PromptTemplate(input_variables=['query'], template= """Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
Query: {query}""")
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
model = ChatOpenAI(model_name='gpt-3.5-turbo-0125', openai_api_base="https://api.chatanywhere.tech/v1",
                       openai_api_key="sk-kxgtm71G6zwC44lglIF5CfiEVVzjjc39TOtppkNAwrVA2fUW", temperature=0.1)
llm_chain = LLMChain(llm=model, prompt=step_back_instruction, verbose=True)
pre_instruction = PromptTemplate(input_variables=["query"], template=
"""
You are an expert at converting user questions into sub-questions. 
Think step by step to answer this question， and provide sub-questions for knowledge that you need. Split the queries with ’;’ and end the queries with ’**’
Question: {query}
Answer: """)
# 

output = llm_chain.invoke(input = {"query" : sample['query']}, verbose=True)



the country of ecuador be the only country that use the us dollar (usd) as its currency , and the central bank be the federal reserve


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYour task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
Query: the country of ecuador be the only country that use the us dollar (usd) as its currency , and the central bank be the federal reserve[0m

[1m> Finished chain.[0m


In [7]:

print(output['text'])

Step-back question: Which country uses the US dollar as its currency and has the Federal Reserve as its central bank?


In [33]:
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from data_loader import TableFormat
query_examples = ["the country of ecuador be the only country that use the us dollar (usd) as its currency , and the central bank be the federal reserve",]
task_examples = ["query rewrite", "query decompose", "query ambiguity resolve"]
new_query_examples = ["Which country uses the US dollar as its currency and has the Federal Reserve as its central bank?",
                    #   "what is the number of listings from barrington?; what is the number of listings from farmington?; what is the number of listings from rochester combined?",
                      ]
examples_prompt = PromptTemplate(input_variables=["query", "task", "new_query"], template=
"""
Query: {query}
new_query: {new_query}""")
num_k = 2
examples_dict = [{"query": query_examples[i],
                                    "new_query": new_query_examples[i]} for i in range(num_k)]
prompt = FewShotPromptTemplate(
    examples=examples_dict,
    example_prompt=examples_prompt,
    prefix="""Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.""",
    suffix=
    """
Query: {query}
    """,
    input_variables=["query"],
)


In [9]:
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
model = ChatOpenAI(model_name='gpt-3.5-turbo-0125', openai_api_base="https://api.chatanywhere.tech/v1",
                       openai_api_key="sk-kxgtm71G6zwC44lglIF5CfiEVVzjjc39TOtppkNAwrVA2fUW", temperature=0.1)
llm_chain = LLMChain(llm=model, prompt=stage_0_prompt, verbose=True)
sample = table_loader.normalize_table(
                    table_loader.dataset[130])
batch_pred = llm_chain.batch([{"query": sample['query']}], return_only_outputs=True)
# "Is the following query true or false?" +



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are an expert at converting user questions into database queries. 
Your task is to effectively decompose complex, multihop questions into simpler, manageable/abstract sub-questions or tasks. This process involves breaking down a question that requires information from multiple sources or steps into broader, more abstract questions that can be answered individually. 


Query: Is the following query true or false? after 2005 , the winner of the lifetime achievement award be andrew rule john silvester , sandra harvey lindsay simpson , marele day , shane maloney , and peter doyle
new_query: Who were the winners of the lifetime achievement award after 2005?; Are the winners andrew rule john silvester , sandra harvey lindsay simpson , marele day , shane maloney , and peter doyle?


Query: what is the number of listings from barrington, farmington, and rochester combined?
new_query: What is the number of list

In [10]:
print(sample['query'])
print(batch_pred[0]['text'])

the country of ecuador be the only country that use the us dollar (usd) as its currency , and the central bank be the federal reserve
new_query: Which country uses the US dollar (USD) as its currency?; Which central bank is associated with the US dollar (USD)?


In [2]:
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from data_loader import TableFormat
query_examples = ["Is the following query true or false? after 2005 , the winner of the lifetime achievement award be andrew rule john silvester , sandra harvey lindsay simpson , marele day , shane maloney , and peter doyle",
                  "what is the number of listings from barrington, farmington, and rochester combined?"]
task_examples = ["query rewrite", "query decompose", "query ambiguity resolve"]
new_query_examples = ["Who were the winners of the lifetime achievement award after 2005?; Are the winners andrew rule john silvester , sandra harvey lindsay simpson , marele day , shane maloney , and peter doyle?",
                      "What is the number of listings from barrington?; What is the number of listings from farmington?; What is the number of listings from rochester combined?",
                      ]
examples_prompt = PromptTemplate(input_variables=["table", "claim", "summary", "subtable"], template=
"""
Query: {query}
new_query: {new_query}""")
num_k = 2
examples_dict = [{"query": query_examples[i],
                                    "task": task_examples[i],
                                    "new_query": new_query_examples[i]} for i in range(num_k)]
stage_0_prompt = FewShotPromptTemplate(
    examples=examples_dict,
    example_prompt=examples_prompt,
    prefix="""You are an expert at converting user questions into database queries. 
Your task is to effectively decompose complex, multihop questions into simpler, manageable/abstract sub-questions or tasks. This process involves breaking down a question that requires information from multiple sources or steps into broader, more abstract questions that can be answered individually. """,
    suffix=
    """Query: {query}
    """,
    input_variables=["query"],
)
# Sub-questions are separated by semicolons.

In [3]:
# row_instruction = PromptTemplate(input_variables=["table", "claim", "aug"], 
#                                  template="""
# Our ultimate goal is to answer query based on the table. Below is a sub-table from the table. Choose from generating a SQLITE3 SELECT SQL code, or directly answering the question. 
# When generating SQL, you are required to infer the data distribution and format from the data of the sub-table. When answering question, you are required to use information from history.
# sub-table: {table}
# Query: {claim}
# Extra information: {aug}
# Output: #output the SQL/answer directly""")
answer_instruction = PromptTemplate(input_variables=["SQL", "table", "claim"], 
                                    template="""
Below is a sub-table generated by excuting the SQL. You need to understand the logic behind the SQL filtering and complete task using the final sub-table. 
SQL Excuted: 
```{SQL}```
Sub-table: {table}
Query: {claim}
answer the last question given in the query. Only return the string instead of other format information. Do not repeat the question.
""" )
muilti_answer_instruction = PromptTemplate(input_variables=["information", "claim"], 
                                    template="""
Below are some sub-tables generated by excuting the SQL. You need to understand the logic behind the SQL filtering and the result sub-table. Complete task using the final sub-table. 
{information}
Query: {claim}
verify whether the provided claim/query is true or false, return 0 if it's false, or 1 if it's true. Please think step by step and only return 0 or 1 without any other information at last.
""" )

In [29]:

from prompt_manager import get_k_shot_with_aug, get_k_shot_with_answer, view_instruction
import pandas as pd
from utils import parse_specific_composition
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
from data_loader import TableFormat, TableLoader
from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from sqlalchemy import create_engine
from executor import SQLManager
import sqlparse
task_name = 'tabfact'
split = 'test'
model_name = 'gpt-3.5-turbo-0125'
model = ChatOpenAI(model_name=model_name, openai_api_base="https://api.chatanywhere.tech/v1",
                       openai_api_key="sk-kxgtm71G6zwC44lglIF5CfiEVVzjjc39TOtppkNAwrVA2fUW", temperature=0.1)
schema_information = pd.read_csv(f"result/aug/{task_name}_{split}_schema.csv", index_col='table_id')
aug_information = pd.read_csv(f"result/aug/{task_name}_{split}_summary.csv", index_col='table_id')
composition_information = pd.read_csv(f"result/aug/{task_name}_{split}_composition.csv", index_col='table_id')
engine = create_engine('sqlite:///db/sqlite/tabfact.db', echo=False)
manager = SQLManager(engine=engine)
def end2end(sample):
    # stage 0: query augmentation
    llm_chain = LLMChain(llm=model, prompt=stage_0_prompt, verbose=True)
    stage_0_batch_pred = llm_chain.batch([{"query": sample['query']}], return_only_outputs=True)[0]['text'].split(':')[-1]
    print(stage_0_batch_pred)
    sub_queries = stage_0_batch_pred.split(';')
    sub_queries[-1] += "verify whether the provided claim/query are true or false. Return 0 if it's false, or 1 if it's true. Only return 0 or 1 without any other information." 
    # stage 1: column pick
    formatter = TableFormat(format='none', data=sample, use_sampling=True)
    for query in sub_queries:
        k_shot_prompt = get_k_shot_with_aug()
        llm_chain = LLMChain(llm=model, prompt=k_shot_prompt, verbose=False)
        summary_aug, column_aug = aug_information.loc[sample['id']]['summary'], aug_information.loc[sample['id']]['column_description'] 
        stage_1_batch_pred = llm_chain.batch([dict({'table': formatter.format_html(table_caption=sample['table']['caption']),
                                            'claim': query,
                                            'aug':  summary_aug
                                            })], return_only_outputs=True)[0]['text']
        print(stage_1_batch_pred)
        # stage 2: SQL generation
        # k_shot_prompt = row_instruction
        # llm_chain = LLMChain(llm=model, prompt=k_shot_prompt, verbose=True)
        columns = [formatter.normalize_col_name(c.strip()) for c in stage_1_batch_pred.split(',')]

        #     # formatter.normalize_schema(schema_information.loc[sample['id']]['schema'])
        formatter.data = formatter.data.loc[:, columns]
        
        extra_information = '\n'.join(parse_specific_composition(composition_information.loc[sample['id']]['composition'], formatter.data.columns))
        # stage_2_batch_pred = llm_chain.batch([dict({'table': formatter.format_html(table_caption=sample['table']['caption']),
        #                                     'claim': query,
        #                                     'aug':  extra_information
        #                                     })], return_only_outputs=True)[0]['text']
        # print(stage_2_batch_pred)
        # stage 3: SQL Excution
        # k_shot_prompt = answer_instruction
        prompt = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ]
)
        chain = prompt | model
        formatter.normalize_schema(schema_information.loc[sample['id']]['schema'])
        # formatter.data = manager.execute_from_df(stage_2_batch_pred, formatter.all_data, table_name='DF')
        demo_chat_history = ChatMessageHistory()
        input1 = row_instruction.format(**dict({'table': formatter.format_html(table_caption=sample['table']['caption']),
                                        'claim': query,
                                        'aug':  extra_information,
                                        }))
        chain_with_message_history = RunnableWithMessageHistory(
    chain,
    lambda session_id: demo_chat_history,
    input_messages_key="input",
    history_messages_key="chat_history",
)
    response = chain_with_message_history.invoke(
    {"input": input1},
    {"configurable": {"session_id": "unused"}},
)   
    print(response)
    sql_type = sqlparse.parse(response.content)[0]
    if sql_type == "SELECT":
        formatter.data = manager.execute_from_df(response.content, formatter.all_data, table_name='DF')
    

In [34]:
import sqlparse
parsed = sqlparse.parse('the answer is False')[0]
sql_type = parsed.get_type()

In [30]:
table_loader = TableLoader(table_name='tabfact', split='test', use_sample=True, small_test=True)
sample = table_loader.normalize_table(
                    table_loader.dataset[2])
end2end(sample)




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are an expert at converting user questions into database queries. 
Your task is to effectively decompose complex, multihop questions into simpler, manageable/abstract sub-questions or tasks. This process involves breaking down a question that requires information from multiple sources or steps into broader, more abstract questions that can be answered individually. 


Query: Is the following query true or false? after 2005 , the winner of the lifetime achievement award be andrew rule john silvester , sandra harvey lindsay simpson , marele day , shane maloney , and peter doyle
new_query: Who were the winners of the lifetime achievement award after 2005?; Are the winners andrew rule john silvester , sandra harvey lindsay simpson , marele day , shane maloney , and peter doyle?


Query: what is the number of listings from barrington, farmington, and rochester combined?
new_query: What is the number of list

In [1]:
!export LANGCHAIN_TRACING_V2="true"
!export LANGCHAIN_API_KEY="ls__83cde5e136ad42dc857aeb7bf791dd3a"

In [17]:
from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
chat = ChatOpenAI(model_name=model_name, openai_api_base="https://api.chatanywhere.tech/v1",
                       openai_api_key="sk-kxgtm71G6zwC44lglIF5CfiEVVzjjc39TOtppkNAwrVA2fUW", temperature=0.1)
prompt = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", """Below is a sub-table generated by excuting the SQL. You need to understand the logic behind the SQL filtering and complete task using the final sub-table. 
SQL Excuted: 
```{SQL}```
Sub-table: {table}
Query: {claim}
answer the last question given in the query. Only return the string instead of other format information. Do not repeat the question.
""" ),
    ]
)

chain = prompt | chat

In [18]:
from langchain_core.runnables.history import RunnableWithMessageHistory

demo_ephemeral_chat_history_for_chain = ChatMessageHistory()

chain_with_message_history = RunnableWithMessageHistory(
    chain,
    lambda session_id: demo_ephemeral_chat_history_for_chain,
    input_messages_key=["table", "claim", "SQL"],
    history_messages_key="chat_history",
)

ValidationError: 1 validation error for RunnableWithMessageHistory
input_messages_key
  str type expected (type=type_error.str)

In [6]:
chain_with_message_history.invoke(
    {"table": "test-table", "claim": "test-claim", "SQL": "test-SQL"},
    {"configurable": {"session_id": "unused"}},
)

AIMessage(content='The translation of "I love programming" in French is "J\'adore la programmation."', response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 39, 'total_tokens': 59}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None})

In [7]:
chain_with_message_history.invoke(
    {"input": "What did I just ask you?"}, {"configurable": {"session_id": "unused"}}
)

AIMessage(content='You just asked me to translate the sentence "I love programming" from English to French.', response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 74, 'total_tokens': 92}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None})

In [None]:

from prompt_manager import get_k_shot_with_aug, get_k_shot_with_answer, view_instruction
import pandas as pd
from utils import parse_specific_composition
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
from data_loader import TableFormat, TableLoader
from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from sqlalchemy import create_engine
from executor import SQLManager
task_name = 'tabfact'
split = 'test'
model_name = 'gpt-3.5-turbo-0125'
model = ChatOpenAI(model_name=model_name, openai_api_base="https://api.chatanywhere.tech/v1",
                       openai_api_key="sk-kxgtm71G6zwC44lglIF5CfiEVVzjjc39TOtppkNAwrVA2fUW", temperature=0.1)
schema_information = pd.read_csv(f"result/aug/{task_name}_{split}_schema.csv", index_col='table_id')
aug_information = pd.read_csv(f"result/aug/{task_name}_{split}_summary.csv", index_col='table_id')
composition_information = pd.read_csv(f"result/aug/{task_name}_{split}_composition.csv", index_col='table_id')
engine = create_engine('sqlite:///db/sqlite/tabfact.db', echo=False)
manager = SQLManager(engine=engine)
def end2end(sample):
    # stage 0: query augmentation
    llm_chain = LLMChain(llm=model, prompt=stage_0_prompt, verbose=True)
    stage_0_batch_pred = llm_chain.batch([{"query": sample['query']}], return_only_outputs=True)[0]['text'].split(':')[-1]
    print(stage_0_batch_pred)
    sub_queries = stage_0_batch_pred.split(';')
    sub_queries[-1] += "verify whether the provided claim/query are true or false. Return 0 if it's false, or 1 if it's true. Only return 0 or 1 without any other information." 
    # stage 1: column pick
    formatter = TableFormat(format='none', data=sample, use_sampling=True)
    for query in sub_queries:
        k_shot_prompt = get_k_shot_with_aug()
        llm_chain = LLMChain(llm=model, prompt=k_shot_prompt, verbose=True)
        summary_aug, column_aug = aug_information.loc[sample['id']]['summary'], aug_information.loc[sample['id']]['column_description'] 
        stage_1_batch_pred = llm_chain.batch([dict({'table': formatter.format_html(table_caption=sample['table']['caption']),
                                            'claim': query,
                                            'aug':  summary_aug
                                            })], return_only_outputs=True)[0]['text']
        print(stage_1_batch_pred)
        # stage 2: SQL generation
        # k_shot_prompt = row_instruction
        # llm_chain = LLMChain(llm=model, prompt=k_shot_prompt, verbose=True)
        # columns = [formatter.normalize_col_name(c.strip()) for c in stage_1_batch_pred.split(',')]

        #     # formatter.normalize_schema(schema_information.loc[sample['id']]['schema'])
        # formatter.data = formatter.data.loc[:, columns]
        
        # extra_information = '\n'.join(parse_specific_composition(composition_information.loc[sample['id']]['composition'], formatter.data.columns))
        # stage_2_batch_pred = llm_chain.batch([dict({'table': formatter.format_html(table_caption=sample['table']['caption']),
        #                                     'claim': query,
        #                                     'aug':  extra_information
        #                                     })], return_only_outputs=True)[0]['text']
        # print(stage_2_batch_pred)
        # stage 3: SQL Excution
        # k_shot_prompt = answer_instruction
        prompt = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ]
)
        chain = prompt | model
        formatter.normalize_schema(schema_information.loc[sample['id']]['schema'])
        formatter.data = manager.execute_from_df(stage_2_batch_pred, formatter.all_data, table_name='DF')
        demo_chat_history = ChatMessageHistory()
        input1 = row_instruction.format(**dict({'table': formatter.format_html(table_caption=sample['table']['caption']),
                                        'claim': query,
                                        'aug':  extra_information,
                                        }))
        chain_with_message_history = RunnableWithMessageHistory(
    chain,
    lambda session_id: demo_chat_history,
    input_messages_key="input",
    history_messages_key="chat_history",
)
    chain_with_message_history.invoke(
    {"input": input1},
    {"configurable": {"session_id": "unused"}},
)

## SQL composition

Logic:
Transform the input to the grammarly correct and executable SQL

- 1. RULE based
- 2. LLM based 

```

kanji<DELETE>, name<KEEP>, builder<GROUP BY>, laid down<DELETE>, launched<DELETE>, completed<DELETE>

```

TODO: where部分如何操作

In [7]:
import random 
import string
import datetime
random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k=20)
datetime.datetime.now().strftime('%m-%d_%H-%M-%S')

'04-02_06-08-10'

In [9]:
from sqlbuilder.query import SELECT, C, T
from sqlbuilder.dummy import dummy_connection, dummy_context
str(SELECT(C.column_name, C.another_column, 123, 'abc').FROM(T.table_name)._as_sql(dummy_connection, dummy_context))
a='test'


<SELECT 'SELECT a', ()>

In [11]:
import re
test_sample = 'round<KEEP> clubs remaining<KEEP> clubs involved<KEEP> winners from previous round<KEEP> new entries this round<DELETE> leagues entering at this round<KEEP>'
matches = re.finditer(r'([^<]*)<([^\s>]*)>', test_sample)
for match in matches:
    print(match.group(0))
    # print(match.group[0])
    print(match.group(1), match.group(2))

round<KEEP>
round KEEP
 clubs remaining<KEEP>
 clubs remaining KEEP
 clubs involved<KEEP>
 clubs involved KEEP
 winners from previous round<KEEP>
 winners from previous round KEEP
 new entries this round<DELETE>
 new entries this round DELETE
 leagues entering at this round<KEEP>
 leagues entering at this round KEEP


In [19]:
import re
import sqlparse
def format_sql(output:str):
    """
    Format subtable output into SQL
    """
    items = output.split(',')
    AGG = ['COUNT', 'AVG', 'SUM', 'MAX', 'MIN', 'KEEP']
    complex = ['GROUP BY', 'ORDER BY']
    select_content = []
    complex_content = []
    for text in items:
        match = re.search(r'(\w+)\s*<([^>]*)>', text)
        item = match.group(1).strip()
        dml = match.group(2).strip()
        if dml in AGG:
            if dml == 'KEEP':
                select_content.append(f'{item}')
            else:
                select_content.append(dml + f'({item})')
        if dml in complex:
            # if dml == 'GROUP BY':
            #     complex_content.append(dml + f' {item}')
            #     select_content.append(f'{item}')
            complex_content.append(dml + f' {item}')
            select_content.append(f'{item}')
    return sqlparse.format('SELECT ' + ' '.join(select_content) + ' '+' '.join(complex_content), keyword_case='upper', reindent=True)

            
    

SELECT name builder
GROUP BY builder


In [4]:
manager.format_sql(test_sample)

'SELECT name builder\nGROUP BY builder'

In [8]:
from sqlalchemy import create_engine 
from sqlalchemy.orm import Session,sessionmaker
engine = create_engine('sqlite:///db/sqlite/tabfact.db', echo=True)
session = Session(engine)

In [19]:
for i in sqlparse.parse('select * from DF;'):
    print(i.tokens)

[<DML 'select' at 0x7FAB15075240>, <Whitespace ' ' at 0x7FAB15074C40>, <Wildcard '*' at 0x7FAB15074D00>, <Whitespace ' ' at 0x7FAB15074F40>, <Keyword 'from' at 0x7FAB15074EE0>, <Whitespace ' ' at 0x7FAB15074160>, <Identifier 'DF' at 0x7FAB14CDCCF0>, <Punctuation ';' at 0x7FAB150751E0>]


In [2]:
import sqlparse
from typing import List, Any
from sqlalchemy import text
def __sql_parse(sql):
        sql = sql.strip()
        parsed = sqlparse.parse(sql)[0]
        sql_type = parsed.get_type()

        table_name = parsed.get_name()

        first_token = parsed.token_first(skip_ws=True, skip_cm=False)
        ttype = first_token.ttype
        print(
            f"SQL:{sql}, ttype:{ttype}, sql_type:{sql_type}, table:{table_name}"
        )
        return parsed, ttype, sql_type, table_name 
    
def _query(query: str, session, fetch: str = "all"):
        """Run a SQL query and return the results as a list of tuples.

        Args:
            query (str): SQL query to run
            fetch (str): fetch type
        """
        result: List[Any] = []

        print(f"Query[{query}]")
        if not query:
            return result
        cursor = session.execute(text(query))
        if cursor.returns_rows:
            if fetch == "all":
                result = cursor.fetchall()
            elif fetch == "one":
                result = [cursor.fetchone()]
            else:
                raise ValueError("Fetch parameter must be either 'one' or 'all'")
            field_names = tuple(i[0:] for i in cursor.keys())

            result.insert(0, field_names)
            return result
        
def get_simple_fields(self, table_name):
        """Get column fields about specified table."""
        return _query(f"SHOW COLUMNS FROM {table_name}")
    
def run(command: str, session, fetch: str = "all") -> List:
    """Execute a SQL command and return a string representing the results."""
    if not command or len(command) < 0:
        return []
    parsed, ttype, sql_type, table_name = __sql_parse(command)
    if ttype == sqlparse.tokens.DML:
        if sql_type == "SELECT":
            return _query(command, session, fetch)
        else:
            return get_simple_fields(table_name)
        
        


In [2]:
preds = []
import json
with open('./result/data/tabfact_test_04-07_08-05-52.json', 'r') as f:
    lines = f.readlines()
    for l in lines:
        preds.append(json.loads(l)['pred'])
    print(len(preds))

22


### coposition augmentation

In [4]:
from prompt_manager import get_k_shot_with_answer, view_instruction, row_instruction
import pandas as pd
from utils import parse_specific_composition
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
from data_loader import TableFormat, TableLoader
from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain_core.runnables.history import RunnableWithMessageHistory
from sqlalchemy import create_engine
from executor import SQLManager
import sqlparse
task_name = 'tabfact'
split = 'test'
model_name = 'gpt-3.5-turbo-0125'
model = ChatOpenAI(model_name=model_name, openai_api_base="https://api.chatanywhere.tech/v1",
                       openai_api_key="sk-kxgtm71G6zwC44lglIF5CfiEVVzjjc39TOtppkNAwrVA2fUW", temperature=0.1)
schema_information = pd.read_csv(f"result/aug/{task_name}_{split}_schema.csv", index_col='table_id')
aug_information = pd.read_csv(f"result/aug/{task_name}_{split}_summary.csv", index_col='table_id')
composition_information = pd.read_csv(f"result/aug/{task_name}_{split}_composition.csv", index_col='table_id')
engine = create_engine('sqlite:///db/sqlite/tabfact.db', echo=False)
manager = SQLManager(engine=engine)

In [17]:
TableFormat(format='none', data=table_loader.dataset[20], use_sampling=True).data.reset_index()

Unnamed: 0,index,goal,date,venue,score,result,competition
0,0,1,"september 4 , 2001","estadio nacional de chile , santiago , chile",0 - 1,0 - 2,2002 world cup qualification
1,1,2,"november 20 , 2002","brígido iriarte , caracas , venezuela",1 - 0,1 - 0,friendly
2,5,6,"june 26 , 2007","pueblo nuevo , san cristóbal , venezuela",2 - 1,2 - 2,2007 copa américa


In [23]:
table_loader = TableLoader(table_name='tabfact', split='validation', use_sample=False)
example_data = TableFormat(format='none', data=table_loader.dataset[20], use_sampling=True).data.iloc[:, [0,1,2,3,5]].reset_index(drop=True)
example_data['nation'] = TableFormat(format='none', data=table_loader.dataset[130], use_sampling=True).data.iloc[:, 1].reset_index(drop=True)

In [25]:
example_data['nation'] = TableFormat(format='none', data=table_loader.dataset[130], use_sampling=True).data.iloc[:, 1].reset_index(drop=True)

In [26]:
example_data

Unnamed: 0,goal,date,venue,score,competition,nation
0,1,"september 4 , 2001","estadio nacional de chile , santiago , chile",0 - 1,2002 world cup qualification,hungary (hun)
1,2,"november 20 , 2002","brígido iriarte , caracas , venezuela",1 - 0,friendly,east germany (gdr)
2,6,"june 26 , 2007","pueblo nuevo , san cristóbal , venezuela",2 - 1,2007 copa américa,poland (pol)


In [14]:
pd.concat([example_data, TableFormat(format='none', data=table_loader.dataset[130], use_sampling=True).data.iloc[:, 1]],axis=1)


Unnamed: 0,goal,date,venue,score,competition,nation
0,1.0,"september 4 , 2001","estadio nacional de chile , santiago , chile",0 - 1,2002 world cup qualification,east germany (gdr)
1,2.0,"november 20 , 2002","brígido iriarte , caracas , venezuela",1 - 0,friendly,
5,6.0,"june 26 , 2007","pueblo nuevo , san cristóbal , venezuela",2 - 1,2007 copa américa,hungary (hun)
9,,,,,,poland (pol)


In [None]:
def get_k_shot_with_content(k: int = 2):
    table_loader = TableLoader(table_name='tabfact', split='validation', use_sample=False)
    Output_examples = ["""
                       
                       """]
    examples_prompt = PromptTemplate(input_variables=["table", "claim", "summary", "output"], template="""
    Table: {table}
    Output: {example}""")
    table_loader = TableLoader(table_name='tabfact', split='validation', use_sample=False)
    example_data = TableFormat(format='none', data=table_loader.dataset[20], use_sampling=True).data.iloc[:, [0,1,2,3,5]].reset_index(drop=True)
    example_data['nation'] = TableFormat(format='none', data=table_loader.dataset[130], use_sampling=True).data.iloc[:, 1].reset_index(drop=True)
    examples_dict = [{"table": TableFormat(format='none', data=example_data, use_sampling=True).format_html(),
                      "example": Output_examples[i]} for i in range(k)]
    prompt_template = FewShotPromptTemplate(
        examples=examples_dict,
        example_prompt=examples_prompt,
        prefix=
    """Below is a subtable with rows sampled, your task is to summarize the content and find commonalities in each column.
    Refine commonalities about the contents within each table column. The example is below:""",
        suffix="""Table: {table}""",
        input_variables=["table"],
    )
    return prompt_template

In [2]:
def get_k_shot_with_string(k: int = 2):
    table_loader = TableLoader(table_name='tabfact', split='validation', use_sample=False)
    Output_examples = [
        # """leagues_entering_at_this_round: different league name joint with '&' or None value""",
                       """
    goal: sequential number like 1, 2, 3...
    date: date in the format of Y-M-D
    venue: venue in the format of location, city, country
    score: score number in the format of X-Y
    result: result number in the format of X-Y
    competition: competition name or friendly
    nation: nation name with abbreviation within parentheses"""]
    examples_prompt = PromptTemplate(input_variables=["table", "claim", "summary", "output"], template="""
    Table: {table}
    Output: {example}""")
    table_loader = TableLoader(table_name='tabfact', split='validation', use_sample=False)
    example_data = TableFormat(format='none', data=table_loader.dataset[20], use_sampling=True).data.iloc[:, [0,1,2,3,5]].reset_index(drop=True)
    example_data['nation'] = TableFormat(format='none', data=table_loader.dataset[130], use_sampling=True).data.iloc[:, 1].reset_index(drop=True)
    examples_dict = [{"table": TableFormat(format='none', data=example_data, use_sampling=True).format_html(),
                      "example": Output_examples[i]} for i in range(k)]
    prompt_template = FewShotPromptTemplate(
        examples=examples_dict,
        example_prompt=examples_prompt,
        prefix=
    """Below is a subtable with rows sampled, your task is to summarize and synthesize each column in the table, identifying commonalities in the string representations, and ultimately output string format commanalities for each column.
    The example is below:""",
        suffix="""Table: {table}""",
        input_variables=["table"],
    )
    return prompt_template

In [7]:
table_loader = TableLoader(table_name='tabfact', split='test', use_sample=False, small_test=True)
table_loader = TableLoader(table_name='tabfact', split='validation', use_sample=False, small_test=False)
sample = table_loader.normalize_table(table_loader.dataset[38])
formatter = TableFormat(format='none',data = sample, use_sampling=True)
pre_instruction_com = PromptTemplate(input_variables=["table"], template="""
Below is a subtable with rows sampled, your task is to summarize and synthesize each column in the table, identifying commonalities in the string representations, and ultimately abstracting a text template.
Table: {table}
""")
temp = get_k_shot_with_string(k = 1)
chain = LLMChain(llm=model, prompt=temp, verbose=True)
output  = chain.batch([formatter.format_html()],return_only_outputs=True)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mBelow is a subtable with rows sampled, your task is to summarize and synthesize each column in the table, identifying commonalities in the string representations, and ultimately output string format commanalities for each column.
    The example is below:


    Table: <table>
<thead>
<tr><th>  goal</th><th>              date</th><th>                                       venue</th><th>  score</th><th>                 competition</th><th>            nation</th></tr>
</thead>
<tbody>
<tr><td>1     </td><td>september 4 , 2001</td><td>estadio nacional de chile , santiago , chile</td><td>0 - 1  </td><td>2002 world cup qualification</td><td>hungary (hun)     </td></tr>
<tr><td>2     </td><td>november 20 , 2002</td><td>brígido iriarte , caracas , venezuela       </td><td>1 - 0  </td><td>friendly                    </td><td>east germany (gdr)</td></tr>
<tr><td>6     </td><td>june 26 , 2007    </td><td>pueblo nuevo

In [8]:
print(output[0]['text'])

Output:
rank: sequential number like 1, 2, 3...
rider: rider's name
team: team name
speed: speed in mph
time: time in the format of H:M:S.SS


In [None]:
def batch_composition_aug(self, formatter: TableFormat, batch_data, batch_size: int, output_token=False, schema_information=None):
        """
        batch composition augmentation
        """
        pre_instruction_com = PromptTemplate(input_variables=["table"], template="""
        Below is a subtable with rows sampled, your task is to summarize and synthesize each column in the table, identifying commonalities at the character level, and ultimately abstracting a text template.
        You need to output the template as a row in the table. The format should be as followed:
        COLUMN_NAME1 | COLUMN_NAME2 ...
        COLUMN_FORMAT1 | COLUMN_FORMAT2 ... 
        
        Table: {table}
        """)
        com_list = []
        llm_chain = LLMChain(
            llm=self.llm, prompt=pre_instruction_com, verbose=False)
        with get_openai_callback() as cb:
            # add schema augmentaion info first
            if schema_information is not None:
                batch_pred = llm_chain.batch([formatter.load_data_from_dic(batch_data[i], schema_information=schema_information.loc[batch_data[i]['id']]['schema']).format_html(
                    batch_data[i]['caption']) for i in range(batch_size)], return_only_outputs=True)
            else:
                batch_pred = llm_chain.batch([formatter.load_data_from_dic(batch_data[i]).format_html(
                    batch_data[i]['caption']) for i in range(batch_size)], return_only_outputs=True)
        for i in range(len(batch_pred)):
            parts = batch_pred[i]['text']
            com_list.append(parts)
        if output_token:
            logger.info(
                f"Batch Composition Augmentaion  All Tokens: {cb.total_tokens}")
            logger.info(
                f"Batch Composition Augmentaion Tokens Average: {cb.total_tokens / batch_size if batch_size > 0 else 0}" )
        return com_list

## decomposition 训练

In [5]:
from prompt_manager import get_k_shot_with_answer, view_instruction, row_instruction
import pandas as pd
from utils import parse_specific_composition
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
from data_loader import TableFormat, TableLoader
from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from sqlalchemy import create_engine
from executor import SQLManager
import sqlparse
embeddings = HuggingFaceBgeEmbeddings(
            model_name='BAAI/bge-large-en',
            model_kwargs={'device': 'cuda:3', 'trust_remote_code': True},
            encode_kwargs={'normalize_embeddings': True})

In [10]:
table_loader = TableLoader(table_name='wikitable', split='train', use_sample=False)
model = ChatOpenAI(model_name='gpt-3.5-turbo-0125', openai_api_base="https://api.chatanywhere.tech/v1",
                       openai_api_key="sk-WZtqZEeuE0Xb6syVghDgAxdwe0ASWLkQRGxl61UI7B9RqNC4", temperature=0.1)

pre_instruction = PromptTemplate(input_variables=["query"], template=
"""
Given the question and the answer based on the table, you need to answer whether the information is helpful to answer the original question. Think step by step and only return True/False at last.
""")
# llm_chain = LLMChain(llm=model, prompt=pre_instruction, verbose=True)

In [11]:
sample = table_loader.normalize_table(
                        table_loader.dataset[0])
all_tokens = 0
all_queries = []
formatter = TableFormat(format='none', data=sample, save_embedding=True, embeddings=embeddings)
sample_data = formatter.get_sample_data(sample_type='embedding', query=sample['query'], k=5)
Zero_shot_prompt = PromptTemplate(input_variables=["table", "claim", "aug"], 
                                  template="""
Below is a subtable with columns filtered, you are required to infer the data distribution and format from the sample data of the sub-table. Carefully analyze the query, answer the question given in the query. Only return the string instead of other format information. Do not repeat the question.
sub-table: {table}
Query: {claim}
Output: """
)
llm_chain = LLMChain(llm=model, prompt=Zero_shot_prompt, verbose=True)
llm_chain.batch([{"table": TableFormat.format_html(data=sample_data, table_caption=sample['table']['caption']), "claim": sample['query']}])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Below is a subtable with columns filtered, you are required to infer the data distribution and format from the sample data of the sub-table. Carefully analyze the query, answer the question given in the query. Only return the string instead of other format information. Do not repeat the question.
sub-table: <table>
<caption>Portland Timbers (2001–10)</caption>
<thead>
<tr><th>  Year</th><th>  Division</th><th>            League</th><th>  Regular_Season</th><th>       Playoffs</th><th>       Open_Cup</th><th>  Avg_Attendance</th></tr>
</thead>
<tbody>
<tr><td>2002  </td><td>2         </td><td>USL A-League      </td><td>2nd, Pacific    </td><td>1st Round      </td><td>Did not qualify</td><td>6,260           </td></tr>
<tr><td>2001  </td><td>2         </td><td>USL A-League      </td><td>4th, Western    </td><td>Quarterfinals  </td><td>Did not qualify</td><td>7,169           </td></tr>
<tr><td>2003  </td><td>

[{'table': '<table>\n<caption>Portland Timbers (2001–10)</caption>\n<thead>\n<tr><th>  Year</th><th>  Division</th><th>            League</th><th>  Regular_Season</th><th>       Playoffs</th><th>       Open_Cup</th><th>  Avg_Attendance</th></tr>\n</thead>\n<tbody>\n<tr><td>2002  </td><td>2         </td><td>USL A-League      </td><td>2nd, Pacific    </td><td>1st Round      </td><td>Did not qualify</td><td>6,260           </td></tr>\n<tr><td>2001  </td><td>2         </td><td>USL A-League      </td><td>4th, Western    </td><td>Quarterfinals  </td><td>Did not qualify</td><td>7,169           </td></tr>\n<tr><td>2003  </td><td>2         </td><td>USL A-League      </td><td>3rd, Pacific    </td><td>Did not qualify</td><td>Did not qualify</td><td>5,871           </td></tr>\n<tr><td>2004  </td><td>2         </td><td>USL A-League      </td><td>1st, Western    </td><td>Quarterfinals  </td><td>4th Round      </td><td>5,628           </td></tr>\n<tr><td>2006  </td><td>2         </td><td>USL First Di

In [25]:
ord('-')

45

In [24]:
ord('–')

8211

In [23]:
formatter.all_data['Year'].apply(lambda x: x +'hhh')

0    2001hhh
1    2002hhh
2    2003hhh
3    2004hhh
4    2005hhh
5    2006hhh
6    2007hhh
7    2008hhh
8    2009hhh
9    2010hhh
Name: Year, dtype: object

In [19]:
def time_to_minutes(time_str):
   # 分割字符串以获取小时、分钟和秒
   hours, minutes_seconds = time_str.split(":")
   minutes, seconds = minutes_seconds.split(".")
   
   # 将小时、分钟和秒转换为分钟
   total_minutes = int(hours) * 60 + int(minutes) + float(seconds)
   
   return total_minutes
time_to_minutes("1:03.49")


63.81666666666667

In [18]:
a

datetime.datetime(2038, 5, 8, 0, 0)