In [None]:
from llama_cpp import Llama
import pandas as pd
import duckdb, os

In [None]:
# Path to your GGUF model
MODEL_PATH = "models/llama-3-sqlcoder-8b.Q6_K.gguf"

# llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=6)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,  # Lower context size if RAM is an issue
    n_threads=6,
    n_gpu_layers=20,  # Safer for 8GB GPU
    verbose=True
)

In [None]:
# Load CSV into DuckDB
df = pd.read_csv("./data/llm_dataset_v10.gz")
con = duckdb.connect()
con.register("sales_data", df)

In [None]:
# Prompt template
def build_prompt(nlq):
    schema = "sales_data(region TEXT, quarter TEXT, sales INT)"
    prompt = f"""### You are an expert Postgres SQL generator.
### Given the following table schema:
# {schema}

### Write a SQL query to answer the question:
# {nlq}

### SQL:
"""
    return prompt

In [None]:
# Query model
def generate_sql(prompt):
    output = llm(prompt, temperature=0, max_tokens=256)
    text = output["choices"][0]["text"]

    if "SELECT" not in text.upper():
        print("❌ 'SELECT' not found in model output. Raw output:")
        print(text)
        return None

    # Try to extract SQL statement cleanly
    try:
        sql = "SELECT " + text.upper().split("SELECT", 1)[1].split(";")[0].strip() + ";"
        return sql
    except Exception as e:
        print("❌ Error while parsing SQL:", e)
        print("Raw model output:")
        print(text)
        return None

In [None]:
# Run query
def run_nlq(nlq):
    prompt = build_prompt(nlq)
    sql = generate_sql(prompt)

    if not sql:
        print("\n⚠️ Could not generate valid SQL.")
        return

    print("\n📜 Generated SQL:")
    print(sql)

    try:
        result = con.execute(sql).fetchdf()
        print("\n📊 Query Result:")
        print(result)
    except Exception as e:
        print("\n❌ SQL Execution Error:")
        print(e)

In [None]:
run_nlq("What were the total sales in Q3 for the Northeast?")

In [None]:
con.execute("SELECT * FROM sales_data LIMIT 5").fetchdf()

In [None]:
sql = "SELECT SUM(sales) AS total_primary_sales FROM sales_data WHERE (month >= 7 AND month <= 9) AND year = 2024"
print("\n📜 Executing SQL:")
con.execute(sql).fetchdf()

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os

# Set your desired local directory for saving the model
LOCAL_DIR = "./models/t5_small"

# Step 1: Download and save model/tokenizer locally (only needs to be done once)
def download_and_save_model():
    if not os.path.exists(LOCAL_DIR):
        os.makedirs(LOCAL_DIR)
        print("Downloading T5-small model and tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("t5-small")
        model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
        tokenizer.save_pretrained(LOCAL_DIR)
        model.save_pretrained(LOCAL_DIR)
        print(f"Model saved to {LOCAL_DIR}")
    else:
        print("Model already downloaded.")

# Step 2: Load model/tokenizer from local dir
def load_model_from_local():
    print("Loading model from local directory...")
    tokenizer = AutoTokenizer.from_pretrained(LOCAL_DIR)
    model = AutoModelForSeq2SeqLM.from_pretrained(LOCAL_DIR)
    return tokenizer, model

# Step 3: Summarize a table (as text)
def summarize_table(tokenizer, model, table_text):
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

    prompt = "Summarize the following table:\n" + table_text

    result = pipe(prompt, max_length=100, do_sample=False)[0]['generated_text']
    return result

# Example table (as markdown-style text)
example_table = """
| Year | Product | Sales | Region |
|------|---------|-------|--------|
| 2023 | A       | 1200  | North  |
| 2023 | B       | 900   | South  |
| 2024 | A       | 1400  | North  |
| 2024 | B       | 1100  | South  |
"""

# Run everything
if __name__ == "__main__":
    download_and_save_model()
    tokenizer, model = load_model_from_local()
    summary = summarize_table(tokenizer, model, example_table)

    print("\n🔍 Summary of the Table:")
    print(summary)

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
tokenizer = AutoTokenizer.from_pretrained("./models/bloom_totto")
model = AutoModelForCausalLM.from_pretrained("./models/bloom_totto")

device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

The model 'BloomForCausalLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


In [5]:
def linearize_df(df):
    # Header row
    header = " | ".join(df.columns) + " ; "
    
    # Data rows
    rows = []
    for _, row in df.iterrows():
        # Convert each value to string and join
        row_str = " | ".join(str(x) for x in row.values)
        rows.append(row_str)
    
    # Combine all rows with ';' separator
    table_str = header + " ; ".join(rows)
    return table_str


In [8]:
import pandas as pd
data = {
    "Year": [2023, 2023, 2024, 2024],
    "Product": ["A", "B", "A", "B"],
    "Sales": [1200, 900, 1400, 1100],
    "Region": ["North", "South", "North", "South"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Year,Product,Sales,Region
0,2023,A,1200,North
1,2023,B,900,South
2,2024,A,1400,North
3,2024,B,1100,South


In [9]:
table_input = linearize_df(df)

# Generate summary/description
result = summarizer(table_input, max_length=150, do_sample=False)

print("Generated Summary:")
print(result[0]['summary_text'])  # or 'generated_text' depending on the pipeline

Your max_length is set to 150, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)


Generated Summary:
Year | Product | Sales | Region ; 2023 | A | 1200 | North ; 2023 | B | 900 | South ; 2024 | A | 1400 | North ; 2024 | B | 1100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 | South ; 2100 |


In [10]:
table_input

'Year | Product | Sales | Region ; 2023 | A | 1200 | North ; 2023 | B | 900 | South ; 2024 | A | 1400 | North ; 2024 | B | 1100 | South'

In [7]:
from llama_cpp import Llama
import pandas as pd

# Load your model (update the path to your GGUF model)
llm = Llama(model_path="./models/Mistral-7B-Instruct-v0.1.Q6_K.gguf", n_ctx=2048)

# Sample DataFrame
df = pd.DataFrame({
    'Date': ['2025-01-01', '2025-01-02', '2025-01-03'],
    'Sales': [100, 200, 150],
    'Product': ['A', 'A', 'B']
})

# Format as text
table_text = df.to_markdown(index=False)

# Prompt
prompt = f"Analyze the following table and summarize key patterns:\n\n{table_text}"

# Call Mistral
response = llm(prompt, max_tokens=256, stop=["</s>"])
print(response["choices"][0]["text"])

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 



| 2025-01-04 |     250 | B         |
| 2025-01-05 |     75 | A         |
| 2025-01-06 |    125 | B         |
| 2025-01-07 |    100 | A         |
| 2025-01-08 |     50 | B         |
| 2025-01-09 |     30 | A         |

**Summary:** From the table, it is evident that Product "A" had consistently higher sales compared to Product "B". Between January 1st and 9th, Sales for product "A" ranged from 75 to 200, while for product "B", the sales range was from 30 to 150. Additionally, there is a noticeable pattern of increase in sales for both products on certain days; for instance, sales for Product "A" increased from 100 to 200 on January 1st and 2nd, and for Product


In [None]:
# import pandas as pd
# from ollama import Ollama

# # Step 1: Create a sample pandas DataFrame
# df = pd.DataFrame({
#     'Product': ['A', 'B', 'C'],
#     'Sales': [100, 150, 90],
#     'Month': ['June', 'June', 'June']
# })

# # Step 2: Convert DataFrame to markdown table
# table_text = df.to_markdown(index=False)

# # Step 3: Create a prompt for summarization
# prompt = f"""
# You are a helpful data analyst.

# Given the following table, summarize the key insights, trends, or anomalies:

# {table_text}
# """

# # Step 4: Use Ollama Python client to send prompt to Mistral
# client = Ollama()
# response = client.chat(model='mistral', messages=[
#     {"role": "user", "content": prompt}
# ])

# # Step 5: Print the response
# print("\n🧠 Summary:")
# print(response['message']['content'])
