In [1]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1


In [2]:
!pip install transformers peft accelerate huggingface_hub torc sentencepiece pandas -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m63.3 MB/s[0m eta [

In [3]:
from huggingface_hub import notebook_login
# Render the Hugging Face login widget (see the VBox output of this cell).
# NOTE(review): presumably required because the base checkpoint loaded
# further down is access-gated on the Hub - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizerFast, GenerationConfig
from peft import PeftModel
import torch
import pandas as pd
import warnings

# Hide only those warnings whose message mentions both `max_new_tokens` and
# `max_length` (in that order); every other warning is still shown.
warnings.filterwarnings(
    action="ignore",
    message=".*max_new_tokens.*max_length.*",
)

# =========================
# Step 1: Load Model + Tokenizer
# =========================
base_model = "meta-llama/Meta-Llama-3-8B"
peft_model = "FinGPT/fingpt-mt_llama3-8b_lora"

# Resolve the target device once, before anything is loaded, so that the
# model loader and every later `.to(device)` call agree. Previously the
# loader was hard-wired to "cuda:0" while `device` fell back to CPU, which
# made the fallback unreachable on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = LlamaTokenizerFast.from_pretrained(base_model, legacy=False)
# Reuse the end-of-sequence token for padding so that `padding=True` can be
# used when encoding prompts.
tokenizer.pad_token = tokenizer.eos_token

# Load the base weights directly onto the resolved device, then wrap the
# base model with the LoRA adapter.
model = LlamaForCausalLM.from_pretrained(base_model, device_map=str(device))
model = PeftModel.from_pretrained(model, peft_model)
model.eval()  # inference only

# Make sure the wrapped model (base + adapter) lives on `device`.
model.to(device)

# =========================
# Step 2: Load Data
# =========================
# Path of the CSV file to analyse - replace with your own file.
# The path is relative, so it is resolved against the kernel's current
# working directory.
csv_path = "customer_uncoded_test.csv"
# Read the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(csv_path)

# =========================
# Step 3: Create Table Analysis Prompt
# =========================
def get_table_summary_prompt(df):
    """Build the instruction prompt asking for a table-level analysis.

    The prompt lists the table's row/column counts, the column labels, and
    one line per column with its dtype and number of unique values. It ends
    with the ``### Analysis:`` marker.

    Args:
        df: The pandas DataFrame to describe. Column labels may be of any
            type; they are converted to text with ``str``.

    Returns:
        The prompt as a single string.
    """
    num_rows = len(df)
    num_cols = len(df.columns)

    # Labels are not guaranteed to be strings (e.g. integer labels), and
    # str.join raises TypeError on non-string items, so convert explicitly.
    column_names = ", ".join(str(label) for label in df.columns)

    # Walk the columns positionally instead of through label-keyed dicts, so
    # every column gets its own line even if two columns share a label.
    characteristics = "\n".join(
        f"- {label}: {dtype} type, {unique} unique values"
        for label, dtype, unique in zip(
            df.columns, df.dtypes.astype(str), df.nunique()
        )
    )

    prompt = f"""### Instruction:
Analyze this table and provide a comprehensive understanding of its purpose and content. The table has:
- {num_rows} rows and {num_cols} columns
- Columns: {column_names}

Data characteristics:
{characteristics}

Based on the column names and data types, explain:
1. What kind of data this table contains (e.g., customer data, transaction data, etc.)
2. The business context and purpose of this table
3. How the columns relate to each other
4. Any notable patterns or insights about the data structure

### Analysis:"""
    return prompt

# =========================
# Step 4: Create Column Analysis Prompts
# =========================
def get_column_prompts(df):
    """Build one analysis prompt per column of ``df``.

    Each prompt states the column's label, dtype, number of unique values
    and number of nulls. For string-typed columns with at most 10 unique
    values, the value counts are appended as well. Every prompt ends with
    the ``### Analysis:`` marker.

    Args:
        df: The pandas DataFrame whose columns are described.

    Returns:
        A list of prompt strings, in the same order as ``df.columns``.
    """
    column_prompts = []
    for label in df.columns:
        series = df[label]
        n_unique = series.nunique()
        n_null = series.isnull().sum()

        # Only low-cardinality string columns get their value counts spelled out.
        show_distribution = (
            pd.api.types.is_string_dtype(series) and n_unique <= 10
        )
        distribution_note = (
            f"\nValue distribution: {series.value_counts().to_dict()}"
            if show_distribution
            else ""
        )

        column_prompts.append(f"""### Instruction:
Analyze the column "{label}" with the following characteristics:
- Data type: {series.dtype}
- Unique values: {n_unique}
- Null values: {n_null}{distribution_note}

Provide a detailed analysis of:
1. What this column represents in the context of the table
2. The significance and purpose of this column
3. The type of data it contains and any patterns
4. How it relates to other columns in the table

### Analysis:""")
    return column_prompts

# =========================
# Step 5: Generate Analysis
# =========================
generation_config = GenerationConfig(
    # Sampling is switched on, so repeated runs can produce different text.
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.1,
    # Upper bound on the number of newly generated tokens per call.
    max_new_tokens=500,
    # Pad with the end-of-sequence token id.
    pad_token_id=tokenizer.eos_token_id,
)

def generate_analysis(prompt):
    """Generate text for ``prompt`` with the module-level model.

    The prompt is tokenized (truncated to at most 1024 tokens), moved to
    ``device`` and passed to ``model.generate`` together with the shared
    ``generation_config``. Gradients are disabled for the call.

    Args:
        prompt: The prompt text to feed to the model.

    Returns:
        The first returned sequence, decoded to a string with special
        tokens skipped.
    """
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=1024,
    )
    encoded = encoded.to(device)
    prompt_ids = encoded['input_ids']
    prompt_mask = encoded['attention_mask']

    with torch.no_grad():
        generated = model.generate(
            input_ids=prompt_ids,
            attention_mask=prompt_mask,
            generation_config=generation_config,
        )

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# =========================
# Step 6: Print Results
# =========================
# --- Table-level analysis -------------------------------------------------
print("📊 Table Analysis:")
table_prompt = get_table_summary_prompt(df)
table_analysis = generate_analysis(table_prompt)
# Keep only the text after the last "### Analysis:" marker (or the whole
# text when the marker is absent).
print(table_analysis.rpartition("### Analysis:")[2].strip())
print(f"\n{'=' * 80}\n")

# --- Per-column analysis --------------------------------------------------
print("🔍 Column-wise Analysis:")
column_prompts = get_column_prompts(df)
for col, prompt in zip(df.columns, column_prompts):
    print(f"\nColumn: {col}")
    analysis = generate_analysis(prompt)
    explanation = analysis.rpartition("### Analysis:")[2].strip()
    print(explanation)
    print("-" * 40)