In [79]:
import ast
import re
import shutil
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [80]:
file1 = "Employee_Report.xlsx"
file2 = None

# Backup original.
# The final cell writes the edited frame back to `file1`, so copying on every
# run would replace the backup with already-modified data as soon as the
# notebook is re-run. Only take the snapshot when none exists yet; delete the
# backup file manually to refresh it.
# Building the name from the file's basename keeps the backup next to the
# source even when `file1` includes a directory component.
source_path = Path(file1)
backup_path = source_path.with_name("backup_" + source_path.name)
if not backup_path.exists():
    shutil.copy(source_path, backup_path)

df1 = pd.read_excel(file1)

# Optional second workbook; stays None when no path is configured.
df2 = pd.read_excel(file2) if file2 else None

print("File Loaded ✅")
print("Columns:", df1.columns.tolist())


File Loaded ✅
Columns: ['Employee_ID', 'Employee_Name', 'Department', 'Salary', 'Profit']


In [81]:
# Clean column names.
# Headers are not guaranteed to be text (a sheet can have numeric headers),
# so only strip the ones that are strings instead of using the .str accessor,
# which raises on non-string labels.
df1.columns = [
    label.strip() if isinstance(label, str) else label
    for label in df1.columns
]

# Clean string values.
# Selecting both "object" and "string" covers text columns on pandas 2
# (object dtype) and pandas 3 (dedicated string dtype).
for col in df1.select_dtypes(include=["object", "string"]).columns:
    values = df1[col]
    # astype(str) would turn missing cells into the literal text "nan", which
    # .isna() no longer recognises as empty. Keep missing cells missing and
    # only replace the cells that actually hold a value.
    df1[col] = values.where(values.isna(), values.astype(str).str.strip())

# Convert int columns to float (prevents dtype errors)
for col in df1.select_dtypes(include=["int64"]).columns:
    df1[col] = df1[col].astype(float)

print("Data Cleaning Completed ✅")


Data Cleaning Completed ✅


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  for col in df1.select_dtypes(include=["object"]).columns:


In [82]:
model_name = "Qwen/Qwen2.5-3B-Instruct"

# Loader options for the causal LM: automatic device placement and
# half-precision (float16) weights.
load_options = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
}

# Tokenizer and model are both resolved from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

print("SLM Loaded ✅")


Loading weights:   0%|          | 0/434 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


SLM Loaded ✅


In [94]:
# Read the request and drop surrounding whitespace so it does not leak into
# the prompt built from it.
question = input("Enter your Excel operation: ").strip()

info_keywords = ["what is", "define", "explain", "meaning"]

# A request is informational when one of the keywords starts a word in it.
# Anchoring at a word boundary stops "define" from matching inside
# "undefined" (e.g. "delete rows where Status is undefined"), which plain
# substring matching would have routed to info mode. Suffixes are still
# accepted, so "explanation" or "defined" behave as before.
info_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(keyword) for keyword in info_keywords) + r")",
    re.IGNORECASE,
)

mode = "info" if info_pattern.search(question) else "excel"

print("Mode:", mode)


Enter your Excel operation:  Delete rows where Profit is empty.


Mode: excel


In [95]:
# Informational questions are sent to the model verbatim. Excel requests are
# wrapped in a rule sheet that pins the output to a single pandas statement
# operating on the already-loaded df1.
prompt = question

if mode == "excel":
    column_names = df1.columns.tolist()

    prompt = f"""
You are a pandas Excel automation engine.

STRICT RULES:
- Do NOT use import.
- Do NOT use read_excel.
- Do NOT redefine df1.
- df1 is already loaded.
- Only modify df1 directly.
- If updating use .loc.
- If deleting rows, you MUST use EXACTLY this format:
  df1.drop(index=df1[CONDITION].index, inplace=True)
- NEVER use .loc().drop()
- NEVER chain drop after loc
- Generate ONLY ONE line of executable Python code.
- No explanation.
- No markdown.

Columns:
{column_names}

User Request:
{question}

Answer:
"""


In [96]:
# Tokenize the prompt and move the tensors to wherever the model lives.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Greedy decoding (do_sample=False) so the same request yields the same code.
# No temperature is passed: it only applies when sampling is enabled, and
# supplying it together with do_sample=False just triggers a generation-config
# validation warning without changing the output.
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=False,
)

# generate() returns prompt + continuation; slice off the prompt tokens so
# only the newly generated text is decoded.
prompt_length = inputs["input_ids"].shape[1]
generated_tokens = outputs[0][prompt_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print("Raw Output:\n", response)


Raw Output:
 df1.drop(index=df1[df1['Profit'].isna()].index, inplace=True)


In [97]:
# Strip markdown code fences the model may emit despite the prompt rules:
# first the opening fence with its language tag, then any bare fence.
code = response.replace("```python", "").replace("```", "").strip()

# Remove duplicate lines, keeping the first occurrence of each in order.
seen = set()
lines = []
for line in code.split("\n"):
    if line in seen:
        continue
    seen.add(line)
    lines.append(line)
code = "\n".join(lines)

print("\nFinal Clean Executable Code:\n")
print(code)



Final Clean Executable Code:

df1.drop(index=df1[df1['Profit'].isna()].index, inplace=True)


In [98]:
unsafe_keywords = ["import", "__", "os", "sys", "subprocess", "read_excel"]

# Builtins made available to the generated snippet. Anything not listed here
# (open, eval, exec, getattr, ...) does not exist inside its namespace.
safe_builtins = {
    "abs": abs, "all": all, "any": any, "bool": bool, "dict": dict,
    "enumerate": enumerate, "float": float, "int": int, "len": len,
    "list": list, "max": max, "min": min, "print": print, "range": range,
    "round": round, "set": set, "sorted": sorted, "str": str, "sum": sum,
    "tuple": tuple, "zip": zip,
}


def check_generated_code(source, banned):
    """Return the rejection message for `source`, or None when it may run.

    The check works on the parsed syntax tree rather than on raw substrings,
    so identifiers and string constants that merely contain a banned word
    (a column called "Cost" or "Position" contains "os") are not rejected.

    Args:
        source: The cleaned code text produced by the model.
        banned: Names that must not appear as identifiers or attributes.

    Returns:
        A user-facing message describing why the code is rejected, or None.

    Raises:
        SyntaxError: If `source` is not valid Python.
    """
    if not source.strip():
        return "No executable code generated ❌"

    # Dunder access is the usual route out of a restricted namespace, so this
    # one stays a strict substring check, string constants included.
    if "__" in source:
        return "Unsafe code blocked ❌"

    for node in ast.walk(ast.parse(source)):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            return "Unsafe code blocked ❌"
        if isinstance(node, ast.Name) and node.id in banned:
            return "Unsafe code blocked ❌"
        if isinstance(node, ast.Attribute) and node.attr in banned:
            return "Unsafe code blocked ❌"

    if ".loc" in source and ".drop" in source:
        return "Invalid delete pattern blocked ❌"

    return None


if mode == "excel":

    try:
        rejection = check_generated_code(code, unsafe_keywords)

        if rejection is not None:
            print(rejection)

        else:
            # NOTE(security): this executes model-written code. The checks
            # above and the reduced namespace below narrow what it can reach,
            # but this is NOT a sandbox - review the printed code before
            # running this cell on data you care about.
            # The snippet runs against a copy, so an exception part-way
            # through leaves the in-memory df1 exactly as it was.
            namespace = {
                "__builtins__": safe_builtins,
                "df1": df1.copy(),
                "pd": pd,
            }
            exec(code, namespace)

            # Pick up the result whether the snippet mutated the frame in
            # place or rebound the name df1.
            df1 = namespace["df1"]

            df1.to_excel(file1, index=False)

            print("Excel Updated Successfully ✅")

    except Exception as e:
        # Covers syntax errors from parsing as well as runtime failures.
        print("Execution Error:", e)

else:
    print("\nInfo Response:\n")
    print(response)


Excel Updated Successfully ✅
