In [55]:
import os, types, json, time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from botocore.client import Config
import ibm_boto3
from tqdm.auto import tqdm
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models import Model

# --- Credentials & service endpoints ---------------------------------------
# The API key is read from the environment ONLY. A secret must never be given
# as a literal fallback: anything stored in a notebook is readable by everyone
# who can see the file or its history.
# NOTE(review): any key that was ever committed in this notebook should be
# treated as compromised and rotated in IBM Cloud IAM.
API_KEY    = os.getenv("IBM_CLOUD_APIKEY")
PROJECT_ID = os.getenv("PROJECT_ID", "a100f552-8d74-4455-a7cb-bee0dba6ffd2")
WML_URL    = os.getenv("WML_URL",   "https://us-south.ml.cloud.ibm.com")

if not API_KEY:
    # Warn right away so the cause is obvious when the COS / model cells fail later.
    print("⚠️  IBM_CLOUD_APIKEY is not set – export it before running the COS and model cells.")

# --- Cloud Object Storage location of the labelled dataset ------------------
COS_ENDPOINT   = "https://s3.us-south.cloud-object-storage.appdomain.cloud"
COS_BUCKET     = "sentiment-analysislegal"
COS_OBJECT_KEY = "legal sentiment analyzer.csv"

# --- Task configuration -----------------------------------------------------
# Integer codes found in the dataset's "Sentiment" column -> label text used in prompts.
LABEL_MAP      = {-1: "negative", 0: "neutral", 1: "positive"}
BATCH_SIZE     = 1         # ⚠ For Lite plan: 1 request per second
MAX_RETRIES    = 5         # attempts per sentence before a prediction is recorded as "error"

In [56]:
# %% --------------- 3. CONNECT TO OBJECT STORAGE ---------
# S3-style client for IBM Cloud Object Storage. It authenticates with the IAM
# API key (OAuth signature) and talks to the endpoint given by COS_ENDPOINT.
cos = ibm_boto3.client(
    service_name="s3",
    endpoint_url=COS_ENDPOINT,
    config=Config(signature_version="oauth"),
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    ibm_api_key_id=API_KEY,
)

In [57]:
print("⏳ Loading dataset from COS …")
# The object's "Body" is a file-like stream that pandas can consume directly,
# so no attribute shim is needed on it.
body = cos.get_object(Bucket=COS_BUCKET, Key=COS_OBJECT_KEY)["Body"]

# Drop rows missing either the text or its label, then store labels as ints so
# they can be used as LABEL_MAP keys.
df = (
    pd.read_csv(body)
    .dropna(subset=["Phrase", "Sentiment"])
    .astype({"Sentiment": int})
)

# Fail here with a readable message instead of a bare KeyError later, when the
# codes are translated through LABEL_MAP.
unexpected_codes = sorted(set(df["Sentiment"]) - set(LABEL_MAP))
if unexpected_codes:
    raise ValueError(
        f"Unexpected sentiment codes in dataset: {unexpected_codes}; "
        f"expected only {sorted(LABEL_MAP)}"
    )
print(f"✅ Loaded {len(df)} records.")

⏳ Loading dataset from COS …
✅ Loaded 500 records.


In [58]:
# Hold out 25 % of the rows for evaluation. The split is stratified on the
# label column and uses a fixed seed so it is repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.25,
    stratify=df["Sentiment"],
    random_state=42,
)

In [59]:
def build_few_shot_prompt(df_samples, n_examples: int = 6, label_map=None, random_state: int = 42):
    """Build the few-shot instruction prefix used for sentiment classification.

    Parameters
    ----------
    df_samples : pandas.DataFrame
        Labelled rows with a "Phrase" column (text) and a "Sentiment" column
        whose values are keys of ``label_map``.
    n_examples : int, default 6
        Number of examples to include. Clamped to ``len(df_samples)`` so a
        small frame no longer makes ``DataFrame.sample`` raise.
    label_map : mapping, optional
        Sentiment code -> label text. Defaults to the notebook-level ``LABEL_MAP``.
    random_state : int, default 42
        Seed for the example sampling, so the prompt is reproducible.

    Returns
    -------
    str
        Instruction text followed by the examples, each formatted as
        ``sentence: ...`` / ``sentiment: ...``, separated by ``--`` lines and
        terminated by a ``----`` line.

    Raises
    ------
    KeyError
        If a sampled row carries a sentiment code missing from ``label_map``.
    """
    if label_map is None:
        label_map = LABEL_MAP

    # Never ask pandas for more rows than exist (or for a negative count).
    n_shots = max(0, min(n_examples, len(df_samples)))
    sampled = df_samples.sample(n_shots, random_state=random_state)

    shots = [
        f"sentence: {phrase}\nsentiment: {label_map[code]}"
        for phrase, code in zip(sampled["Phrase"], sampled["Sentiment"])
    ]
    return (
        "Classify the sentiment of each legal sentence. Possible labels: positive, negative, neutral. "
        "Return ONLY the label. Examples:\n" + "\n--\n".join(shots) + "\n----\n"
    )

# Few-shot prefix built from the training split only; it is reused unchanged
# for every test sentence.
prompt_base = build_few_shot_prompt(df_samples=train_df)

In [60]:
MODEL_ID = "google/flan-t5-xxl"

# Greedy decoding with a very small token budget: each response is expected to
# be a single label word.
params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.TEMPERATURE: 0.0,
    GenParams.MAX_NEW_TOKENS: 3,
    GenParams.MIN_NEW_TOKENS: 1,
}

# Handle to the hosted foundation model, bound to the project and credentials
# defined in the configuration cell.
model = Model(
    model_id=MODEL_ID,
    project_id=PROJECT_ID,
    credentials={"url": WML_URL, "apikey": API_KEY},
    params=params,
)
print("🤖 Model ready:", MODEL_ID)

🤖 Model ready: google/flan-t5-xxl




In [None]:
def _parse_label(generated_text):
    """Reduce raw model output to a single lower-case label token.

    Returns the first whitespace-delimited token with surrounding punctuation
    removed, or "error" when the output holds no usable token.
    """
    tokens = str(generated_text or "").strip().lower().split()
    if not tokens:
        return "error"
    return tokens[0].strip(".,;:!?\"'") or "error"


def classify(sentences, prompt, *, generate=None, max_retries=None,
             request_interval=1.1, show_progress=True):
    """Classify each sentence with the foundation model, one request per sentence.

    Parameters
    ----------
    sentences : iterable of str
        Texts to classify.
    prompt : str
        Few-shot prefix. Each sentence is appended using the same
        ``sentence: ... / sentiment:`` layout as the examples, so the model
        only has to complete the label.
    generate : callable, optional
        ``generate(prompt_text) -> {"results": [{"generated_text": ...}]}``.
        Defaults to ``model.generate`` from the notebook.
    max_retries : int, optional
        Attempts per sentence; defaults to the notebook-level ``MAX_RETRIES``.
    request_interval : float, default 1.1
        Seconds to pause after each sentence (Lite plan: 1 request/sec).
    show_progress : bool, default True
        Wrap the iteration in a tqdm progress bar.

    Returns
    -------
    list of str
        One predicted label per input sentence. "error" is recorded when every
        attempt failed or the model returned no usable text.
    """
    if generate is None:
        generate = model.generate
    if max_retries is None:
        max_retries = MAX_RETRIES

    if show_progress:
        items = tqdm(sentences, desc="Classifying", unit="sample")
    else:
        items = sentences

    preds = []
    for sentence in items:
        # Mirror the few-shot example layout and end on the label cue.
        full_prompt = f"{prompt}sentence: {sentence}\nsentiment:"
        label = "error"
        for attempt in range(1, max_retries + 1):
            try:
                generated = generate(full_prompt)["results"][0]["generated_text"]
            except Exception as e:
                if attempt == max_retries:
                    # Out of attempts: give up now instead of sleeping once more.
                    print(f"❌ Giving up after {attempt} attempts: {str(e)[:100]}")
                    break
                wait_time = 2 ** attempt  # exponential back-off
                print(f"⚠️  Retry {attempt} in {wait_time}s due to error: {str(e)[:100]}")
                time.sleep(wait_time)
            else:
                # Parsing happens outside the try block, so an empty generation
                # yields "error" directly rather than triggering useless retries.
                label = _parse_label(generated)
                break
        preds.append(label)
        time.sleep(request_interval)  # Lite plan rate limit: 1 request/sec
    return preds

print("🚀 Running inference on", len(test_df), "samples …")

# Ground-truth label text for the held-out rows, in the same order as the
# predictions produced below.
true_labels = [LABEL_MAP[code] for code in test_df["Sentiment"].tolist()]

# One model call per held-out phrase, each prefixed with the few-shot prompt.
pred_labels = classify(
    test_df["Phrase"].tolist(),
    prompt_base,
)

🚀 Running inference on 125 samples …


Classifying:   0%|          | 0/125 [00:00<?, ?sample/s]

In [None]:

# Per-class precision / recall / F1 plus overall accuracy on the held-out split.
print("\n📈 Evaluation metrics:")
print(
    classification_report(
        y_true=true_labels,
        y_pred=pred_labels,
        digits=4,
    )
)
print("Accuracy:", accuracy_score(y_true=true_labels, y_pred=pred_labels))


In [None]:
# %% --------------- SAVE PREDICTIONS ----------------
print("💾 Uploading predictions …")
output_key = "predictions/legal_sentiment_predictions.csv"

# New frame = held-out rows plus a "Predicted" column; test_df itself is left untouched.
export_df = test_df.assign(Predicted=pred_labels)

# Serialise to CSV in memory and upload the bytes to the same bucket the data came from.
csv_bytes = export_df.to_csv(index=False).encode("utf-8")
cos.put_object(Bucket=COS_BUCKET, Key=output_key, Body=csv_bytes)
print(f"✅ File written: s3://{COS_BUCKET}/{output_key}")


In [None]:
# %% ---------------  QUICK SUMMARY -------------------
# Ask the model for a one-paragraph summary of the first 20 predictions.
summary_prompt = (
    "Summarize the sentiment distribution and trends in one paragraph:\n" +
    json.dumps(export_df[["Phrase", "Predicted"]].head(20).to_dict("records"))
)

# The model's default params cap generation at 3 new tokens (enough for a
# label, far too few for a paragraph), so pass a dedicated budget for this call.
summary_params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 200,
    GenParams.MIN_NEW_TOKENS: 1,
}
summary = model.generate(summary_prompt, params=summary_params)["results"][0]["generated_text"].strip()
print("\n📝 Summary:\n", summary)

In [None]:
# End-of-run marker: printed once every cell above has completed.
print("🏁 Finished.")
