In [10]:
!curl -fsSL https://ollama.com/install.sh | sh


>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [14]:
get_ipython().system_raw('ollama serve > output.log 2>&1 &')

In [15]:
import time
time.sleep(5) # give ollama a moment to start up

I've started the `ollama` server in the background. Now you can try your `ollama` commands.

In [16]:
!ollama pull llama3.1

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2

In [17]:
!ollama run llama3.1 "Say hi in JSON: {\"msg\": \"hi\"}"

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h

In [18]:
import json
import pandas as pd
from tqdm import tqdm
import subprocess


In [40]:
def run_llm(prompt):
    try:
        result = subprocess.run(
            ["ollama", "run", "llama3.1"],
            input=prompt,
            text=True,
            capture_output=True
        )

        raw = result.stdout.strip()

        # remove markdown
        if raw.startswith("```"):
            raw = raw.split("```")[1].strip()

        try:
            return json.loads(raw), True
        except:
            return None, False
    except Exception as e:
        print("ERROR:", e)
        return None, False


In [41]:
PROMPT_V1 = """
Classify this Yelp review into a star rating 1–5.
Return ONLY valid JSON:

{{
  "predicted_stars": <1-5 integer>,
  "explanation": "<short reasoning>"
}}

Review:
"{review_text}"
"""


In [42]:
PROMPT_V2 = """
You are a sentiment classifier.

Requirements:
- Predict a rating from 1–5
- Provide a brief explanation
- Output VALID JSON only

JSON Schema:
{{
  "predicted_stars": integer,
  "explanation": string
}}

Review:
"{review_text}"
"""


In [43]:
PROMPT_V3 = """
You must output ONLY valid JSON.

Steps (do NOT output):
1. Analyze sentiment deeply.
2. Predict rating from 1–5.
3. Verify integer correctness.
4. Output ONLY JSON.

Final JSON:
{{
  "predicted_stars": <1-5 integer>,
  "explanation": "<brief reasoning>"
}}

Review:
"{review_text}"
"""


In [44]:
def evaluate_prompt(prompt_template, df, model="llama3.1"):
    results = []

    for i, row in tqdm(df.iterrows(), total=len(df)):
        prompt = prompt_template.format(review_text=row["text"])
        data, valid = run_ollama(prompt, model=model)
        pred = data["predicted_stars"] if valid else None

        results.append({
            "actual": row["stars"],
            "predicted": pred,
            "json_valid": valid
        })

    return pd.DataFrame(results)


In [45]:
df = pd.read_csv("/content/yelp.csv")[["text", "stars"]].dropna()
df_sample = df.sample(200).reset_index(drop=True)  # use 50 for speed


In [46]:
def evaluate_prompt(prompt_template, df, model="llama3.1"):
    results = []

    for i, row in tqdm(df.iterrows(), total=len(df)):
        prompt = prompt_template.format(review_text=row["text"])
        data, valid = run_llm(prompt) # Corrected from run_ollama
        pred = data["predicted_stars"] if valid else None

        results.append({
            "actual": row["stars"],
            "predicted": pred,
            "json_valid": valid
        })

    return pd.DataFrame(results)

df_v1 = evaluate_prompt(PROMPT_V1, df_sample, model="llama3.1")
df_v2 = evaluate_prompt(PROMPT_V2, df_sample, model="llama3.1")
df_v3 = evaluate_prompt(PROMPT_V3, df_sample, model="llama3.1")

100%|██████████| 200/200 [06:49<00:00,  2.05s/it]
100%|██████████| 200/200 [08:20<00:00,  2.50s/it]
100%|██████████| 200/200 [06:17<00:00,  1.89s/it]


In [47]:
valid = df_results[df_results["predicted"].notnull()]
accuracy = (valid["predicted"] == valid["actual"]).mean()
json_valid = df_results["json_valid"].mean()

accuracy, json_valid


(np.float64(0.6666666666666666), np.float64(0.96))

In [48]:
def compute_metrics(df):
    valid = df[df["predicted"].notnull()]
    accuracy = (valid["actual"] == valid["predicted"]).mean()
    json_valid = df["json_valid"].mean()
    return accuracy, json_valid

In [49]:
metrics = {
    "Prompt V1": compute_metrics(df_v1),
    "Prompt V2": compute_metrics(df_v2),
    "Prompt V3": compute_metrics(df_v3),
}

pd.DataFrame(metrics, index=["Accuracy", "JSON_Validity"]).T

Unnamed: 0,Accuracy,JSON_Validity
Prompt V1,0.621469,0.885
Prompt V2,0.587912,0.91
Prompt V3,0.603015,0.995


In [50]:
def reliability(df):
    return df["predicted"].std(), df["json_valid"].sum()

pd.DataFrame({
    "Prompt V1": reliability(df_v1),
    "Prompt V2": reliability(df_v2),
    "Prompt V3": reliability(df_v3),
}, index=["STD", "Valid_JSON_Count"]).T

Unnamed: 0,STD,Valid_JSON_Count
Prompt V1,1.334569,177.0
Prompt V2,1.263588,182.0
Prompt V3,1.250978,199.0
