<a href="https://colab.research.google.com/github/inbalhasar/NLP_project/blob/main/gemini_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Create the Drive folder that holds the run artifacts; exist_ok=True makes re-running this cell harmless.
os.makedirs('/content/drive/MyDrive/gemini_runs', exist_ok=True)

In [None]:
import os
from getpass import getpass

# Never hard-code the API key in the notebook: anything committed to the repository
# is readable by everyone who can see it. Prompt for the key (input is not echoed)
# unless the environment already provides one.
# NOTE(review): a key that was committed in an earlier revision of this cell should be
# treated as compromised and revoked.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Enter your Gemini API key: ")

In [None]:
# ==== AI/Human full-dataset runner (progress and request log are saved to Google Drive) ====
import os
import json
import time
import re
from datetime import datetime
import pandas as pd
from google import genai

# --------- EDIT THESE IF NEEDED ----------
# Path of the dataset CSV inside the Colab runtime. Files uploaded through the Files panel
# or files.upload() land in /content.
PATH = "/content/ai_human_content_detection_dataset.csv"

# Column names in your CSV
TEXT_COL  = "text_content"   # text column
LABEL_COL = "label"          # 1 = AI, 0 = Human

# Model to use
MODEL_NAME = "gemini-2.0-flash"   # or "gemini-2.5-flash"

# Resume state, stored under the Drive mount point (/content/drive):
# the per-row predictions and the per-day request counter. main() reloads both on start-up.
PROGRESS_CSV_PATH = "/content/drive/MyDrive/gemini_runs/predictions_progress.csv"
REQUEST_LOG_PATH  = "/content/drive/MyDrive/gemini_runs/gemini_request_log_8.json"


# Free-tier daily cap (adjust if Google changes limits)
DAILY_LIMIT = 200
# -----------------------------------------

# If you prefer to pass the key explicitly, uncomment the next line:
# client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

def load_request_log(path=REQUEST_LOG_PATH):
    """Load the daily request counter, resetting it when the UTC date has changed.

    Args:
        path: Location of the JSON file holding ``{"date": "YYYY-MM-DD", "count": int}``.

    Returns:
        A dict with the keys ``"date"`` (today's UTC date) and ``"count"``. A missing,
        unreadable, malformed, or stale file yields a fresh counter with ``count`` 0.
    """
    # The file-level import only brings in `datetime`; `timezone` is needed for an
    # aware "now" (datetime.utcnow() is deprecated).
    from datetime import timezone

    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    data = {}
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError:
            # a corrupt log is treated like a missing one.
            data = {}

    # Valid JSON that is not an object (e.g. a list) cannot be a counter either.
    if not isinstance(data, dict) or data.get("date") != today:
        data = {"date": today, "count": 0}
    return data

def save_request_log(data, path=REQUEST_LOG_PATH):
    """Write the request-counter mapping to *path* as indented UTF-8 JSON."""
    with open(path, mode="w", encoding="utf-8") as handle:
        serialized = json.dumps(data, ensure_ascii=False, indent=2)
        handle.write(serialized)

def load_progress(path=PROGRESS_CSV_PATH):
    """Return previously saved predictions, or an empty frame with the expected columns."""
    if not os.path.exists(path):
        # Nothing saved yet: start from an empty table with the progress schema.
        return pd.DataFrame(
            columns=[
                "row_id",
                "pred_label",
                "raw_response",
                "explanation",
                "model_name",
                "timestamp_utc",
            ]
        )
    return pd.read_csv(path)

def save_progress(df_prog, path=PROGRESS_CSV_PATH):
    """Write the predictions frame to *path* as CSV, without the index column."""
    df_prog.to_csv(path_or_buf=path, index=False)

def _first_label_mention(text: str):
    """Return 'AI' or 'Human' for whichever whole word occurs first in *text*, else None."""
    match = re.search(r"\b(ai|human)\b", text, re.IGNORECASE)
    if match is None:
        return None
    return "AI" if match.group(1).lower() == "ai" else "Human"


def parse_decision(resp_text: str):
    """
    Extract 'AI' or 'Human' from the model response.

    The label is expected on the first non-empty line. If that line names neither
    label, the function looks for a tagged answer ("label: ...", "prediction - ...",
    "answer: ...") anywhere in the response, and finally for any whole-word mention.
    Whenever both labels appear in the text being searched, the one mentioned first
    wins, so "Human, not AI" yields 'Human'.

    Returns:
        'AI', 'Human', or None when the response is empty or names neither label.
    """
    if not resp_text:
        return None

    lines = [ln.strip() for ln in resp_text.strip().splitlines() if ln.strip()]
    if lines:
        label = _first_label_mention(lines[0])
        if label is not None:
            return label

    # Fallbacks: an explicitly tagged answer beats a bare mention.
    lower = resp_text.lower()
    tagged = re.search(r"(?:label|prediction|answer)\s*[:\-]\s*(ai|human)\b", lower)
    if tagged is not None:
        return "AI" if tagged.group(1) == "ai" else "Human"
    return _first_label_mention(lower)

def decision_to_int(decision: str):
    """Convert a decision string to its numeric label: 'AI' -> 1, 'Human' -> 0, otherwise None."""
    if decision is None:
        return None
    # Matching ignores surrounding whitespace and letter case.
    label_codes = {"ai": 1, "human": 0}
    normalized = decision.strip().lower()
    return label_codes.get(normalized)

def call_gemini_with_backoff(client, prompt, model=MODEL_NAME, max_retries=5):
    """Call ``client.models.generate_content`` and retry rate-limit/quota errors.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``.
        prompt: Prompt passed as ``contents``.
        model: Model name passed to the API.
        max_retries: Maximum number of attempts. Values below 1 are treated as 1,
            so the API is always called at least once.

    Returns:
        Whatever ``generate_content`` returns.

    Raises:
        Exception: The original error, immediately when it does not look like a
            rate-limit/quota error, or after the last attempt when it does.
    """
    attempts = max(1, int(max_retries))
    delay = 2.0  # seconds; doubled after every rate-limited attempt, capped at 60
    for attempt in range(attempts):
        try:
            return client.models.generate_content(model=model, contents=prompt)
        except Exception as e:
            message = str(e).lower()
            # "rate" must stand alone: a plain substring test would also match
            # unrelated words such as "generate" and retry errors that cannot succeed.
            looks_rate_limited = (
                re.search(r"(?<![a-z])rate(?![a-z])", message) is not None
                or any(
                    marker in message
                    for marker in (
                        "quota",
                        "429",
                        "too many requests",
                        "exceed",
                        "resource_exhausted",
                    )
                )
            )
            if not looks_rate_limited or attempt == attempts - 1:
                raise
            time.sleep(delay)
            delay = min(delay * 2, 60)

def _report_accuracy(tag, df, progress, row_ids=None):
    """Print evaluated count, correct count and accuracy for rows with a parsed prediction.

    Args:
        tag: Text printed before the evaluated-row count (e.g. "[Overall] Evaluated").
        df: Dataset frame containing ``row_id`` and the ``LABEL_COL`` column.
        progress: Saved predictions containing ``row_id`` and ``pred_label``.
        row_ids: Optional collection of row ids; when given, only these rows are reported.

    Prints nothing when no row qualifies.
    """
    if progress.empty:
        return
    merged = df.merge(progress[["row_id", "pred_label"]], on="row_id", how="left")
    # Rows whose response could not be parsed have no pred_label and are left out.
    evaluated = merged.dropna(subset=["pred_label"])
    if row_ids is not None:
        evaluated = evaluated[evaluated["row_id"].isin(row_ids)]
    if evaluated.empty:
        return
    correct = evaluated["pred_label"].astype(int) == evaluated[LABEL_COL].astype(int)
    n_correct = int(correct.sum())
    print(f"{tag}: {len(evaluated)} | Correct: {n_correct} "
          f"| Accuracy: {n_correct / len(evaluated):.3f}")


def main():
    """Classify pending dataset rows with Gemini, persist the results, and report accuracy.

    The run resumes from the saved progress CSV and never sends more requests than the
    remaining daily quota. Rows whose API call fails are NOT recorded, so the next run
    retries them; rows that received a response are recorded even when no label could
    be parsed from it. The loop stops early after several consecutive failed requests.

    Raises:
        ValueError: If the text or label column is missing from the dataset CSV.
    """
    # The file-level import only brings in `datetime`; `timezone` replaces the
    # deprecated datetime.utcnow().
    from datetime import timezone

    # 1) Load dataset
    df = pd.read_csv(PATH)
    if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
        raise ValueError(f"Columns '{TEXT_COL}' and/or '{LABEL_COL}' not found in CSV.")
    # row_id keeps the original CSV row index so progress stays aligned across runs.
    df = df[[TEXT_COL, LABEL_COL]].dropna().reset_index().rename(columns={"index": "row_id"})

    # 2) Load progress and daily counter
    progress = load_progress()
    req_log = load_request_log()

    done_ids = set(progress["row_id"].astype(int)) if not progress.empty else set()
    pending = df[~df["row_id"].isin(done_ids)].copy()

    remaining = max(0, DAILY_LIMIT - int(req_log.get("count", 0)))
    if remaining == 0:
        print(f"Daily quota reached ({DAILY_LIMIT}). Try again tomorrow.")
        _report_accuracy("Total evaluated so far", df, progress)
        return

    if pending.empty:
        print("No pending rows. Everything is already evaluated.")
        _report_accuracy("Total evaluated", df, progress)
        return

    to_run_now = min(remaining, len(pending))
    work_batch = pending.head(to_run_now).copy()
    print(f"Pending rows: {len(pending)} | Running now: {to_run_now} | Remaining daily quota before run: {remaining}")

    client = genai.Client()

    new_rows = []            # rows answered since the last flush to disk
    processed_ids = set()    # rows recorded during this run
    consecutive_failures = 0
    max_consecutive_failures = 3  # stop instead of burning quota while the API is failing

    for _, row in work_batch.iterrows():
        row_id = int(row["row_id"])
        text = str(row[TEXT_COL])

        prompt = (
            "Decide if the following text was likely written by an AI or a human.\n"
            "FIRST line: output ONLY one word — 'AI' or 'Human'.\n"
            "SECOND line: a short explanation.\n\n"
            f"{text}"
        )

        # Every attempted request counts against the daily quota, successful or not.
        req_log["count"] = int(req_log.get("count", 0)) + 1

        try:
            resp = call_gemini_with_backoff(client, prompt, model=MODEL_NAME)
            resp_text = getattr(resp, "text", None)
        except Exception as exc:
            # Do not record the row: it stays pending and is retried on the next run.
            consecutive_failures += 1
            print(f"Row {row_id}: request failed ({exc!r}); it will be retried on the next run.")
            if consecutive_failures >= max_consecutive_failures:
                print(f"{consecutive_failures} consecutive requests failed. Stopping.")
                break
        else:
            consecutive_failures = 0

            decision = parse_decision(resp_text or "")
            pred_int = decision_to_int(decision)

            # Extract a brief explanation (2nd line if present), else the whole text
            explanation = None
            if resp_text:
                lines = resp_text.strip().splitlines()
                explanation = lines[1].strip() if len(lines) >= 2 else resp_text.strip()

            new_rows.append({
                "row_id": row_id,
                "pred_label": pred_int,
                "raw_response": resp_text,
                "explanation": explanation,
                "model_name": MODEL_NAME,
                # Naive UTC timestamp, same textual format as earlier runs.
                "timestamp_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(timespec="seconds"),
            })
            processed_ids.add(row_id)

            # Periodically flush to disk so an interrupted run loses little work.
            if len(new_rows) >= 10:
                progress = pd.concat([progress, pd.DataFrame(new_rows)], ignore_index=True)
                save_progress(progress, PROGRESS_CSV_PATH)
                save_request_log(req_log, REQUEST_LOG_PATH)
                new_rows = []

        if req_log["count"] >= DAILY_LIMIT:
            print(f"Reached daily limit ({DAILY_LIMIT}). Stopping.")
            break

    if new_rows:
        progress = pd.concat([progress, pd.DataFrame(new_rows)], ignore_index=True)
    save_progress(progress, PROGRESS_CSV_PATH)
    save_request_log(req_log, REQUEST_LOG_PATH)

    # Accuracy reports
    _report_accuracy("[Overall] Evaluated", df, progress)
    if processed_ids:
        _report_accuracy("[This run] Evaluated", df, progress, row_ids=processed_ids)


if __name__ == "__main__":
    main()


  today = datetime.utcnow().strftime("%Y-%m-%d")


Pending rows: 167 | Running now: 167 | Remaining daily quota before run: 200


  "timestamp_utc": datetime.utcnow().isoformat(timespec="seconds")


[Overall] Evaluated: 1303 | Correct: 655 | Accuracy: 0.503
[This run] Evaluated: 158 | Correct: 2 | Accuracy: 0.013
