In [None]:
import os
import glob
import json
import pandas as pd

# 1) Input directories
ai_dir      = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Ai_Genuine_Reviews/CreaetAiReviews/DataSet"
genuine_dir = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Review_Score/Dataset"

# 2) Gather file paths
# glob returns matches in a filesystem-dependent order. Sorting pins the order
# in which files are read, so the keep-first de-duplication and the seeded
# sampling later in this script give the same result on every machine.
ai_files      = sorted(glob.glob(os.path.join(ai_dir,      "*.json*")))
genuine_files = sorted(glob.glob(os.path.join(genuine_dir, "*.json*")))

# 3) Loader for JSON / JSONL files (a nested "root" is unwrapped by the callers)
def robust_load(fp):
    """Parse the file at *fp* as one JSON document, falling back to JSON Lines.

    Args:
        fp: Path of a UTF-8 encoded text file; a leading byte-order mark is
            tolerated.

    Returns:
        The decoded JSON value when the whole file is a single JSON document;
        otherwise a list holding one decoded value per non-blank line.
        An empty or whitespace-only file yields ``[]``.

    Raises:
        json.JSONDecodeError: If the content is neither valid JSON nor valid
            JSON Lines.
    """
    # "utf-8-sig" drops a leading BOM, which json.loads would otherwise reject.
    with open(fp, "r", encoding="utf-8-sig") as f:
        txt = f.read().strip()
    if not txt:
        return []
    try:
        return json.loads(txt)
    except json.JSONDecodeError:
        # Split on "\n" only: str.splitlines() also breaks on characters such
        # as U+2028 that may appear unescaped inside a JSON string, which would
        # cut a record in half. Text mode has already normalised "\r\n".
        return [json.loads(line) for line in txt.split("\n") if line.strip()]

# 4) + 5) Load AI and genuine reviews through one shared routine
def _collect_reviews(paths, text_keys, source):
    """Read review texts from *paths* and tag each with *source*.

    Args:
        paths: Iterable of file paths handed to ``robust_load``.
        text_keys: Record keys to try, in priority order; the first key with
            a truthy value supplies the review text.
        source: Label stored in the ``"source"`` field of every row.

    Returns:
        A list of ``{"review": <stripped text>, "source": source}`` dicts, one
        per record whose text is a non-blank string.
    """
    rows = []
    for fp in paths:
        items = robust_load(fp)
        # A top-level {"root": ...} wrapper holds the real payload.
        if isinstance(items, dict) and "root" in items:
            items = items["root"]
        # A single record (or any other lone value) becomes a one-item list.
        if not isinstance(items, list):
            items = [items]
        for rec in items:
            # Skip entries that are not records instead of crashing on .get().
            if not isinstance(rec, dict):
                continue
            raw = next((rec[key] for key in text_keys if rec.get(key)), "")
            # Non-string payloads (numbers, lists, ...) cannot be stripped.
            if not isinstance(raw, str):
                continue
            text = raw.strip()
            if text:
                rows.append({"review": text, "source": source})
    return rows


# Explicit columns keep drop_duplicates(subset="review") from raising KeyError
# when no records were found at all.
_REVIEW_COLUMNS = ["review", "source"]

# AI files: prefer the "review" field, fall back to "text".
ai_records = _collect_reviews(ai_files, ("review", "text"), "ai")
df_ai = pd.DataFrame(ai_records, columns=_REVIEW_COLUMNS).drop_duplicates(subset="review")

# Genuine files: prefer the "text" field, fall back to "review".
gen_records = _collect_reviews(genuine_files, ("text", "review"), "genuine")
df_gen = pd.DataFrame(gen_records, columns=_REVIEW_COLUMNS).drop_duplicates(subset="review")

# 6) Keep only "complex" genuine reviews: long enough and lexically varied
def _distinct_fraction(tokens):
    """Return the share of distinct tokens in *tokens* (0.0 when empty)."""
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


MIN_WORDS = 50
MIN_RATIO = 0.5

# Whitespace tokenisation; the two derived columns feed the thresholds below.
df_gen["words"]             = df_gen["review"].str.split()
df_gen["word_count"]        = df_gen["words"].apply(len)
df_gen["unique_word_ratio"] = df_gen["words"].apply(_distinct_fraction)

# At least MIN_WORDS tokens, and strictly more than MIN_RATIO of them distinct.
_is_long   = df_gen["word_count"] >= MIN_WORDS
_is_varied = df_gen["unique_word_ratio"] > MIN_RATIO
df_gen_complex = df_gen[_is_long & _is_varied].copy()

# 7) Down-sample the complex genuine pool to the number of AI reviews
n_ai, available = len(df_ai), len(df_gen_complex)

# Sampling without replacement needs at least n_ai candidates.
if n_ai > available:
    raise ValueError(
        f"Not enough complex genuine reviews ({available}) to match {n_ai} AI reviews."
    )

# Fixed seed so the same subset is drawn on every run.
df_gen_matched = df_gen_complex.sample(n=n_ai, random_state=42)

# 8) Combine
df_final = pd.concat([df_ai, df_gen_matched], ignore_index=True)
# Optional across-source dedupe. The first occurrence is kept, so when a text
# appears in both sources the AI row (concatenated first) survives.
df_final = df_final.drop_duplicates(subset="review")

# 9) Save
out_csv = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Ai_Genuine_Reviews/CombineReviews/Dataset/ai_and_matched_complex_genuine.csv"
df_final.to_csv(out_csv, index=False)

# Report per-source counts from the frame that was actually written: the
# cross-source dedupe above can remove rows, so the two counts may differ.
_source_counts = df_final["source"].value_counts()
_n_ai_saved = int(_source_counts.get("ai", 0))
_n_genuine_saved = int(_source_counts.get("genuine", 0))
print(f"Saved {len(df_final)} total reviews ({_n_ai_saved} AI + {_n_genuine_saved} genuine) to {out_csv}")


Saved 99812 total reviews (49906 AI + 49906 genuine) to /kaggle/working/ai_and_matched_complex_genuine.csv
