In [1]:
import os
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# List of all your files
review_files = [
    "negative books.review",
    "positive books.review",
    "negative dvd.review",
    "positive dvd.review",
    "negative elec.review",
    "positive elec.review",
    "negative kt.review",
    "positive kt.review",
]

def infer_label_from_filename(filename: str) -> int:
    """
    positive = 1
    negative = 0
    """
    fname = filename.lower()
    if "positive" in fname:
        return 1
    elif "negative" in fname:
        return 0
    else:
        raise ValueError(f"Cannot infer label from filename: {filename}")

def infer_domain_from_filename(filename: str) -> str:
    """
    books, dvd, elec, kt (normalised domain)
    """
    fname = filename.lower()
    # split "negative books.review" to just "negative", "books.review"
    parts = fname.split()
    if len(parts) < 2:
        return "unknown"
    dom_part = parts[1]          # Splitting "books.review"
    dom_raw = dom_part.split(".")[0]  # to  "books"
    
    if dom_raw == "books":
        return "books"
    elif dom_raw == "dvd":
        return "dvd"
    elif dom_raw == "elec":
        return "electronics"
    elif dom_raw == "kt":
        return "kitchen"
    else:
        return dom_raw
#Files into dataframe

def extract_tag(block: str, tag: str):
    """
    Extracts the content inside from a review block.
    Returns None if tag is missing.
    """
    pattern = rf"<{tag}>\s*(.*?)\s*</{tag}>" #Applying Regex for content inside "tag"
    match = re.search(pattern, block, flags=re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else None

def parse_review_file(path: str, filename_for_meta: str):
    """
    Parse ONE .review file and return list of dicts:
    text, label, domain, rating
    """
    label = infer_label_from_filename(filename_for_meta)
    domain = infer_domain_from_filename(filename_for_meta)

    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    
    chunks = raw.split("<review>")
    reviews = []
    
    for chunk in chunks:
        if "</review>" not in chunk:
            continue
        block = chunk.split("</review>")[0]
        
        review_text = extract_tag(block, "review_text")
        title = extract_tag(block, "title")
        rating = extract_tag(block, "rating")
        
        # skip if missing main text
        if not review_text or review_text.strip() == "":
            continue
        
        # optional: prepend title
        if title:
            full_text = f"{title}. {review_text}"
        else:
            full_text = review_text
        
        # try numeric rating
        try:
            rating_val = float(rating) if rating is not None else None
        except:
            rating_val = None
        
        reviews.append({
            "text": full_text.strip(),
            "label": label,
            "domain": domain,
            "rating": rating_val,
        })
    return reviews

all_reviews = []

for fname in review_files:
    path = fname  
    print(f"Parsing: {path}")
    parsed = parse_review_file(path, fname)
    print(f"  = {len(parsed)} reviews")
    all_reviews.extend(parsed)

df = pd.DataFrame(all_reviews)
print("\nData sample:")
df.head()
print("\nCounts by domain and label:")
print(df.groupby(["domain", "label"]).size())
print("\nTotal samples:", len(df))

# Data cleaning 
def clean_text(text: str) -> str:
    text = str(text).lower()
    # remove HTML as it is done with tags 
    text = re.sub(r"<.*?>", " ", text)
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    # remove digits
    text = re.sub(r"\d+", " ", text)
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_text"] = df["text"].apply(clean_text)
df["token_count"] = df["clean_text"].apply(lambda x: len(x.split()))

#Checking if the number of reviews changes after removing short reviews 
print("Before removing short reviews:", len(df))
# remove very short reviews (< 3 tokens)
df = df[df["token_count"] >= 3].copy()
df.reset_index(drop=True, inplace=True)
print("After removing short reviews:", len(df))

df[["clean_text", "label", "domain"]].head()

#Splitting 

max_words = 20000   # vocab size
max_len = 200       # max sequence length
train_ratio = 0.60
val_ratio = 0.20
test_ratio = 0.20
random_state = 42

# train, validation and test splitting 
texts = df["clean_text"].values
labels = df["label"].values

# 20% test
X_temp, X_test, y_temp, y_test = train_test_split(
    texts,
    labels,
    test_size=test_ratio,
    random_state=random_state,
    stratify=labels
)

# Validation
val_rel_size = val_ratio / (1.0 - test_ratio)  # 0.20 / 0.80 = 0.25 (We are just using 20% for validation)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=val_rel_size,
    random_state=random_state,
    stratify=y_temp
)

print("\nFinal split sizes:")
print("Train:", len(X_train))
print("Validation:", len(X_val))
print("Test:", len(X_test))

Parsing: negative books.review
  = 1000 reviews
Parsing: positive books.review
  = 1000 reviews
Parsing: negative dvd.review
  = 1000 reviews
Parsing: positive dvd.review
  = 1000 reviews
Parsing: negative elec.review
  = 1000 reviews
Parsing: positive elec.review
  = 1000 reviews
Parsing: negative kt.review
  = 1000 reviews
Parsing: positive kt.review
  = 1000 reviews

Data sample:

Counts by domain and label:
domain       label
books        0        1000
             1        1000
dvd          0        1000
             1        1000
electronics  0        1000
             1        1000
kitchen      0        1000
             1        1000
dtype: int64

Total samples: 8000
Before removing short reviews: 8000
After removing short reviews: 8000

Final split sizes:
Train: 4800
Validation: 1600
Test: 1600
