In [1]:
import re
import gzip
import json
import random
import pandas as pd

In [6]:
file_path = '/Users/karangautam/Desktop/ppro/amazon/Electronics.json'
output_path = '/Users/karangautam/Desktop/ppro/amazon/Electronics_extracted2.csv'
samples_per_class = 50000

In [3]:
keep = ['reviewText', 'summary', 'overall']

In [4]:
def get_sentiment_label(rating: float) -> str:
    if rating <= 2.0:
        return "Negative"
    elif rating == 3.0:
        return "Neutral"
    else:
        return "Positive"

In [5]:
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'<.*?>', ' ', text)              # HTML tag removal
    text = re.sub(r'http\S+|www\S+', ' ', text)     # link removal
    text = re.sub(r'\s+', ' ', text)                # extra whitespace removal

    return text.strip()

Stratified Sampling

In [7]:
def extract_sentiment_dataset(
    input_path: str,
    output_path: str,
    total_samples: int = 300_000,
    seed: int = 42
):
    rng = random.Random(seed)

    # Target distribution
    target_ratio = {
        "Positive": 0.75,
        "Neutral":  0.10,
        "Negative": 0.15
    }

    target_per_class = {
        label: int(total_samples * ratio)
        for label, ratio in target_ratio.items()
    }

    reservoirs = {k: [] for k in target_per_class}
    seen_counts = {k: 0 for k in target_per_class}

    open_fn = gzip.open if input_path.endswith(".gz") else open

    with open_fn(input_path, "rt", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):

            if i % 1_000_000 == 0:
                print(f"Scanned {i:,} reviews...")

            try:
                review = json.loads(line)
            except json.JSONDecodeError:
                continue

            rating = review.get("overall")
            if rating is None:
                continue

            sentiment = get_sentiment_label(float(rating))
            if sentiment not in target_per_class:
                continue

            raw_text = f"{review.get('summary', '')} {review.get('reviewText', '')}"
            text = clean_text(raw_text)

            if len(text) < 5:
                continue

            seen_counts[sentiment] += 1

            record = {
                "review_text": text,
                "sentiment": sentiment,
                "rating": float(rating)
            }

            # Reservoir sampling per class
            if len(reservoirs[sentiment]) < target_per_class[sentiment]:
                reservoirs[sentiment].append(record)
            else:
                j = rng.randint(0, seen_counts[sentiment] - 1)
                if j < target_per_class[sentiment]:
                    reservoirs[sentiment][j] = record

            # if all targets met
            if all(len(reservoirs[k]) >= target_per_class[k] for k in target_per_class):
                print(f"All targets met at {i:,} reviews")
                break

    
    rows = []
    for records in reservoirs.values():
        rows.extend(records)

    df = pd.DataFrame(rows)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df.to_csv(output_path, index=False)

    print('size : ',df.shape)
    print("Class distribution:")
    print(df["sentiment"].value_counts(normalize=True) * 100)


In [8]:
extract_sentiment_dataset(
    input_path=file_path,
    output_path=output_path,
    total_samples=350_000,
    seed=42
)

All targets met at 491,616 reviews
size :  (350000, 3)
Class distribution:
sentiment
Positive    75.0
Negative    15.0
Neutral     10.0
Name: proportion, dtype: float64


Seed + reservoir
* Chooses data fairly
* Works on very big files
* Gives every review a chance
* Gives the same result every time