In [1]:
import json
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Configuration: input/output paths and the held-out fraction.
# NOTE: despite the .json extension, INPUT_FILE is read line by line, with each
# line parsed as its own JSON document (JSON Lines); the two output files are
# written in the same one-object-per-line layout.
INPUT_FILE = "Data.json"
TRAIN_FILE = "train.json"
TEST_FILE = "test.json"
TEST_SIZE = 0.2  # fraction of each rating group held out for testing (20%)

# Step 1: Read the JSON Lines input and group reviews by integer rating.
# Maps rating -> list of review dicts. A review without an "overall" field is
# bucketed under rating 0; fractional ratings are truncated by int().
data_by_rating = defaultdict(list)
skipped_lines = 0  # non-blank lines that could not be used

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            # Blank lines carry no record; skip them without counting as errors.
            continue
        try:
            review = json.loads(line)
            rating = int(review.get("overall", 0))
        except (ValueError, TypeError, AttributeError, OverflowError):
            # ValueError covers json.JSONDecodeError and non-numeric ratings;
            # TypeError covers a null/list rating; AttributeError covers a line
            # that is valid JSON but not an object; OverflowError covers an
            # infinite rating. One bad line must not abort the whole run.
            skipped_lines += 1
            continue
        data_by_rating[rating].append(review)

if skipped_lines:
    # Surface dropped records instead of discarding them silently.
    print(f"Skipped {skipped_lines} malformed line(s) in {INPUT_FILE}")

# Step 2: Stratified splitting -- each rating group is split on its own, so the
# train and test sets both receive a share of every rating.
train_data, test_data = [], []

for group in data_by_rating.values():
    if len(group) < 2:
        # A group this small cannot be divided; keep it entirely in training.
        train_data.extend(group)
        continue
    # Fixed seed keeps the split reproducible across runs.
    train_part, test_part = train_test_split(
        group, test_size=TEST_SIZE, random_state=42
    )
    train_data.extend(train_part)
    test_data.extend(test_part)

# Step 3: Save both splits as JSON Lines (one serialized review per line),
# training file first, then report how many records each split holds.
for out_path, records in ((TRAIN_FILE, train_data), (TEST_FILE, test_data)):
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in records)

print(f"✅ Train size: {len(train_data)} | Test size: {len(test_data)}")


✅ Train size: 8207 | Test size: 2054
