## Combine Datasets

In [3]:
import json
import random
from pathlib import Path
from tqdm import tqdm


# Directory that holds the raw per-category review files.
base_dir = Path(r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data")

# Input files, kept as plain strings: each one is base_dir followed by a
# backslash and the file name.
file_paths = [
    f"{base_dir}\\{file_name}"
    for file_name in (
        "Cell_Phones_and_Accessories.json",
        "Clothing_Shoes_and_Jewelry.json",
        "Electronics.json",
        "Home_and_Kitchen.json",
        "part.json",
        "separate.json",
        "Sports_and_Outdoors.json",
        "Toys_and_Games.json",
    )
]

def _iter_in_random_order(items, rng):
    """Yield each element of ``items`` exactly once in random order.

    Performs a lazy, in-place Fisher-Yates shuffle: only as many swaps are
    done as elements are consumed, so taking a few items from a huge list
    stays cheap. ``items`` is reordered as a side effect.
    """
    for remaining in range(len(items), 0, -1):
        pick = rng.randrange(remaining)
        items[pick], items[remaining - 1] = items[remaining - 1], items[pick]
        yield items[remaining - 1]


def _class_label(record):
    """Return the record's 'class' as an integer string, or 'missing'.

    'missing' covers an absent key as well as values ``int()`` cannot
    convert (None, NaN, Infinity, non-numeric strings).
    """
    try:
        return str(int(record['class']))
    except (KeyError, TypeError, ValueError, OverflowError):
        return 'missing'


def extract_and_combine_datasets(file_paths, samples_per_file, output_file, seed=None):
    """
    Sample records from several JSON Lines files and merge them into one file.

    For every input file, up to ``samples_per_file`` lines that parse to a
    JSON object are picked at random. Lines that are blank, are not valid
    JSON, or do not decode to an object are skipped and replaced by another
    random line, so a file only contributes fewer samples when it does not
    contain enough valid records. The merged records are shuffled, written
    to ``output_file`` as JSON Lines, and a per-class summary is printed.

    Args:
        file_paths: Iterable of paths (str or Path) to JSON Lines files.
        samples_per_file: Maximum number of records to take from each file.
        output_file: Destination path; overwritten if it exists.
        seed: Optional seed for a private random generator. When None, the
            module-level ``random`` state is used, so a prior
            ``random.seed(...)`` call still controls the result.

    Returns:
        None. Results are written to ``output_file`` and reported on stdout.

    Raises:
        ValueError: If ``samples_per_file`` is negative.
        OSError: If ``output_file`` cannot be written. Unreadable *input*
            files are reported and skipped instead of raising.
    """
    if samples_per_file < 0:
        raise ValueError("samples_per_file must be non-negative")

    rng = random if seed is None else random.Random(seed)
    combined_data = []

    for file_path in tqdm(file_paths, desc="Processing files"):
        file_path = Path(file_path)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except (OSError, UnicodeError) as e:
            # Best effort: one unreadable input must not abort the whole run.
            print(f"Error processing file {file_path}: {e}")
            continue

        samples = []
        skipped = 0
        for line in _iter_in_random_order(lines, rng):
            if len(samples) >= samples_per_file:
                break
            text = line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if not isinstance(record, dict):
                # The class summary below needs key lookup, so only objects qualify.
                skipped += 1
                continue
            samples.append(record)

        print(f"\nFile: {file_path.name}")
        print(f"Extracted samples: {len(samples)}")
        if skipped:
            print(f"Skipped invalid lines: {skipped}")
        combined_data.extend(samples)

    # Shuffle so records from the same source file are not adjacent.
    rng.shuffle(combined_data)

    output_path = Path(output_file)
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in combined_data:
            f.write(json.dumps(item) + '\n')

    print("\nFinal combined dataset:")
    print(f"Total samples: {len(combined_data)}")
    print(f"Saved to: {output_path}")

    # Class distribution. Records without a usable 'class' are reported as
    # 'missing' rather than being folded into class 0.
    class_distribution = {}
    for item in combined_data:
        class_label = _class_label(item)
        class_distribution[class_label] = class_distribution.get(class_label, 0) + 1

    print("\nClass distribution in combined dataset:")
    for class_label, count in sorted(class_distribution.items()):
        print(f"Class {class_label}: {count} samples ({count/len(combined_data)*100:.2f}%)")

# Run the combination step.
# Seed the global RNG first so the sampled validation set is the same on every
# Restart & Run All; change the value to draw a different sample.
random.seed(42)

# The output is written next to the input files.
output_dir = base_dir
output_dir.mkdir(parents=True, exist_ok=True)  # create the directory (and parents) if missing
output_file = output_dir / "validation_dataset.json"

extract_and_combine_datasets(
    file_paths=file_paths,
    samples_per_file=50,
    output_file=str(output_file)
)

# Verification: read the file back and count the records, ignoring blank lines.
with open(output_file, 'r', encoding='utf-8') as f:
    combined_examples = [json.loads(line) for line in f if line.strip()]
print("\nVerification:")
print(f"Total examples in combined dataset: {len(combined_examples)}")

Processing files:  12%|█▎        | 1/8 [00:11<01:17, 11.07s/it]


File: Cell_Phones_and_Accessories.json
Extracted samples: 50


Processing files:  25%|██▌       | 2/8 [01:04<03:34, 35.73s/it]


File: Clothing_Shoes_and_Jewelry.json
Extracted samples: 50


Processing files:  38%|███▊      | 3/8 [01:39<02:56, 35.40s/it]


File: Electronics.json
Extracted samples: 50


Processing files:  50%|█████     | 4/8 [01:53<01:47, 26.93s/it]


File: Home_and_Kitchen.json
Extracted samples: 50


Processing files:  62%|██████▎   | 5/8 [01:54<00:52, 17.62s/it]


File: part.json
Extracted samples: 50


Processing files:  75%|███████▌  | 6/8 [01:57<00:25, 12.67s/it]


File: separate.json
Extracted samples: 0


Processing files:  88%|████████▊ | 7/8 [02:00<00:09,  9.75s/it]


File: Sports_and_Outdoors.json
Extracted samples: 50


Processing files: 100%|██████████| 8/8 [02:03<00:00, 15.48s/it]


File: Toys_and_Games.json
Extracted samples: 50

Final combined dataset:
Total samples: 350
Saved to: C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\validation_dataset.json

Class distribution in combined dataset:
Class 0: 111 samples (31.71%)
Class 1: 239 samples (68.29%)






Verification:
Total examples in combined dataset: 350


## Convert Combined Dataset to CSV

In [15]:
import json
import pandas as pd
from tqdm import tqdm

# Input file path
json_path = r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\combined_dataset.json"

print("Reading JSON file...")
all_records = []
unique_texts = set()  # distinct review texts, used to spot duplicates

with open(json_path, 'r', encoding='utf-8') as f:
    # This path is also written by the "Combine Filtered Datasets" cell as a
    # single indented JSON array, so accept both layouts: a JSON array, or
    # JSON Lines (one object per line).
    is_json_array = f.read(1) == '['
    f.seek(0)
    if is_json_array:
        parsed_records = json.load(f)
    else:
        parsed_records = []
        for line in tqdm(f, desc="Processing records"):
            text = line.strip()
            if not text:
                continue  # a blank line is not a malformed record
            try:
                parsed_records.append(json.loads(text))
            except json.JSONDecodeError:
                print("Found invalid JSON line")

for review in parsed_records:
    if not isinstance(review, dict):
        print(f"Error processing line: unexpected record type {type(review).__name__}")
        continue
    # Keep the original record
    all_records.append(review)
    try:
        # Track the review text for the duplicate check
        unique_texts.add(review.get('reviewText', ''))
    except TypeError as e:
        # Unhashable reviewText (e.g. a list); the record itself is still kept.
        print(f"Error processing line: {str(e)}")

# Basic statistics
print("\nData Statistics:")
print(f"Total JSON records: {len(all_records)}")
print(f"Unique reviews: {len(unique_texts)}")

# Convert to a DataFrame for a closer look
df = pd.DataFrame(all_records)
print("\nDataFrame Info:")
df.info()  # info() prints its report itself and returns None, so do not wrap it in print()

# Show a few sample rows
print("\nSample records:")
print(df.head())

# Save the processed data
df.to_csv(r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\test_dataset.csv", index=False)

Reading JSON file...


Processing records: 7000it [00:00, 109196.20it/s]


Data Statistics:
Total JSON records: 7000
Unique reviews: 6999

DataFrame Info:





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   _id                       7000 non-null   object 
 1   reviewerID                7000 non-null   object 
 2   asin                      7000 non-null   object 
 3   reviewerName              6959 non-null   object 
 4   helpful                   7000 non-null   object 
 5   reviewText                7000 non-null   object 
 6   overall                   7000 non-null   float64
 7   summary                   7000 non-null   object 
 8   unixReviewTime            7000 non-null   int64  
 9   reviewTime                7000 non-null   object 
 10  category                  7000 non-null   object 
 11  class                     6000 non-null   float64
 12  BehaviouralFeatureResult  1000 non-null   object 
 13  label                     205 non-null    float64
dtypes: float

## Filter Invalid Data in Original Datasets & Convert Format

In [26]:
import json
import os
from tqdm import tqdm


# List of JSON file paths (one JSON object per line)
file_paths = [
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\Cell_Phones_and_Accessories.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\Clothing_Shoes_and_Jewelry.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\Electronics.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\Home_and_Kitchen.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\part.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\separate.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\Sports_and_Outdoors.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\Toys_and_Games.json"
]

# Output directory for filtered datasets
output_directory = r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData"

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Process each file: keep only records that have a string 'reviewText' and a
# numeric 'class', and reduce them to {'text_input', 'output'} pairs.
for file_path in tqdm(file_paths, desc="Processing files"):
    training_data = []
    # Filter while streaming so the raw file is never held in memory as a whole.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Reading {os.path.basename(file_path)}", leave=False):
            try:
                example = json.loads(line.strip())
            except json.JSONDecodeError:
                continue  # Skip invalid JSON lines

            if not isinstance(example, dict):
                continue
            review_text = example.get('reviewText')
            class_value = example.get('class')
            if not isinstance(review_text, str) or not isinstance(class_value, (int, float)):
                continue

            try:
                label = str(int(class_value))  # Convert to string format
            except (ValueError, OverflowError):
                # json.loads accepts NaN/Infinity literals; int() rejects them.
                continue

            training_data.append({
                'text_input': review_text,
                'output': label
            })

    # Save the filtered dataset as <original stem>_filtered.json
    output_file_name = os.path.splitext(os.path.basename(file_path))[0] + '_filtered.json'
    output_path = os.path.join(output_directory, output_file_name)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(training_data, out_file, indent=2, ensure_ascii=False)

output_directory


'C:\\Users\\Joyce\\OneDrive\\桌面\\24 Fall\\CS6220\\Project\\FilteredData'

## Combine Filtered Datasets

In [27]:
import json
import pandas as pd

# Filtered per-category files; each holds one JSON array of records.
file_paths = [
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\Cell_Phones_and_Accessories_filtered.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\Clothing_Shoes_and_Jewelry_filtered.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\Electronics_filtered.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\Home_and_Kitchen_filtered.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\part_filtered.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\separate_filtered.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\Sports_and_Outdoors_filtered.json",
    r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\Toys_and_Games_filtered.json"
]

combined_data = []
sample_size = 1000  # number of leading records taken from each file

for file_path in file_paths:
    # The filtered files are written as UTF-8 with ensure_ascii=False, so the
    # encoding must be explicit; the platform default would mis-decode
    # non-ASCII review text on Windows.
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"Entire file {file_path} could not be read, skipping this file.")
            continue
    # Records are already parsed at this point, so no per-record error handling is needed.
    combined_data.extend(data[:sample_size])


# Saving as JSON (a single indented JSON array, not JSON Lines)
combined_json_path = r'C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\combined_dataset.json'
with open(combined_json_path, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2)

# Convert to DataFrame and save as CSV
combined_df = pd.DataFrame(combined_data)
combined_csv_path = r'C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\Data\combined_dataset_csv.csv'
combined_df.to_csv(combined_csv_path, index=False)

combined_json_path, combined_csv_path


('C:\\Users\\Joyce\\OneDrive\\桌面\\24 Fall\\CS6220\\Project\\Data\\combined_dataset.json',
 'C:\\Users\\Joyce\\OneDrive\\桌面\\24 Fall\\CS6220\\Project\\Data\\combined_dataset_csv.csv')

## Read Data

In [None]:
# Load one filtered file. It is written as UTF-8 with non-ASCII characters
# left unescaped, so the encoding is passed explicitly.
with open(r"C:\Users\Joyce\OneDrive\桌面\24 Fall\CS6220\Project\FilteredData\Cell_Phones_and_Accessories_filtered.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show the size and a short preview instead of dumping every record into the output.
print(f"Loaded {len(data)} records")
print(data[:5])