In [2]:
import os
import pandas as pd

# ---------------------------------------------------------------------------
# Notebook configuration: input location, columns to load, the daily files to
# read, the per-file sampling fraction, and where the reduced CSV is written.
# ---------------------------------------------------------------------------

# Root folder that holds one sub-folder per daily battle dump.
base_dir = r"C:\DSC106\PROJECT4\CR_DATA"

# Only these four columns are loaded from each daily CSV.
usecols = [
    "battleTime",
    "average.startingTrophies",
    "winner.cards.list",
    "loser.cards.list",
]

# (sub-folder, file name) for every daily CSV we want to USE, listed in
# chronological order. The 12/07–12/26 file is intentionally NOT included.
# The upper/lower-case leading "B" differs between entries and is kept
# exactly as written.
# NOTE(review): presumably this mirrors the names on disk — confirm before
# normalising, especially on a case-sensitive filesystem.
_daily_sources = [
    ("BattlesStaging_12272020_WL_tagged", "battlesStaging_12272020_WL_tagged.csv"),
    ("battlesStaging_12282020_WL_tagged", "battlesStaging_12282020_WL_tagged.csv"),
    ("BattlesStaging_12292020_WL_tagged", "BattlesStaging_12292020_WL_tagged.csv"),
    ("BattlesStaging_12302020_WL_tagged", "BattlesStaging_12302020_WL_tagged.csv"),
    ("BattlesStaging_12312020_WL_tagged", "BattlesStaging_12312020_WL_tagged.csv"),
    ("BattlesStaging_01012021_WL_tagged", "BattlesStaging_01012021_WL_tagged.csv"),
    ("BattlesStaging_01022021_WL_tagged", "BattlesStaging_01022021_WL_tagged.csv"),
    ("BattlesStaging_01032021_WL_tagged", "BattlesStaging_01032021_WL_tagged.csv"),
    ("BattlesStaging_01042021_WL_tagged", "BattlesStaging_01042021_WL_tagged.csv"),
]

# Full path of every daily CSV, in the same order as the table above.
battle_files = [
    os.path.join(base_dir, folder, filename)
    for folder, filename in _daily_sources
]

# Fraction of rows kept from each file (0.1 == one row in ten).
sample_frac = 0.1

# Destination of the combined, down-sampled dataset.
output_path = os.path.join(base_dir, "BattlesStaging_small_sampled.csv")


Cell 2 – Read each daily file, randomly sample 1/10 of its rows, and concatenate the samples into one DataFrame

In [3]:
def _read_and_sample(csv_path):
    """Load one daily battle CSV and return a random sample of its rows.

    Only the columns named in `usecols` are read. The share of rows kept is
    `sample_frac`, drawn with a fixed seed (42) so that re-running gives the
    same sample. Row counts before and after sampling are printed.

    The full-day DataFrame is local to this function, so it is released as
    soon as the function returns.
    """
    print("Reading:", csv_path)

    # NOTE(review): engine="python" selects pandas' slower pure-Python parser.
    # It may have been chosen on purpose (e.g. rows the C parser rejected) —
    # confirm before switching.
    full_day = pd.read_csv(
        csv_path,
        usecols=usecols,
        engine="python",
        encoding="utf-8-sig",
    )
    print("  original rows:", len(full_day))

    day_sample = full_day.sample(frac=sample_frac, random_state=42)
    print("  sampled rows:", len(day_sample))

    return day_sample


# One sampled DataFrame per input file, collected in the order of battle_files.
sampled_parts = []
for path in battle_files:
    sampled_parts.append(_read_and_sample(path))

# Stack the per-file samples and renumber the index 0..n-1. Files stay in
# list order; rows inside each file's sample are in the order `sample` drew
# them, which is not necessarily the file's original row order.
battles_small = pd.concat(sampled_parts, ignore_index=True)

print("Combined sampled shape:", battles_small.shape)
battles_small.head()


Reading: C:\DSC106\PROJECT4\CR_DATA\BattlesStaging_12272020_WL_tagged\battlesStaging_12272020_WL_tagged.csv
  original rows: 1911743
  sampled rows: 191174
Reading: C:\DSC106\PROJECT4\CR_DATA\battlesStaging_12282020_WL_tagged\battlesStaging_12282020_WL_tagged.csv
  original rows: 1902766
  sampled rows: 190277
Reading: C:\DSC106\PROJECT4\CR_DATA\BattlesStaging_12292020_WL_tagged\BattlesStaging_12292020_WL_tagged.csv
  original rows: 2345681
  sampled rows: 234568
Reading: C:\DSC106\PROJECT4\CR_DATA\BattlesStaging_12302020_WL_tagged\BattlesStaging_12302020_WL_tagged.csv
  original rows: 2407876
  sampled rows: 240788
Reading: C:\DSC106\PROJECT4\CR_DATA\BattlesStaging_12312020_WL_tagged\BattlesStaging_12312020_WL_tagged.csv
  original rows: 2626517
  sampled rows: 262652
Reading: C:\DSC106\PROJECT4\CR_DATA\BattlesStaging_01012021_WL_tagged\BattlesStaging_01012021_WL_tagged.csv
  original rows: 2823527
  sampled rows: 282353
Reading: C:\DSC106\PROJECT4\CR_DATA\BattlesStaging_01022021_WL_t

Unnamed: 0,battleTime,average.startingTrophies,winner.cards.list,loser.cards.list
0,2020-12-27 06:12:03+00:00,4767.5,"[26000004, 26000012, 26000017, 26000031, 26000...","[26000003, 26000015, 26000027, 26000032, 26000..."
1,2020-12-27 10:33:09+00:00,4079.0,"[26000016, 26000017, 26000026, 26000042, 26000...","[26000011, 26000012, 26000021, 26000037, 26000..."
2,2020-12-27 11:11:24+00:00,4031.5,"[26000004, 26000006, 26000012, 26000017, 28000...","[26000012, 26000015, 26000017, 26000021, 26000..."
3,2020-12-26 23:12:00+00:00,4597.5,"[26000004, 26000006, 26000007, 26000011, 26000...","[26000004, 26000036, 26000042, 26000046, 26000..."
4,2020-12-27 00:31:47+00:00,4729.5,"[26000012, 26000014, 26000015, 26000021, 26000...","[26000011, 26000021, 26000040, 26000041, 26000..."


In [2]:
# Write the combined sample to disk as CSV, omitting the DataFrame index.
# This cell needs `battles_small` and `output_path` to exist already: on a
# fresh kernel, run the configuration and sampling cells first, otherwise the
# line below fails with NameError.
battles_small.to_csv(path_or_buf=output_path, index=False)
print("Saved sampled raw dataset to:", output_path)


NameError: name 'battles_small' is not defined