### First split the companies by PrimaryIndustrySector, then split each industry by time slice. Randomly sample within each (industry, time slice) group while preserving that group's ratio of positive to negative samples.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import pickle
import random
from pathlib import Path

# ---- File locations (relative to the current working directory) ----
# Rows to sample from; the script reads its `time` and `CompanyID` columns.
SRC_CSV = Path('../new_company.csv')
# Company table; supplies `CompanyID` -> `PrimaryIndustrySector`.
COMP_RAW = Path('../Company.csv')
# Pickled lookup applied to `CompanyID` to obtain the label column.
LABEL_PKL = Path('../label_dict.pkl')
# Destination of the sampled rows.
OUT_CSV = Path('../test_data_sampled.csv')

# ---- Sampling parameters ----
# Inclusive bounds on the `time` column; rows outside are discarded.
T_MIN = 177
T_MAX = 192
# Share of each (industry, time, label) group that is kept.
SAMPLE_FRAC = 1 / 3
# Seed used for the stdlib RNG here and as `random_state` when sampling.
RANDOM_SEED = 42

# NOTE: this seeds Python's `random` module only. pandas' `.sample` does not
# draw from it; the sampling below is reproducible because it passes
# `random_state=RANDOM_SEED` explicitly.
random.seed(RANDOM_SEED)

# Load the candidate rows and keep only those whose `time` falls inside the
# inclusive window [T_MIN, T_MAX]. `.copy()` detaches the slice so the column
# assignments below do not write into a view of the full frame.
df = pd.read_csv(SRC_CSV)
df = df.loc[df['time'].between(T_MIN, T_MAX)].copy()

# Build a CompanyID-indexed lookup of each company's primary industry sector.
# Only the first row per CompanyID is kept, so the index is unique and can be
# used as a mapping.
comp_df = pd.read_csv(COMP_RAW, usecols=['CompanyID', 'PrimaryIndustrySector'])
comp_df = comp_df.rename(columns={'PrimaryIndustrySector': 'industry'})
comp_df = comp_df.drop_duplicates(subset='CompanyID')
comp_df = comp_df.set_index('CompanyID')

# Companies missing from the lookup get NaN in `industry`.
df['industry'] = df['CompanyID'].map(comp_df['industry'])

# Attach labels from the pickled lookup; IDs it does not cover become NaN.
# NOTE(review): `pickle.load` executes arbitrary code from the file -- only
# use a label file from a trusted source.
with LABEL_PKL.open('rb') as f:
    label_dict = pickle.load(f)
df['label'] = df['CompanyID'].map(label_dict)

# Number of distinct industries in the time window (NaN is not counted).
n_industries = df['industry'].nunique()

def sample_group(sub, *, frac=None, seed=None):
    """Draw a reproducible random subset of the rows of one group.

    Args:
        sub: DataFrame holding the rows of a single group (stratum).
        frac: Fraction of rows to keep. ``None`` (the default) uses the
            module-level ``SAMPLE_FRAC``.
        seed: Value forwarded to ``DataFrame.sample`` as ``random_state``.
            ``None`` (the default) uses the module-level ``RANDOM_SEED``.

    Returns:
        A DataFrame with ``max(1, int(len(sub) * frac))`` rows drawn from
        ``sub`` without replacement, keeping their original index labels.
        An empty ``sub`` yields an empty copy instead of raising.
    """
    if frac is None:
        frac = SAMPLE_FRAC
    if seed is None:
        seed = RANDOM_SEED

    n_rows = len(sub)
    if n_rows == 0:
        # Without this guard max(1, 0) would request one row from nothing
        # and DataFrame.sample would raise ValueError.
        return sub.copy()

    # Floor of the requested share, but never drop a non-empty group entirely.
    n_keep = max(1, int(n_rows * frac))

    # NOTE(review): every call uses the same seed, so two groups of equal
    # size select the same row positions -- confirm this is acceptable.
    return sub.sample(n=n_keep, random_state=seed)

# Stratified sampling: one stratum per (industry, time, label) combination,
# so the positive/negative mix inside each industry/time slice is kept.
# Rows lacking an industry or a label cannot be assigned to a stratum.
labeled = df.dropna(subset=['industry', 'label'])

# Iterate over the groups instead of calling `groupby(...).apply(...)`:
# pandas 2.2 deprecates passing the grouping columns to the applied function
# and later releases exclude them, which would strip `industry`, `time` and
# `label` from the result. Iteration always yields the full sub-frame, and
# groups come out in the same sorted key order `apply` used.
strata = labeled.groupby(['industry', 'time', 'label'])
parts = [sample_group(stratum) for _, stratum in strata]

if parts:
    sampled = pd.concat(parts, ignore_index=True)
else:
    # No stratum at all (e.g. nothing in the time window): keep the columns,
    # emit zero rows rather than failing inside `pd.concat`.
    sampled = labeled.iloc[0:0].reset_index(drop=True)

# `label` may have been upcast to float by the NaN-producing map; all
# remaining values are non-null, so the integer cast is safe here.
sampled['label'] = sampled['label'].astype(int)

# Persist only the columns needed downstream, in a fixed order.
sampled[['time', 'CompanyID', 'label', 'industry']].to_csv(
    OUT_CSV, index=False, encoding='utf-8'
)
