## 1. Load Dataset


In [None]:
# Discover every CSV directly inside the dataset directory (sorted for a stable order)
csv_files = sorted(DATASET_DIR.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files")

# Read each file into its own frame; a file that fails to load is reported and skipped
dataframes = []
for csv_path in csv_files:
    print(f"Loading {csv_path.name}...")
    try:
        df_temp = pd.read_csv(csv_path, low_memory=False)
        # Normalize header names by trimming surrounding whitespace
        df_temp.columns = df_temp.columns.str.strip()
        n_columns = len(df_temp.columns)
        print(f"  Shape: {df_temp.shape}, Columns: {n_columns}")
        dataframes.append(df_temp)
    except Exception as exc:
        # Best-effort loading: report the failure and continue with the next file
        print(f"  Error loading {csv_path.name}: {exc}")

print(f"\nTotal dataframes loaded: {len(dataframes)}")


In [None]:
# Combine all dataframes into a single frame
print("Combining all dataframes...")

# `dataframes` is empty when no CSV was found or every file failed to load
# (load errors are only printed, not raised). Fail with an actionable message
# instead of pandas' generic "No objects to concatenate" ValueError.
if not dataframes:
    raise ValueError(
        "No dataframes were loaded; check the dataset directory and any load errors reported earlier."
    )

# ignore_index=True gives the combined frame a fresh 0..n-1 row index
df = pd.concat(dataframes, ignore_index=True)
print(f"Combined dataset shape: {df.shape}")
# deep=True also counts the contents of object-dtype columns
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


## 2. Explore Labels and Handle Class Imbalance


In [None]:
# Locate the label column: first column whose trimmed, upper-cased name is 'LABEL'
label_col = next(
    (name for name in df.columns if name.strip().upper() == 'LABEL'),
    None,
)
if not label_col:
    raise ValueError("Label column not found!")

print("Label distribution (all classes):")
print("="*70)
# Per-class sample counts, ordered by label value
label_counts = df[label_col].value_counts().sort_index()
print(label_counts)
print(f"\nTotal unique labels: {len(label_counts)}")

# Share of each class relative to the total number of rows, in percent
label_percentages = (label_counts / len(df) * 100).round(2)
print(f"\nClass distribution percentages:")
print(label_percentages)


In [None]:
# Report classes that have fewer than 100 samples
print("\nChecking for very rare classes (less than 100 samples)...")
rare_classes = label_counts.loc[label_counts.lt(100)]
print(f"\nRare classes (< 100 samples): {len(rare_classes)}")
if not rare_classes.empty:
    print(rare_classes)
    print(f"\nTotal samples in rare classes: {rare_classes.sum()}")
    print("\nStrategy: We'll combine rare classes into 'Other' category")


## 3. Preprocess Data


In [None]:
# Create a copy for preprocessing so the combined `df` is left untouched
df_processed = df.copy()
print(f"Starting preprocessing with shape: {df_processed.shape}")

# Ensure column names are stripped
df_processed.columns = df_processed.columns.str.strip()

# Find label column (case-insensitive match on 'Label'); stays None when absent
label_col = next(
    (col for col in df_processed.columns if col.strip().upper() == 'LABEL'),
    None,
)

# Handle infinite values - replace with NaN first, then handle NaN
print("Handling infinite values...")
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df_processed[col] = df_processed[col].replace([np.inf, -np.inf], np.nan)

# Handle missing values - drop columns with too many missing values (>50%)
print("Handling missing values...")
missing_threshold = 0.5
# `isna().mean()` is the fraction of missing entries in a column. The label
# column is never dropped; when no label column exists (label_col is None)
# every column is still checked, rather than the check being skipped entirely.
cols_to_drop = [
    col
    for col in df_processed.columns
    if col != label_col and df_processed[col].isna().mean() > missing_threshold
]

if cols_to_drop:
    print(f"Dropping {len(cols_to_drop)} columns with >50% missing values")
    df_processed = df_processed.drop(columns=cols_to_drop)

# For remaining numeric columns with missing values, fill with median
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df_processed[col].isna().any():
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: the chained in-place form is deprecated and does
        # not modify the parent frame under pandas Copy-on-Write.
        df_processed[col] = df_processed[col].fillna(df_processed[col].median())

# Drop rows where Label is missing
if label_col is not None:
    initial_rows = len(df_processed)
    df_processed = df_processed.dropna(subset=[label_col])
    dropped_rows = initial_rows - len(df_processed)
    if dropped_rows > 0:
        print(f"Dropped {dropped_rows} rows with missing labels")

print(f"\nAfter preprocessing shape: {df_processed.shape}")


In [None]:
# Handle class imbalance: fold classes with fewer than `rare_threshold` samples into "Other"
print("Handling class imbalance...")
rare_threshold = 100
label_counts_processed = df_processed[label_col].value_counts()
rare_counts = label_counts_processed[label_counts_processed < rare_threshold]
rare_classes_list = rare_counts.index.tolist()

# The multiclass target starts as a copy of the original labels in both cases
df_processed['Label_Multiclass'] = df_processed[label_col].copy()

if not rare_classes_list:
    print("No rare classes found, using original labels")
else:
    print(f"\nCombining {len(rare_classes_list)} rare classes into 'Other':")
    for cls, n_samples in rare_counts.items():
        print(f"  - {cls}: {n_samples} samples")

    # Overwrite the target for every row whose original label is a rare class
    is_rare = df_processed[label_col].isin(rare_classes_list)
    df_processed.loc[is_rare, 'Label_Multiclass'] = 'Other'

    print(f"\nAfter combining rare classes:")
    print(df_processed['Label_Multiclass'].value_counts().sort_index())


In [None]:
# Split the processed frame into the feature matrix X and the target vector y.
# Both the derived target and the original label column (if any) are excluded from X.
cols_to_drop = ['Label_Multiclass'] + ([label_col] if label_col else [])

X = df_processed.drop(columns=cols_to_drop)
y = df_processed['Label_Multiclass']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nNumber of classes: {y.nunique()}")
print(f"\nClass distribution:")
class_distribution = y.value_counts().sort_index()
print(class_distribution)
