# ============================================================================
# SECTION 2: DATA PREPROCESSING
# ============================================================================

In [None]:
import pandas as pd
import numpy as np

# Load the raw run log from disk and report how many rows/columns it has.
print("üìÇ Loading data...")
df = pd.read_csv('../data/raw/nightreign_data.csv')
n_runs, n_variables = df.shape
print(f"‚úì Dataset loaded: {n_runs} runs, {n_variables} variables\n")

# Show the first rows (head() defaults to five) for a visual sanity check.
print("Preview of raw data:")
display(df.head(5))

## 2.1 DATA QUALITY CHECKS

In [None]:
# Data quality report: null counts, exact-duplicate rows, dtypes and a few
# headline cardinalities. The only mutation is dropping duplicate rows.
print("\n" + "="*60)
print("DATA QUALITY CHECKS")
print("="*60)

# Per-column number of null cells.
print("\nüìä Missing Values:")
missing = df.isna().sum()
if missing.sum() != 0:
    # Only list the columns that actually contain nulls.
    print(missing[missing > 0])
else:
    print("‚úì No missing values found!")

# Rows that repeat an earlier row across every column.
duplicates = df.duplicated().sum()
print(f"\nüìä Duplicate Rows: {duplicates}")
if duplicates > 0:
    print(f"‚ö†Ô∏è  Removing {duplicates} duplicate rows...")
    # Keeps the first occurrence; the original index labels are retained.
    df = df.drop_duplicates(keep='first')

print("\nüìä Data Types:")
print(df.dtypes)

print("\nüìä Dataset Summary:")
print(f"   Total runs: {len(df)}")
# NOTE(review): the label says "Date range" but the values printed are the
# min/max of run_id — confirm whether run_id encodes a date.
run_ids = df['run_id']
print(f"   Date range: {run_ids.min()} to {run_ids.max()}")
print(f"   Unique characters: {df['character'].nunique()}")
print(f"   Difficulty levels: {df['difficulty'].nunique()}")
print(f"   Unique Nightlords: {df['nightlord'].nunique()}")

## 2.2 ORDINAL CONVERSIONS

In [None]:
# Encode the 'run_outcome' and 'difficulty' label columns as integer-valued
# ordinal columns ('run_outcome_ordinal', 'difficulty_ordinal').
print("\n" + "="*60)
print("CONVERTING ORDINAL VARIABLES")
print("="*60)


def _warn_unmapped(source_col, ordinal_col):
    """Print a warning for labels in `source_col` that the mapping did not cover.

    Series.map() yields NaN for any value that is not a key of the mapping, so
    a typo or an unexpected category would otherwise turn into a silent NaN in
    `ordinal_col`. Rows whose source value is already null are not reported.
    Prints nothing when every non-null label was mapped.
    """
    not_covered = df[ordinal_col].isna() & df[source_col].notna()
    unmapped = df.loc[not_covered, source_col].unique()
    if len(unmapped) > 0:
        print(f"   WARNING: {len(unmapped)} unmapped '{source_col}' value(s) "
              f"left as NaN: {sorted(map(str, unmapped))}")


outcome_mapping = {
    'first_day': 0,
    'second_day': 1,
    'final_day': 2,
    'victory': 3
}
df['run_outcome_ordinal'] = df['run_outcome'].map(outcome_mapping)
print(f"\n‚úì Converted 'run_outcome' to ordinal scale (0-3)")
print(f"   Mapping: {outcome_mapping}")
_warn_unmapped('run_outcome', 'run_outcome_ordinal')

# One row per original label with the code it received.
print(f"\n   Distribution check:")
print(df.groupby('run_outcome')['run_outcome_ordinal'].first())

difficulty_mapping = {
    'depth1': 1,
    'depth2': 2,
    'depth3': 3,
    'depth4': 4,
    'depth5': 5
}
df['difficulty_ordinal'] = df['difficulty'].map(difficulty_mapping)
print(f"\n‚úì Converted 'difficulty' to ordinal scale (1-5)")
print(f"   Mapping: {difficulty_mapping}")
_warn_unmapped('difficulty', 'difficulty_ordinal')

print(f"\n   Difficulty distribution:")
print(df['difficulty'].value_counts().sort_index())

## 2.3 BINARY OUTCOME VARIABLE

In [None]:
# Derive a 0/1 win flag from 'run_outcome' and report the overall win rate.
print("\n" + "="*60)
print("CREATING BINARY OUTCOME VARIABLE")
print("="*60)

# 1 when the run ended in 'victory', 0 for every other outcome.
is_victory = df['run_outcome'].eq('victory')
df['victory_binary'] = is_victory.astype(int)

n_wins = df['victory_binary'].sum()
n_total = len(df)
victory_rate = df['victory_binary'].mean() * 100

print(f"\n‚úì Created 'victory_binary' column")
print(f"   Overall victory rate: {victory_rate:.1f}% ({n_wins}/{n_total} runs)")

## 2.4 FEATURE ENGINEERING

In [None]:
# Derived per-run features built from the raw count columns.
print("\n" + "="*60)
print("FEATURE ENGINEERING")
print("="*60)

print("\nüìä Creating resource allocation metrics...")

# Source columns, pulled out once so the formulas below read cleanly.
evergaols = df['evergaol_cleared']
great_enemies = df['great_enemies_cleared']
castle_visit = df['middle_castle_visited'].astype(int)  # 0/1 flag

# Sum of the two optional-content counters.
df['total_optional_content'] = evergaols + great_enemies

# Great enemies cleared plus a flat 3 points when the castle was visited.
df['loot_priority_score'] = great_enemies + (castle_visit * 3)

# The +1 in the denominator avoids division by zero.
df['buff_to_loot_ratio'] = evergaols / (great_enemies + 1)

# Evergaols cleared plus one point for a castle visit.
df['risk_taking_score'] = evergaols + castle_visit

print("   ‚úì total_optional_content")
print("   ‚úì loot_priority_score")
print("   ‚úì buff_to_loot_ratio")
print("   ‚úì risk_taking_score")

print("\nüìä Creating efficiency metrics...")

enemies = df['enemies_cleared']
# Shared denominator for the per-level ratios; +1 avoids division by zero.
level_plus_one = df['level'] + 1

df['combat_efficiency'] = great_enemies / (enemies + 1)
df['runes_per_level'] = df['runes_obtained'] / level_plus_one
df['enemies_per_level'] = enemies / level_plus_one

print("   ‚úì combat_efficiency")
print("   ‚úì runes_per_level")
print("   ‚úì enemies_per_level")

print("\nüìä Creating session-level features...")

# A new session starts on every row whose run_index_in_day is 1; the running
# count of such rows is the session number. This relies on the current row
# order of df.
starts_session = df['run_index_in_day'] == 1
df['session_id'] = starts_session.cumsum()

# Order rows within each session before computing running/lagged values.
df = df.sort_values(by=['session_id', 'run_index_in_day'])

# Running number of wins inside the session, and that count divided by the
# run's index within the day.
wins_so_far = df.groupby('session_id')['victory_binary'].cumsum()
df['cumulative_wins_in_session'] = wins_so_far
df['win_rate_so_far_in_session'] = df['cumulative_wins_in_session'] / df['run_index_in_day']

# Outcome of the previous run in the same session (NaN for the first run).
df['prev_run_outcome_ordinal'] = df.groupby('session_id')['run_outcome_ordinal'].shift(1)

# 1.0 when the previous run in the session was a win, else 0.0. The first run
# of a session has no predecessor (shift gives NaN); NaN == 1 is False, so it
# is recorded as 0.0 rather than NaN.
prev_victory = df.groupby('session_id')['victory_binary'].shift(1)
df['after_victory'] = prev_victory.eq(1).astype(float)

print("   ‚úì session_id")
print("   ‚úì cumulative_wins_in_session")
print("   ‚úì win_rate_so_far_in_session")
print("   ‚úì prev_run_outcome_ordinal")
print("   ‚úì after_victory")

print("\nüìä Categorizing play strategies...")

# Dataset-level cut-offs. They are the same for every row, so compute them
# once here instead of re-deriving the quantile/median over the whole frame
# on each per-row call made by apply().
evergaol_p75 = df['evergaol_cleared'].quantile(0.75)
great_enemy_median = df['great_enemies_cleared'].median()


def categorize_strategy(row, evergaol_threshold=None, great_enemy_threshold=None):
    """Label a single run with a play-strategy category.

    Rules are evaluated in order and the first match wins:
      1. 'buff_focused'     - evergaol_cleared >= evergaol_threshold
      2. 'loot_focused'     - great_enemies_cleared >= great_enemy_threshold
      3. 'high_risk_castle' - middle_castle_visited is truthy
      4. 'speedrun'         - none of the above

    Args:
        row: One row of df (anything indexable by 'evergaol_cleared',
            'great_enemies_cleared' and 'middle_castle_visited').
        evergaol_threshold: Cut-off for rule 1. When None, falls back to the
            75th percentile of df['evergaol_cleared'].
        great_enemy_threshold: Cut-off for rule 2. When None, falls back to
            the median of df['great_enemies_cleared'].

    Returns:
        The category name as a str.
    """
    # Fallbacks keep a bare categorize_strategy(row) call working as before.
    if evergaol_threshold is None:
        evergaol_threshold = df['evergaol_cleared'].quantile(0.75)
    if great_enemy_threshold is None:
        great_enemy_threshold = df['great_enemies_cleared'].median()

    if row['evergaol_cleared'] >= evergaol_threshold:
        return 'buff_focused'
    elif row['great_enemies_cleared'] >= great_enemy_threshold:
        return 'loot_focused'
    elif row['middle_castle_visited']:
        return 'high_risk_castle'
    else:
        return 'speedrun'


# Extra keyword arguments given to DataFrame.apply are forwarded to the
# function, so every row is classified against the precomputed thresholds.
df['strategy_type'] = df.apply(
    categorize_strategy,
    axis=1,
    evergaol_threshold=evergaol_p75,
    great_enemy_threshold=great_enemy_median,
)

print("   ‚úì strategy_type created")
print("\n   Strategy distribution:")
print(df['strategy_type'].value_counts())

## 2.5 DATA TYPE CONVERSIONS

In [None]:
# Cast flag columns to bool and label columns to pandas 'category'.
# Columns that are absent from df are skipped silently.
print("\n" + "="*60)
print("FINALIZING DATA TYPES")
print("="*60)

boolean_cols = ['middle_castle_visited', 'enhanced']
for col in boolean_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].astype(bool)
    print(f"   ‚úì {col} ‚Üí boolean")

categorical_cols = ['character', 'difficulty', 'nightlord', 'map', 
                    'run_outcome', 'team_type', 'strategy_type']
for col in categorical_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].astype('category')
    print(f"   ‚úì {col} ‚Üí category")

## 2.6 REMOVE OLD COLUMNS

In [None]:
# Drop the legacy 'loot_quality_score' column when the input still carries it.
print("\n" + "="*60)
print("REMOVING DEPRECATED COLUMNS")
print("="*60)

has_legacy_score = 'loot_quality_score' in df.columns
if has_legacy_score:
    print(f"\n‚ö†Ô∏è  Removing 'loot_quality_score' column")
    print(f"   Reason: Replaced by loot_priority_score")
    df = df.drop(columns='loot_quality_score')
    print(f"   ‚úì Column removed")

## 2.7 FINAL DATASET SUMMARY

In [None]:
# Report the final shape, which engineered columns exist, and a dtype tally.
print("\n" + "="*60)
print("PROCESSED DATASET SUMMARY")
print("="*60)

n_rows, n_cols = df.shape
print(f"\nüìä Final dataset shape: {df.shape}")
print(f"   Rows: {n_rows}")
print(f"   Columns: {n_cols}")

print("\nüìä New columns created:")
new_columns = [
    'run_outcome_ordinal', 'difficulty_ordinal', 'victory_binary',
    'total_optional_content', 'loot_priority_score', 'buff_to_loot_ratio',
    'risk_taking_score', 'combat_efficiency', 'runes_per_level',
    'enemies_per_level', 'session_id', 'cumulative_wins_in_session',
    'win_rate_so_far_in_session', 'prev_run_outcome_ordinal',
    'after_victory', 'strategy_type'
]
# Only tick off the columns that are actually present in df.
for col in [name for name in new_columns if name in df.columns]:
    print(f"   ‚úì {col}")

# How many columns there are of each dtype.
print("\nüìä Column types summary:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

## 2.8 SAVE PROCESSED DATA

In [None]:
# Write two CSVs under ../data/processed: the full processed frame and a
# narrower analysis subset restricted to the columns listed below.
print("\n" + "="*60)
print("SAVING PROCESSED DATA")
print("="*60)

import os
# Create the output directory if it does not exist yet.
os.makedirs('../data/processed', exist_ok=True)

# Full processed dataset, without the index column.
output_path = '../data/processed/nightreign_processed.csv'
df.to_csv(output_path, index=False)
print(f"\n‚úì Processed data saved to: {output_path}")

analysis_cols = [
    'run_id', 'session_id', 'run_index_in_day',
    'run_outcome', 'run_outcome_ordinal', 'victory_binary',
    'character', 'difficulty', 'difficulty_ordinal', 'nightlord', 
    'enhanced', 'map', 'team_type',
    'evergaol_cleared', 'middle_castle_visited', 'great_enemies_cleared',
    'enemies_cleared', 'allies_rescued', 'runes_obtained', 'level',
    'total_optional_content', 'loot_priority_score', 'buff_to_loot_ratio',
    'risk_taking_score', 'combat_efficiency', 'runes_per_level',
    'strategy_type', 'after_victory'
]

# Keep the listed order but silently skip any column missing from df.
available_cols = [col for col in analysis_cols if col in df.columns]
df_analysis = df[available_cols]
analysis_path = '../data/processed/nightreign_analysis.csv'
df_analysis.to_csv(analysis_path, index=False)
print(f"‚úì Analysis-ready data saved to: {analysis_path}")

# Closing banner, a preview of the processed frame and a few headline numbers.
print("\n" + "="*60)
print("‚úÖ DATA PREPROCESSING COMPLETE!")
print("="*60)

print("\nPreview of processed data:")
display(df.head(5))

# mode() can return several values on a tie; the first one is reported.
top_character = df['character'].mode()[0]
top_strategy = df['strategy_type'].mode()[0]

print("\nüìä Quick statistics:")
print(f"   Victory rate: {df['victory_binary'].mean()*100:.1f}%")
print(f"   Average run outcome: {df['run_outcome_ordinal'].mean():.2f}")
print(f"   Average difficulty: {df['difficulty_ordinal'].mean():.2f}")
print(f"   Most played character: {top_character}")
print(f"   Most common strategy: {top_strategy}")