In [9]:
# ============================================================================
# SECTION 2: DATA PREPROCESSING
# ============================================================================

In [10]:
import os

import numpy as np
import pandas as pd

# Read the raw run log from the CSV export in the working directory.
print("üìÇ Loading data...")
df = pd.read_csv("Nightreign Data - NightReign Data .csv")

# One row per run, one column per recorded variable.
n_runs, n_variables = df.shape
print(f"‚úì Dataset loaded: {n_runs} runs, {n_variables} variables\n")

# Show the first rows so the column layout can be eyeballed.
print("Preview of raw data:")
raw_preview = df.head()
display(raw_preview)

üìÇ Loading data...
‚úì Dataset loaded: 121 runs, 17 variables

Preview of raw data:


Unnamed: 0,run_id,run_index_in_day,character,difficulty,nightlord,enhanced,map,run_outcome,evergaol_cleared,middle_castle_visited,great_enemies_cleared,enemies_cleared,team_type,allies_rescued,runes_obtained,level,notes
0,1,1,recluse,depth1,Gladius,False,base,final_day,3,False,9,79,duo,8,362296,12,
1,2,2,recluse,depth1,Gladius,False,base,victory,4,True,13,99,duo,8,482476,14,
2,3,3,recluse,depth1,Caligo,False,crater,second_day,2,False,8,110,duo,4,361842,11,
3,4,4,recluse,depth1,Caligo,False,crater,second_day,2,False,8,60,trio,9,253560,10,
4,5,5,recluse,depth1,Caligo,False,crater,second_day,3,True,8,55,trio,17,282060,9,


# ============================================================================
# 2.1 DATA QUALITY CHECKS
# ============================================================================


In [11]:
# Data quality report: missing values, exact duplicate rows, dtypes and a few
# headline counts. The only mutation is dropping fully duplicated rows.
print("\n" + "="*60)
print("DATA QUALITY CHECKS")
print("="*60)

# Check for missing values (per-column null counts; only non-zero ones are shown)
print("\nüìä Missing Values:")
missing = df.isnull().sum()
if missing.sum() == 0:
    print("‚úì No missing values found!")
else:
    print(missing[missing > 0])

# Check for duplicates. Rows count as duplicates only when every column
# matches, so two rows that differ in any column are both kept.
duplicates = df.duplicated().sum()
print(f"\nüìä Duplicate Rows: {duplicates}")
if duplicates > 0:
    print(f"‚ö†Ô∏è  Removing {duplicates} duplicate rows...")
    df = df.drop_duplicates()

# Display data types
print("\nüìä Data Types:")
print(df.dtypes)

# Basic statistics
print("\nüìä Dataset Summary:")
print(f"   Total runs: {len(df)}")
# run_id is an integer run counter, not a date, so the line is labelled as an ID range.
print(f"   Run ID range: {df['run_id'].min()} to {df['run_id'].max()}")
print(f"   Unique characters: {df['character'].nunique()}")
print(f"   Difficulty levels: {df['difficulty'].nunique()}")
print(f"   Unique Nightlords: {df['nightlord'].nunique()}")


DATA QUALITY CHECKS

üìä Missing Values:
notes    116
dtype: int64

üìä Duplicate Rows: 0

üìä Data Types:
run_id                    int64
run_index_in_day          int64
character                object
difficulty               object
nightlord                object
enhanced                   bool
map                      object
run_outcome              object
evergaol_cleared          int64
middle_castle_visited      bool
great_enemies_cleared     int64
enemies_cleared           int64
team_type                object
allies_rescued            int64
runes_obtained            int64
level                     int64
notes                    object
dtype: object

üìä Dataset Summary:
   Total runs: 121
   Date range: 1 to 121
   Unique characters: 4
   Difficulty levels: 3
   Unique Nightlords: 8


# ============================================================================
# 2.2 ORDINAL CONVERSIONS
# ============================================================================

In [12]:
# Encode the two ordered string columns (run_outcome, difficulty) as integers.
print("\n" + "="*60)
print("CONVERTING ORDINAL VARIABLES")
print("="*60)


def _require_known_labels(labels, mapping, column_name):
    """Raise ValueError if `labels` holds a non-null value that `mapping` lacks.

    Series.map() turns every label without a mapping entry into NaN without
    any warning; this check makes such labels (typos, new categories) fail
    loudly instead. Missing values are ignored here and still map to NaN.
    """
    unknown = sorted(set(labels.dropna().unique()) - set(mapping), key=str)
    if unknown:
        raise ValueError(
            f"Column '{column_name}' contains labels with no ordinal mapping: {unknown}"
        )


# Convert run_outcome to ordinal (0-3)
outcome_mapping = {
    'first_day': 0,
    'second_day': 1,
    'final_day': 2,
    'victory': 3
}
_require_known_labels(df['run_outcome'], outcome_mapping, 'run_outcome')
df['run_outcome_ordinal'] = df['run_outcome'].map(outcome_mapping)
print(f"\n‚úì Converted 'run_outcome' to ordinal scale (0-3)")
print(f"   Mapping: {outcome_mapping}")

# Verify conversion: one ordinal value per original label
print(f"\n   Distribution check:")
print(df.groupby('run_outcome')['run_outcome_ordinal'].first())

# Convert difficulty to ordinal (1-5)
difficulty_mapping = {
    'depth1': 1,
    'depth2': 2,
    'depth3': 3,
    'depth4': 4,
    'depth5': 5
}
_require_known_labels(df['difficulty'], difficulty_mapping, 'difficulty')
df['difficulty_ordinal'] = df['difficulty'].map(difficulty_mapping)
print(f"\n‚úì Converted 'difficulty' to ordinal scale (1-5)")
print(f"   Mapping: {difficulty_mapping}")

# Check difficulty distribution (run counts per difficulty label)
print(f"\n   Difficulty distribution:")
print(df['difficulty'].value_counts().sort_index())


CONVERTING ORDINAL VARIABLES

‚úì Converted 'run_outcome' to ordinal scale (0-3)
   Mapping: {'first_day': 0, 'second_day': 1, 'final_day': 2, 'victory': 3}

   Distribution check:
run_outcome
final_day     2
first_day     0
second_day    1
victory       3
Name: run_outcome_ordinal, dtype: int64

‚úì Converted 'difficulty' to ordinal scale (1-5)
   Mapping: {'depth1': 1, 'depth2': 2, 'depth3': 3, 'depth4': 4, 'depth5': 5}

   Difficulty distribution:
difficulty
depth1    40
depth2    71
depth3    10
Name: count, dtype: int64


# ============================================================================
# 2.3 BINARY OUTCOME VARIABLE
# ============================================================================

In [13]:
banner = "=" * 60
print("\n" + banner)
print("CREATING BINARY OUTCOME VARIABLE")
print(banner)

# Flag each run as won (1) or not won (0); only the 'victory' outcome counts as a win.
is_victory = df['run_outcome'] == 'victory'
df['victory_binary'] = is_victory.astype(int)

# Share of winning runs as a percentage, plus the raw counts behind it.
n_wins = df['victory_binary'].sum()
n_total = len(df)
victory_rate = df['victory_binary'].mean() * 100
print(f"\n‚úì Created 'victory_binary' column")
print(f"   Overall victory rate: {victory_rate:.1f}% ({n_wins}/{n_total} runs)")



CREATING BINARY OUTCOME VARIABLE

‚úì Created 'victory_binary' column
   Overall victory rate: 33.9% (41/121 runs)


# ============================================================================
# 2.4 FEATURE ENGINEERING
# ============================================================================


In [14]:
banner = "=" * 60
print("\n" + banner)
print("FEATURE ENGINEERING")
print(banner)

# --- Resource Allocation Metrics ---
print("\nüìä Creating resource allocation metrics...")

# Shorthand for the three raw inputs used below; the castle flag becomes 0/1.
evergaols = df['evergaol_cleared']
great_enemies = df['great_enemies_cleared']
castle_visit = df['middle_castle_visited'].astype(int)

# Optional content = evergaols plus great enemies.
df['total_optional_content'] = evergaols + great_enemies

# Loot proxy (stands in for the subjective loot_quality_score):
# great enemies count once, a castle visit counts three times.
df['loot_priority_score'] = great_enemies + castle_visit * 3

# Evergaols relative to great enemies; the +1 keeps the denominator non-zero.
df['buff_to_loot_ratio'] = evergaols / (great_enemies + 1)

# Risk indicator: evergaols plus one extra point for visiting the castle.
df['risk_taking_score'] = evergaols + castle_visit

print("   ‚úì total_optional_content")
print("   ‚úì loot_priority_score (replacing loot_quality_score)")
print("   ‚úì buff_to_loot_ratio")
print("   ‚úì risk_taking_score")

# --- Efficiency Metrics ---
print("\nüìä Creating efficiency metrics...")

# Every denominator is offset by one, so a zero count cannot divide by zero.
enemies_plus_one = df['enemies_cleared'] + 1
level_plus_one = df['level'] + 1

# Great enemies as a share of (all enemies + 1).
df['combat_efficiency'] = df['great_enemies_cleared'] / enemies_plus_one

# Runes obtained per (level + 1).
df['runes_per_level'] = df['runes_obtained'] / level_plus_one

# Enemies cleared per (level + 1).
df['enemies_per_level'] = df['enemies_cleared'] / level_plus_one

for metric_name in ('combat_efficiency', 'runes_per_level', 'enemies_per_level'):
    print(f"   ‚úì {metric_name}")

# --- Session-Level Features ---
print("\nüìä Creating session-level features...")

# A session starts on each row whose run_index_in_day is 1; the running count
# of those rows, in the frame's current row order, numbers the sessions.
opens_session = df['run_index_in_day'] == 1
df['session_id'] = opens_session.cumsum()

# Put runs in order inside each session before computing running/lagged values.
df = df.sort_values(['session_id', 'run_index_in_day'])
wins_by_session = df.groupby('session_id')['victory_binary']
outcomes_by_session = df.groupby('session_id')['run_outcome_ordinal']

# Running win count and win rate. Both include the current run's own result.
df['cumulative_wins_in_session'] = wins_by_session.cumsum()
df['win_rate_so_far_in_session'] = df['cumulative_wins_in_session'] / df['run_index_in_day']

# Outcome of the previous run in the same session (NaN on a session's first run).
df['prev_run_outcome_ordinal'] = outcomes_by_session.shift(1)

# 1.0 when the previous run in the session was a win, else 0.0. A session's
# first run has no predecessor: the comparison is False there, so it gets 0.0.
previous_run_won = wins_by_session.shift(1) == 1
df['after_victory'] = previous_run_won.astype(float)

for feature_name in (
    'session_id',
    'cumulative_wins_in_session',
    'win_rate_so_far_in_session',
    'prev_run_outcome_ordinal',
    'after_victory',
):
    print(f"   ‚úì {feature_name}")

# --- Strategy Categorization ---
print("\nüìä Categorizing play strategies...")

def categorize_strategy(row, evergaol_threshold=None, great_enemy_threshold=None):
    """
    Categorize one run into a strategy type based on resource allocation.

    The rules are checked in order and the first match wins:
      1. 'buff_focused'     - evergaol_cleared >= evergaol_threshold
      2. 'loot_focused'     - great_enemies_cleared >= great_enemy_threshold
      3. 'high_risk_castle' - middle_castle_visited is truthy
      4. 'speedrun'         - none of the above

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide 'evergaol_cleared', 'great_enemies_cleared' and
        'middle_castle_visited'.
    evergaol_threshold : number, optional
        Cut-off for 'buff_focused'. When omitted, the 75th percentile of the
        notebook-level df['evergaol_cleared'] is used.
    great_enemy_threshold : number, optional
        Cut-off for 'loot_focused'. When omitted, the median of the
        notebook-level df['great_enemies_cleared'] is used.

    Returns
    -------
    str
        One of 'buff_focused', 'loot_focused', 'high_risk_castle', 'speedrun'.

    Notes
    -----
    The fallback thresholds are recomputed on every call. When labelling many
    rows, compute them once and pass them in, e.g. through
    DataFrame.apply(..., axis=1, args=(evergaol_cut, great_enemy_cut)).
    """
    # Fall back to thresholds derived from the global frame only when the
    # caller did not supply them; this keeps categorize_strategy(row) working.
    if evergaol_threshold is None:
        evergaol_threshold = df['evergaol_cleared'].quantile(0.75)  # Top 25%
    if great_enemy_threshold is None:
        great_enemy_threshold = df['great_enemies_cleared'].median()

    if row['evergaol_cleared'] >= evergaol_threshold:
        return 'buff_focused'
    if row['great_enemies_cleared'] >= great_enemy_threshold:
        return 'loot_focused'
    if row['middle_castle_visited']:
        return 'high_risk_castle'
    return 'speedrun'

# Label every run by running the row-wise categorizer across the frame.
strategy_labels = df.apply(categorize_strategy, axis=1)
df['strategy_type'] = strategy_labels

print(f"   ‚úì strategy_type created")

# Number of runs that fell into each strategy bucket, most common first.
print(f"\n   Strategy distribution:")
strategy_counts = df['strategy_type'].value_counts()
print(strategy_counts)


FEATURE ENGINEERING

üìä Creating resource allocation metrics...
   ‚úì total_optional_content
   ‚úì loot_priority_score (replacing loot_quality_score)
   ‚úì buff_to_loot_ratio
   ‚úì risk_taking_score

üìä Creating efficiency metrics...
   ‚úì combat_efficiency
   ‚úì runes_per_level
   ‚úì enemies_per_level

üìä Creating session-level features...
   ‚úì session_id
   ‚úì cumulative_wins_in_session
   ‚úì win_rate_so_far_in_session
   ‚úì prev_run_outcome_ordinal
   ‚úì after_victory

üìä Categorizing play strategies...
   ‚úì strategy_type created

   Strategy distribution:
strategy_type
buff_focused        38
loot_focused        36
high_risk_castle    29
speedrun            18
Name: count, dtype: int64


# ============================================================================
# 2.5 DATA TYPE CONVERSIONS
# ============================================================================


In [15]:
banner = "=" * 60
print("\n" + banner)
print("FINALIZING DATA TYPES")
print(banner)

# Flag columns are cast to bool; columns absent from the frame are skipped.
boolean_cols = ['middle_castle_visited', 'enhanced']
for flag_col in [name for name in boolean_cols if name in df.columns]:
    df[flag_col] = df[flag_col].astype(bool)
    print(f"   ‚úì {flag_col} ‚Üí boolean")

# Label columns are cast to pandas' category dtype, again skipping absent ones.
categorical_cols = [
    'character', 'difficulty', 'nightlord', 'map',
    'run_outcome', 'team_type', 'strategy_type',
]
for label_col in [name for name in categorical_cols if name in df.columns]:
    df[label_col] = df[label_col].astype('category')
    print(f"   ‚úì {label_col} ‚Üí category")


FINALIZING DATA TYPES
   ‚úì middle_castle_visited ‚Üí boolean
   ‚úì enhanced ‚Üí boolean
   ‚úì character ‚Üí category
   ‚úì difficulty ‚Üí category
   ‚úì nightlord ‚Üí category
   ‚úì map ‚Üí category
   ‚úì run_outcome ‚Üí category
   ‚úì team_type ‚Üí category
   ‚úì strategy_type ‚Üí category


# ============================================================================
# 2.6 REMOVE OLD COLUMNS
# ============================================================================


In [16]:
banner = "=" * 60
print("\n" + banner)
print("REMOVING DEPRECATED COLUMNS")
print(banner)

# loot_quality_score is superseded by loot_priority_score; drop it when present.
# Nothing is printed or changed when the column is absent.
has_deprecated_score = 'loot_quality_score' in df.columns
if has_deprecated_score:
    print(f"\n‚ö†Ô∏è  Removing 'loot_quality_score' column")
    print(f"   Reason: Subjective scoring replaced by objective loot_priority_score")
    df = df.drop(columns=['loot_quality_score'])
    print(f"   ‚úì Column removed")


REMOVING DEPRECATED COLUMNS


# ============================================================================
# 2.7 FINAL DATASET SUMMARY
# ============================================================================


In [17]:
banner = "=" * 60
print("\n" + banner)
print("PROCESSED DATASET SUMMARY")
print(banner)

# Overall size of the processed frame.
n_rows, n_cols = df.shape
print(f"\nüìä Final dataset shape: {df.shape}")
print(f"   Rows: {n_rows}")
print(f"   Columns: {n_cols}")

# Columns added by the preprocessing cells; only those present are listed.
print("\nüìä New columns created:")
new_columns = [
    'run_outcome_ordinal', 'difficulty_ordinal', 'victory_binary',
    'total_optional_content', 'loot_priority_score', 'buff_to_loot_ratio',
    'risk_taking_score', 'combat_efficiency', 'runes_per_level',
    'enemies_per_level', 'session_id', 'cumulative_wins_in_session',
    'win_rate_so_far_in_session', 'prev_run_outcome_ordinal',
    'after_victory', 'strategy_type'
]
created_columns = [name for name in new_columns if name in df.columns]
for name in created_columns:
    print(f"   ‚úì {name}")

# How many columns there are of each dtype.
print("\nüìä Column types summary:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)



PROCESSED DATASET SUMMARY

üìä Final dataset shape: (121, 33)
   Rows: 121
   Columns: 33

üìä New columns created:
   ‚úì run_outcome_ordinal
   ‚úì difficulty_ordinal
   ‚úì victory_binary
   ‚úì total_optional_content
   ‚úì loot_priority_score
   ‚úì buff_to_loot_ratio
   ‚úì risk_taking_score
   ‚úì combat_efficiency
   ‚úì runes_per_level
   ‚úì enemies_per_level
   ‚úì session_id
   ‚úì cumulative_wins_in_session
   ‚úì win_rate_so_far_in_session
   ‚úì prev_run_outcome_ordinal
   ‚úì after_victory
   ‚úì strategy_type

üìä Column types summary:
int64       16
float64      7
bool         2
category     1
category     1
category     1
category     1
category     1
category     1
object       1
category     1
Name: count, dtype: int64


# ============================================================================
# 2.8 SAVE PROCESSED DATA
# ============================================================================


In [18]:
# Write two CSVs (the full processed frame and an analysis-ready column subset),
# then print a preview and headline statistics.
print("\n" + "="*60)
print("SAVING PROCESSED DATA")
print("="*60)

# Create output directory if it doesn't exist
# (`os` is imported with the other imports at the top of the notebook).
os.makedirs('../data/processed', exist_ok=True)

# Save processed dataset.
# NOTE: CSV is plain text, so the category dtypes set earlier are not stored
# in the file and have to be re-applied after reading it back.
output_path = '../data/processed/nightreign_processed.csv'
df.to_csv(output_path, index=False)
print(f"\n‚úì Processed data saved to: {output_path}")

# Also save a version with only analysis-ready columns
analysis_cols = [
    # Identifiers
    'run_id', 'session_id', 'run_index_in_day',
    # Outcomes (ordinal + binary)
    'run_outcome', 'run_outcome_ordinal', 'victory_binary',
    # Game context
    'character', 'difficulty', 'difficulty_ordinal', 'nightlord', 
    'enhanced', 'map', 'team_type',
    # Resource allocation
    'evergaol_cleared', 'middle_castle_visited', 'great_enemies_cleared',
    # Performance metrics
    'enemies_cleared', 'allies_rescued', 'runes_obtained', 'level',
    # Engineered features
    'total_optional_content', 'loot_priority_score', 'buff_to_loot_ratio',
    'risk_taking_score', 'combat_efficiency', 'runes_per_level',
    'strategy_type', 'after_victory'
]

# Keep the best-effort behaviour (absent columns are skipped), but report
# what was skipped so a renamed or missing column does not go unnoticed.
available_cols = [col for col in analysis_cols if col in df.columns]
skipped_cols = [col for col in analysis_cols if col not in df.columns]
if skipped_cols:
    print(f"WARNING: analysis columns not found in df (skipped): {skipped_cols}")

df_analysis = df[available_cols]
analysis_path = '../data/processed/nightreign_analysis.csv'
df_analysis.to_csv(analysis_path, index=False)
print(f"‚úì Analysis-ready data saved to: {analysis_path}")

print("\n" + "="*60)
print("‚úÖ DATA PREPROCESSING COMPLETE!")
print("="*60)

# Display final preview
print("\nPreview of processed data:")
display(df.head())

print("\nüìä Quick statistics:")
print(f"   Victory rate: {df['victory_binary'].mean()*100:.1f}%")
print(f"   Average run outcome: {df['run_outcome_ordinal'].mean():.2f} (0=first_day, 3=victory)")
print(f"   Average difficulty: {df['difficulty_ordinal'].mean():.2f} (1-5 scale)")
# mode() can return several values on a tie; [0] takes the first one.
print(f"   Most played character: {df['character'].mode()[0]}")
print(f"   Most common strategy: {df['strategy_type'].mode()[0]}")


SAVING PROCESSED DATA

‚úì Processed data saved to: ../data/processed/nightreign_processed.csv
‚úì Analysis-ready data saved to: ../data/processed/nightreign_analysis.csv

‚úÖ DATA PREPROCESSING COMPLETE!

Preview of processed data:


Unnamed: 0,run_id,run_index_in_day,character,difficulty,nightlord,enhanced,map,run_outcome,evergaol_cleared,middle_castle_visited,...,risk_taking_score,combat_efficiency,runes_per_level,enemies_per_level,session_id,cumulative_wins_in_session,win_rate_so_far_in_session,prev_run_outcome_ordinal,after_victory,strategy_type
0,1,1,recluse,depth1,Gladius,False,base,final_day,3,False,...,3,0.1125,27868.923077,6.076923,1,0,0.0,,0.0,speedrun
1,2,2,recluse,depth1,Gladius,False,base,victory,4,True,...,5,0.13,32165.066667,6.6,1,1,0.5,2.0,0.0,loot_focused
2,3,3,recluse,depth1,Caligo,False,crater,second_day,2,False,...,2,0.072072,30153.5,9.166667,1,1,0.333333,3.0,1.0,speedrun
3,4,4,recluse,depth1,Caligo,False,crater,second_day,2,False,...,2,0.131148,23050.909091,5.454545,1,1,0.25,1.0,0.0,speedrun
4,5,5,recluse,depth1,Caligo,False,crater,second_day,3,True,...,4,0.142857,28206.0,5.5,1,1,0.2,1.0,0.0,high_risk_castle



üìä Quick statistics:
   Victory rate: 33.9%
   Average run outcome: 1.97 (0=first_day, 3=victory)
   Average difficulty: 1.75 (1-5 scale)
   Most played character: recluse
   Most common strategy: buff_focused
