# Feature Engineering & Selection

**Goal**: Refine features based on granular event data, create interaction terms (breadth, tech vs social, sponsor hunters), and prepare the final dataset for modeling.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Notebook-wide display settings: ggplot styling for figures, and no cap on
# the number of DataFrame columns rendered in cell output.
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)

In [8]:
# Load the raw dues dataset and report its (rows, columns) shape
raw_csv_path = '../data/knight_hacks_dues_dataset_2025.csv'
df = pd.read_csv(raw_csv_path)
print(f"Original Shape: {df.shape}")

Original Shape: (838, 40)


## 1. Missing Value Imputation
Fill missing values in the granular event ratios, the average event rating, Discord days since join, the demographic columns, and the target.

In [9]:
# 1.1 Event ratios: a missing ratio is filled with 0
ratio_cols = [
    'gbm_ratio', 'social_ratio', 'hello_world_ratio', 'sponsorship_ratio',
    'tech_exploration_ratio', 'class_support_ratio', 'workshop_ratio',
    'ops_ratio', 'collabs_ratio',
]
df[ratio_cols] = df[ratio_cols].fillna(0)

# 1.2 Ratings: fill gaps with the mean of the observed ratings
rating_mean = df['avg_event_rating_given'].mean()
df['avg_event_rating_given'] = df['avg_event_rating_given'].fillna(rating_mean)

# 1.3 Discord days: -1 is used as a sentinel when the value is missing
# (intended to mark non-members of the Discord server)
df['discord_days_since_join'] = df['discord_days_since_join'].fillna(-1)

# 1.4 Demographics: missing answers become the explicit 'Prefer not to answer' level
for demo_col in ('gender', 'race_or_ethnicity'):
    df[demo_col] = df[demo_col].fillna('Prefer not to answer')

# 1.5 Target: a missing label is assumed to mean "did not pay dues" (0)
# NOTE(review): imputing the target adds label noise if any labels are truly
# unknown - confirm that missing really means unpaid.
df['y_paid_dues'] = df['y_paid_dues'].fillna(0)

## 2. Categorical Simplification

In [10]:
# 2.1 School: UCF vs Other
def group_school(s):
    """Collapse a school name into 'UCF' or 'Other'.

    Matching is case-insensitive so that variants such as 'ucf' or
    'University of central florida' are still recognised.

    Parameters
    ----------
    s : str or missing value
        Raw school name. Missing values (None / NaN) map to 'Other'.
        Non-string values are converted with ``str`` before matching.

    Returns
    -------
    str
        'UCF' if the name contains 'central florida' or 'ucf'
        (ignoring case), otherwise 'Other'.
    """
    if pd.isna(s):
        return 'Other'
    # casefold() gives a case-insensitive comparison; str() guards against
    # non-string cells that would otherwise raise on the `in` test.
    name = str(s).casefold()
    if 'central florida' in name or 'ucf' in name:
        return 'UCF'
    return 'Other'

# Label every row's school as 'UCF' or 'Other'
school_labels = df['school'].apply(group_school)
df['school_grouped'] = school_labels

# 2.2 Major: Tech vs Other
def group_major(m):
    """Classify a major as 'Tech' or 'Other'.

    A major counts as 'Tech' when its lower-cased text contains any of the
    tech keywords below. Missing values (None / NaN) are 'Other'.
    """
    if pd.isna(m):
        return 'Other'

    tech_markers = ('computer', 'software', 'information', 'data', 'cyber', 'web', 'robotics', 'electrical')
    lowered = m.lower()
    for marker in tech_markers:
        if marker in lowered:
            return 'Tech'
    return 'Other'

# Label every row's major as 'Tech' or 'Other'
major_labels = df['major'].apply(group_major)
df['major_grouped'] = major_labels

# 2.3 Level: Undergrad vs Grad/Other
def group_level(l):
    """Classify a level of study as 'Undergraduate' or 'Grad/Other'.

    Parameters
    ----------
    l : str or missing value
        Raw level-of-study text.

    Returns
    -------
    str
        'Undergraduate' if the text contains 'Undergraduate', otherwise
        'Grad/Other'. Missing values (None / NaN) also map to 'Grad/Other'
        so the feature has exactly two levels rather than a separate
        'Other' bucket.
    """
    if pd.isna(l):
        return 'Grad/Other'
    if 'Undergraduate' in l:
        return 'Undergraduate'
    return 'Grad/Other'

# Label every row's level of study using group_level
level_labels = df['level_of_study'].apply(group_level)
df['level_grouped'] = level_labels

## 3. Numerical Feature Engineering & Interactions

In [11]:
# 3.1 Tenure in years (days since join divided by 365)
df['tenure_years'] = df['member_days_since_join'].div(365.0)

# 3.2 Discord active flag: 1 when discord_msgs_year is greater than 0
df['is_discord_active'] = df['discord_msgs_year'].gt(0).astype(int)

# 3.3 Engagement breadth: number of distinct event TYPES with at least one attendance
event_type_cols = [
    'n_gbm_year', 'n_social_year', 'n_hello_world_year', 'n_sponsorship_year',
    'n_tech_exploration_year', 'n_class_support_year', 'n_workshop_year',
    'n_ops_year', 'n_collabs_year'
]
attended_any = df[event_type_cols] > 0
df['engagement_breadth'] = attended_any.sum(axis=1)

# 3.4 Tech vs social scores
# Tech   = workshops + collabs + hello world + tech exploration + class support
# Social = socials + GBMs
df['tech_score'] = (
    df['n_workshop_year']
    .add(df['n_collabs_year'])
    .add(df['n_hello_world_year'])
    .add(df['n_tech_exploration_year'])
    .add(df['n_class_support_year'])
)

df['social_score'] = df['n_social_year'].add(df['n_gbm_year'])

# Ratio of tech to social attendance; the denominator is social_score + 1
# so that rows with zero social events do not divide by zero.
df['tech_social_ratio'] = df['tech_score'].div(df['social_score'].add(1))

# 3.5 Sponsor Hunter Flag
# Someone who attends >0 sponsorship events and they make up >= 80% of their total events
# Avoid div by zero by using condition
def check_sponsor_hunter(row):
    """Return 1 if the row looks like a "sponsor hunter", else 0.

    A sponsor hunter has a non-zero ``n_sponsorship_year`` and a non-zero
    ``events_attended_year``, with sponsorship events making up at least
    80% of all events attended. The zero checks come first so the division
    never sees a zero denominator.
    """
    sponsor_events = row['n_sponsorship_year']
    if sponsor_events == 0:
        return 0

    total_events = row['events_attended_year']
    if total_events == 0:
        return 0

    sponsor_share = sponsor_events / total_events
    return int(sponsor_share >= 0.8)

# Evaluate the sponsor-hunter rule row by row, store the flag, and report the count
sponsor_flags = df.apply(check_sponsor_hunter, axis=1)
df['is_sponsor_hunter'] = sponsor_flags

print("Sponsor Hunters Found:", sponsor_flags.sum())

Sponsor Hunters Found: 37


## 4. Feature Selection & Cleanup
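Drop the raw text and demographic columns and the event-ratio columns that showed weak signal; `ops_ratio` and `class_support_ratio` are kept.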

In [12]:
# Raw text / demographic columns that are not carried into the modeling table
raw_cols = ['discordUser', 'school', 'major', 'level_of_study', 'gender', 'race_or_ethnicity']

# Raw ratios that were weak (social, workshop, sponsorship, hello world, tech, collabs, gbm).
# ops_ratio and class_support_ratio are kept because they had signal.
weak_ratio_cols = [
    'social_ratio', 'workshop_ratio', 'sponsorship_ratio',
    'hello_world_ratio', 'tech_exploration_ratio', 'collabs_ratio', 'gbm_ratio',
]

drop_cols = raw_cols + weak_ratio_cols

# Build the cleaned frame as a new object; df itself is left untouched
df_clean = df.drop(columns=drop_cols)
print(f"Cleaned Shape: {df_clean.shape}")
df_clean.head()

Cleaned Shape: (838, 37)


Unnamed: 0,member_days_since_join,events_attended_year,distinct_event_days_year,attendance_streak_weeks,n_gbm_year,n_social_year,n_hello_world_year,n_sponsorship_year,n_tech_exploration_year,n_class_support_year,class_support_ratio,n_workshop_year,n_ops_year,ops_ratio,n_collabs_year,feedback_count_year,avg_event_rating_given,left_any_feedback,has_gone_to_hackathon,discord_member,discord_days_since_join,has_role_ops,discord_msgs_year,discord_active_days_year,discord_channels_posted_year,discord_num_roles,y_paid_dues,school_grouped,major_grouped,level_grouped,tenure_years,is_discord_active,engagement_breadth,tech_score,social_score,tech_social_ratio,is_sponsor_hunter
0,84,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,9.084256,0,0,1,1177.0,0,0,0,0,12,1.0,UCF,Tech,Undergraduate,0.230137,0,0,0,0,0.0,0
1,74,6,6,3,0,1,0,0,0,0,0.0,0,5,0.833333,0,0,9.084256,0,1,1,142.0,1,1594,45,9,18,1.0,UCF,Tech,Undergraduate,0.20274,1,2,0,1,0.0,0
2,64,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,9.084256,0,1,1,443.0,0,0,0,0,7,1.0,UCF,Tech,Undergraduate,0.175342,0,0,0,0,0.0,0
3,308,8,13,9,2,0,0,0,0,1,0.125,0,5,0.625,0,0,9.084256,0,1,1,449.0,1,3328,161,13,15,1.0,UCF,Tech,Undergraduate,0.843836,1,3,1,2,0.333333,0
4,308,4,8,3,1,0,0,0,1,2,0.5,0,0,0.0,0,0,9.084256,0,1,1,414.0,0,2,1,1,5,1.0,UCF,Tech,Undergraduate,0.843836,1,3,3,1,1.5,0


## 5. Check New Feature Correlations

In [13]:
# Engineered features plus the target, so the target's column of the
# correlation matrix can be read off directly.
new_feats = [
    'engagement_breadth', 'tech_score', 'social_score',
    'tech_social_ratio', 'is_sponsor_hunter', 'y_paid_dues',
]
corr_matrix = df_clean[new_feats].corr()
# Correlation of each feature with y_paid_dues, strongest positive first
corr = corr_matrix['y_paid_dues'].sort_values(ascending=False)
print(corr)

y_paid_dues           1.000000
engagement_breadth    0.290735
social_score          0.269967
tech_score            0.169000
tech_social_ratio     0.049995
is_sponsor_hunter    -0.047234
Name: y_paid_dues, dtype: float64


In [14]:
# Write the cleaned feature table to disk without the DataFrame index
output_path = '../data/knight_hacks_dues_processed.csv'
df_clean.to_csv(path_or_buf=output_path, index=False)
print(f"Saved processed data to {output_path}")

Saved processed data to ../data/knight_hacks_dues_processed.csv
