In [195]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sqlalchemy import create_engine, inspect

In [196]:
train_main = pd.read_csv('data/ddakji_level_4_main_train.csv')
test_main = pd.read_csv('data/ddakji_level_4_test.csv')

In [197]:
db_url = 'postgresql://td_participant:datathon!@45.79.28.247:5432/ddakji_db'
engine = create_engine(db_url)

In [198]:
inspector = inspect(engine)
table_names = inspector.get_table_names()

print("Tables available in the database:")
for table in table_names:
    print(f"- {table}")

Tables available in the database:
- level4
- level5


In [199]:
train_supp = pd.read_sql('SELECT * FROM level4', engine)

# Merge training datasets
train_merged = pd.merge(train_main, train_supp, on='Throw_IDs', how='left')

In [200]:
train_merged['Times_Practiced'] = train_merged.apply(
    lambda row: row['Times_Practiced_x'] 
                if row['Times_Practiced_x'] == row['Times_Practiced_y'] 
                else row['Times_Practiced_x'],
    axis=1
)
train_merged.drop(columns=['Times_Practiced_x', 'Times_Practiced_y'], inplace=True)

# --- Step 5: Match columns in test (without target) carefully ---
missing_cols_in_test = set(train_merged.columns) - set(test_main.columns) - {'Flip_Result'}
for col in missing_cols_in_test:
    if train_merged[col].dtype in ['float64', 'int64']:
        test_main[col] = train_merged[col].median()
    else:
        test_main[col] = 'Unknown'

# Fix "Times_Practiced" in test_main explicitly if needed
if 'Times_Practiced_x' in test_main.columns:
    test_main.rename(columns={'Times_Practiced_x': 'Times_Practiced'}, inplace=True)

test_main = test_main[train_merged.drop(columns='Flip_Result').columns]


In [201]:
train_merged.head(10)

Unnamed: 0,Throw_IDs,Player_Consistency_Score,Cumulative_Impact_Force,Throws_Per_Week,Throw_Technique_Style,Player_Experience_Level,Throw_Accuracy_Deviation,Flip_Result,Times_Adjusted_Grip,Impact_Point,Times_Practiced
0,102500,14.0,751.0,7.0,Steady_Push,Rookie (0-25),10.0,Yes,5.0,Face_Hit,95248.0
1,101073,32.0,972.0,23.0,Quick_Flick,Veteran (46-65),0.0,Yes,5.0,Edge_Hit,28826.0
2,100095,5.0,164.0,7.0,Steady_Push,Experienced (26-45),0.0,Yes,4.0,Corner_Hit,91146.0
3,101077,12.0,556.0,16.0,Calculated_Lob,Experienced (26-45),1.0,Yes,5.0,Edge_Hit,41586.0
4,100872,34.0,835.0,13.0,Steady_Push,Experienced (26-45),23.0,Yes,1.0,Corner_Hit,135391.0
5,102922,32.0,158.0,2.0,Steady_Push,Veteran (46-65),0.0,Yes,1.0,Face_Hit,127876.0
6,100113,39.0,998.26,17.0,Steady_Push,Experienced (26-45),3.0,No,0.0,Corner_Hit,423038.0
7,102373,26.0,487.0,27.0,Calculated_Lob,Experienced (26-45),7.0,Yes,0.0,Corner_Hit,200880.0
8,100812,36.0,966.48,22.0,Calculated_Lob,Experienced (26-45),18.0,No,1.0,Edge_Hit,362926.0
9,103089,59.0,663.87,18.0,Calculated_Lob,Veteran (46-65),1.0,No,4.0,Face_Hit,314695.0


In [202]:
# missing_cols_in_test = set(train_merged.columns) - set(test_main.columns) - {'Flip_Result'}
# for col in missing_cols_in_test:
#     test_main[col] = 0  # default numeric fill (0 is usually fine)

In [203]:
# test_main = test_main[train_merged.drop(columns='Flip_Result').columns]
categorical_cols = ['Throw_Technique_Style', 'Player_Experience_Level', 'Impact_Point']

In [204]:
test_main.head()

Unnamed: 0,Throw_IDs,Player_Consistency_Score,Cumulative_Impact_Force,Throws_Per_Week,Throw_Technique_Style,Player_Experience_Level,Throw_Accuracy_Deviation,Times_Adjusted_Grip,Impact_Point,Times_Practiced
0,100439,24.0,692.0,7.0,Quick_Flick,Veteran (46-65),29.0,9.0,Face_Hit,164271.0
1,100206,47.0,656.42,27.0,Calculated_Lob,Experienced (26-45),15.0,0.0,Face_Hit,333724.0
2,100375,19.0,567.61,29.0,Steady_Push,Veteran (46-65),8.0,1.0,Corner_Hit,427302.0
3,100561,20.0,997.47,16.0,Steady_Push,Experienced (26-45),0.0,1.0,Edge_Hit,397029.0
4,100493,36.0,329.0,30.0,Steady_Push,Experienced (26-45),23.0,5.0,Edge_Hit,44943.0


Numeric Columns

In [205]:
numeric_cols = train_merged.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_cols.remove('Throw_IDs')  # ID shouldn't be imputed

In [206]:
combined_df = pd.concat([train_merged, test_main], keys=['train', 'test'])
# one hot
combined_df_encoded = pd.get_dummies(combined_df, columns=categorical_cols, drop_first=True)


In [207]:
train_encoded = combined_df_encoded.loc['train'].copy()
test_encoded = combined_df_encoded.loc['test'].copy()

In [208]:
numeric_cols = train_encoded.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_cols.remove('Throw_IDs')  # Don't impute Throw_IDs.

imputer = SimpleImputer(strategy='mean')

train_encoded.loc[:, numeric_cols] = imputer.fit_transform(train_encoded.loc[:, numeric_cols])
test_encoded.loc[:, numeric_cols] = imputer.transform(test_encoded.loc[:, numeric_cols])


In [209]:
train_encoded['Flip_Result'] = train_encoded['Flip_Result'].map({'Yes': 1, 'No': 0})

# --- Step 11: Ensure no NaNs in target ---
train_encoded = train_encoded.dropna(subset=['Flip_Result'])


In [210]:
X_train = train_encoded.drop(columns=['Throw_IDs', 'Flip_Result'])
y_train = train_encoded['Flip_Result']

X_test = test_encoded.drop(columns=['Throw_IDs', 'Flip_Result'])


In [211]:
model = RandomForestClassifier(n_estimators=150, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

In [212]:
train_preds = model.predict(X_train)
accuracy = accuracy_score(y_train, train_preds)
print(f"✅ Training Accuracy: {accuracy:.2%}")

# Step 12: Make predictions for test set
test_preds = model.predict(X_test)
test_preds_labels = ['Yes' if pred == 1 else 'No' for pred in test_preds]

# Step 13: Export submission CSV
submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],
    'Flip_Result': test_preds_labels
})

submission.to_csv('submission.csv', index=False)
print("✅ Submission CSV created successfully!")

✅ Training Accuracy: 100.00%
✅ Submission CSV created successfully!


### DB DEBUGGING

In [213]:
import pandas as pd
from sqlalchemy import create_engine

# Connect to your database
db_url = 'postgresql://td_participant:datathon!@45.79.28.247:5432/ddakji_db'
engine = create_engine(db_url)

# Load main CSV
train_main = pd.read_csv('data/ddakji_level_4_main_train.csv')

# Load supplemental SQL data
train_supp = pd.read_sql('SELECT * FROM level4', engine)

# Merge datasets
train_merged = pd.merge(train_main, train_supp, on='Throw_IDs', how='left')

# Inspect merged DataFrame thoroughly
print(train_merged.head())
print(train_merged.info())
print(train_merged.isnull().sum())

# Save merged data temporarily if needed
train_merged.to_csv('merged_train_debug.csv', index=False)


   Throw_IDs  Times_Practiced_x  Player_Consistency_Score  \
0     102500            95248.0                      14.0   
1     101073            28826.0                      32.0   
2     100095            91146.0                       5.0   
3     101077            41586.0                      12.0   
4     100872           135391.0                      34.0   

   Cumulative_Impact_Force  Throws_Per_Week Throw_Technique_Style  \
0                    751.0              7.0           Steady_Push   
1                    972.0             23.0           Quick_Flick   
2                    164.0              7.0           Steady_Push   
3                    556.0             16.0        Calculated_Lob   
4                    835.0             13.0           Steady_Push   

  Player_Experience_Level  Throw_Accuracy_Deviation Flip_Result  \
0           Rookie (0-25)                      10.0         Yes   
1         Veteran (46-65)                       0.0         Yes   
2     Experienced

In [214]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Step 1: Load Data ---
train_main = pd.read_csv('data/ddakji_level_4_main_train.csv')
test_main = pd.read_csv('data/ddakji_level_4_test.csv')

# --- Step 2: SQL Connection & Supplemental Data ---
db_url = 'postgresql://td_participant:datathon!@45.79.28.247:5432/ddakji_db'
engine = create_engine(db_url)
train_supp = pd.read_sql('SELECT * FROM level4', engine)

# --- Step 3: Merge Supplemental Data ---
train_merged = pd.merge(train_main, train_supp, on='Throw_IDs', how='left')

# --- Step 4: Clearly Handle "Times_Practiced" ---
train_merged['Times_Practiced'] = train_merged.apply(
    lambda row: row['Times_Practiced_x'] 
                if row['Times_Practiced_x'] == row['Times_Practiced_y'] 
                else (row['Times_Practiced_x']),
    axis=1
)
train_merged.drop(columns=['Times_Practiced_x', 'Times_Practiced_y'], inplace=True)

# --- Step 5: Match columns in test (without target) carefully ---
missing_cols_in_test = set(train_merged.columns) - set(test_main.columns) - {'Flip_Result'}
for col in missing_cols_in_test:
    if train_merged[col].dtype in ['float64', 'int64']:
        test_main[col] = train_merged[col].median()
    else:
        test_main[col] = 'Unknown'

# Fix "Times_Practiced" in test_main explicitly if needed
if 'Times_Practiced_x' in test_main.columns:
    test_main.rename(columns={'Times_Practiced_x': 'Times_Practiced'}, inplace=True)

test_main = test_main[train_merged.drop(columns='Flip_Result').columns]

# --- Step 6: Identify categorical columns explicitly ---
categorical_cols = ['Throw_Technique_Style', 'Player_Experience_Level', 'Impact_Point']

# --- Step 7: Combine datasets ---
combined_df = pd.concat([train_merged, test_main], keys=['train', 'test'])

# --- Step 8: One-Hot Encoding ---
combined_df_encoded = pd.get_dummies(combined_df, columns=categorical_cols, drop_first=True)

# --- Step 9: Split back explicitly ---
train_encoded = combined_df_encoded.loc['train'].copy()
test_encoded = combined_df_encoded.loc['test'].copy()

# --- Step 10: Numeric columns and Imputation ---
numeric_cols = train_encoded.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_cols.remove('Throw_IDs')

imputer = SimpleImputer(strategy='mean')

train_encoded.loc[:, numeric_cols] = imputer.fit_transform(train_encoded.loc[:, numeric_cols])
test_encoded.loc[:, numeric_cols] = imputer.transform(test_encoded.loc[:, numeric_cols])

# --- Step 11: Encode target explicitly ---
train_encoded['Flip_Result'] = train_encoded['Flip_Result'].map({'Yes': 1, 'No': 0})
train_encoded = train_encoded.dropna(subset=['Flip_Result'])

# --- Step 12: Prepare X and y explicitly ---
X_train = train_encoded.drop(columns=['Throw_IDs', 'Flip_Result'])
y_train = train_encoded['Flip_Result']
X_test = test_encoded.drop(columns=['Throw_IDs', 'Flip_Result'], errors='ignore')

# --- Step 13: Train model (balanced classes) ---
model = RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# --- Step 14: Verify accuracy ---
train_preds = model.predict(X_train)
print(f"✅ Training Accuracy: {accuracy_score(y_train, train_preds):.2%}")

# --- Step 15: Generate predictions explicitly ---
test_preds = model.predict(X_test)
test_preds_labels = ['Yes' if pred == 1 else 'No' for pred in test_preds]

# --- Step 16: Export final submission ---
submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],
    'Flip_Result': test_preds_labels
})

submission.to_csv('submission.csv', index=False)
print("✅ Submission CSV created successfully!")
print(submission['Flip_Result'].value_counts())


✅ Training Accuracy: 100.00%
✅ Submission CSV created successfully!
Flip_Result
Yes    466
No     334
Name: count, dtype: int64


In [215]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# --- Step 13: Train both models ---
rf_model = RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced')
gb_model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, random_state=42)

rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# --- Step 14: Evaluate training performance of each model (optional) ---
from sklearn.metrics import accuracy_score

rf_preds_train = rf_model.predict(X_train)
gb_preds_train = gb_model.predict(X_train)
rf_acc = accuracy_score(y_train, rf_preds_train)
gb_acc = accuracy_score(y_train, gb_preds_train)

print(f"✅ Random Forest Training Accuracy: {rf_acc:.2%}")
print(f"✅ Gradient Boosting Training Accuracy: {gb_acc:.2%}")

# --- Step 15: Ensemble predictions on test set ---
rf_probs = rf_model.predict_proba(X_test)[:, 1]  # Probability of class "1" (Yes)
gb_probs = gb_model.predict_proba(X_test)[:, 1]

# Average the probabilities
ensemble_probs = (rf_probs + gb_probs) / 2

# Final prediction: probability > 0.5 → "Yes", otherwise "No"
final_preds = ['Yes' if prob > 0.5 else 'No' for prob in ensemble_probs]

# --- Step 16: Create final submission.csv ---
submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],
    'Flip_Result': final_preds
})

submission.to_csv('submission.csv', index=False)
print("✅ Ensemble submission.csv created successfully!")
print(submission['Flip_Result'].value_counts())


✅ Random Forest Training Accuracy: 100.00%
✅ Gradient Boosting Training Accuracy: 100.00%
✅ Ensemble submission.csv created successfully!
Flip_Result
Yes    466
No     334
Name: count, dtype: int64


In [216]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np


In [217]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        inputs = inputs.clamp(min=1e-7, max=1 - 1e-7)  # avoid log(0)
        BCE = F.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.where(targets == 1, inputs, 1 - inputs)
        loss = self.alpha * (1 - pt) ** self.gamma * BCE
        return loss.mean()

In [218]:
# --- Step 1: Convert your final cleaned data into tensors ---
X_train_np = X_train.values.astype(np.float32)
y_train_np = y_train.values.astype(np.float32).reshape(-1, 1)

X_test_np = X_test.values.astype(np.float32)

# Split train into train/val
X_tr, X_val, y_tr, y_val = train_test_split(X_train_np, y_train_np, test_size=0.15, random_state=42)

# Torch datasets
train_ds = TensorDataset(torch.tensor(X_tr), torch.tensor(y_tr))
val_ds = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
test_tensor = torch.tensor(X_test_np)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# --- Step 2: Define the neural network ---
class FlipNet(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.drop1 = nn.Dropout(0.2)
        
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.drop2 = nn.Dropout(0.2)
        
        self.out = nn.Linear(64, 1)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.drop1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.drop2(x)
        return torch.sigmoid(self.out(x))

# --- Step 3: Training Setup ---
model = FlipNet(X_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# loss_fn = nn.BCELoss()
loss_fn = FocalLoss(alpha=0.15, gamma=2)

best_val_acc = 0
patience, patience_limit = 0, 10

# --- Step 4: Train the model ---
for epoch in range(1000):
    model.train()
    for xb, yb in train_loader:
        preds = model(xb)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation accuracy
    model.eval()
    with torch.no_grad():
        val_preds = model(torch.tensor(X_val))
        val_pred_labels = (val_preds.numpy() > 0.5).astype(int)
        acc = accuracy_score(y_val, val_pred_labels)

    print(f"Epoch {epoch+1}: Val Accuracy = {acc:.4f}")

    # # Early stopping
    # if acc > best_val_acc:
    if acc > best_val_acc:
        best_val_acc = acc
        best_model_state = model.state_dict()
        patience = 0
    else:
        patience += 1
        if patience >= patience_limit:
            print("Early stopping triggered.")
            break

# --- Step 5: Load best model and predict on test set ---
model.load_state_dict(best_model_state)
model.eval()
with torch.no_grad():
    test_probs = model(test_tensor).numpy().flatten()
    test_labels = ['Yes' if p > 0.5 else 'No' for p in test_probs]

# --- Step 6: Create submission.csv ---
submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],
    'Flip_Result': test_labels
})

submission.to_csv('submission.csv', index=False)
print("✅ Neural network submission.csv created successfully!")
print(submission['Flip_Result'].value_counts())


Epoch 1: Val Accuracy = 0.9812
Epoch 2: Val Accuracy = 0.9250
Epoch 3: Val Accuracy = 0.9646
Epoch 4: Val Accuracy = 0.9708
Epoch 5: Val Accuracy = 0.9208
Epoch 6: Val Accuracy = 0.9542
Epoch 7: Val Accuracy = 0.9688
Epoch 8: Val Accuracy = 0.9875
Epoch 9: Val Accuracy = 0.9667
Epoch 10: Val Accuracy = 0.8833
Epoch 11: Val Accuracy = 0.9792
Epoch 12: Val Accuracy = 0.9875
Epoch 13: Val Accuracy = 0.9646
Epoch 14: Val Accuracy = 0.9417
Epoch 15: Val Accuracy = 0.9458
Epoch 16: Val Accuracy = 0.9729
Epoch 17: Val Accuracy = 0.9792
Epoch 18: Val Accuracy = 0.9667
Early stopping triggered.
✅ Neural network submission.csv created successfully!
Flip_Result
Yes    494
No     306
Name: count, dtype: int64


In [219]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# --- Focal Loss ---
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        inputs = inputs.clamp(min=1e-7, max=1 - 1e-7)
        BCE = F.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.where(targets == 1, inputs, 1 - inputs)
        loss = self.alpha * (1 - pt) ** self.gamma * BCE
        return loss.mean()

# --- Neural Network ---
class FlipNet(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.drop1 = nn.Dropout(0.3)

        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.drop2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.drop3 = nn.Dropout(0.2)

        self.out = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.drop1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.drop2(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.drop3(x)
        return torch.sigmoid(self.out(x))

# --- Step 1: Preprocessing ---
X_train_np = X_train.values.astype(np.float32)
y_train_np = y_train.values.astype(np.float32).reshape(-1, 1)
X_test_np = X_test.values.astype(np.float32)

X_tr, X_val, y_tr, y_val = train_test_split(X_train_np, y_train_np, test_size=0.15, random_state=42)

train_ds = TensorDataset(torch.tensor(X_tr), torch.tensor(y_tr))
val_ds = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
test_tensor = torch.tensor(X_test_np)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# --- Step 2: Training Setup ---
model = FlipNet(X_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
loss_fn = FocalLoss(alpha=0.15, gamma=2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.5, verbose=True)

best_val_acc = 0
patience = 0
patience_limit = 15

# --- Step 3: Training Loop ---
for epoch in range(200):
    model.train()
    for xb, yb in train_loader:
        # Add tiny input noise for regularization
        noise = torch.randn_like(xb) * 0.01
        preds = model(xb + noise)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # --- Step 4: Validation ---
    model.eval()
    with torch.no_grad():
        val_preds = model(torch.tensor(X_val))
        val_pred_labels = (val_preds.numpy() > 0.5).astype(int)
        acc = accuracy_score(y_val, val_pred_labels)

    print(f"Epoch {epoch+1}: Val Accuracy = {acc:.4f}")
    scheduler.step(acc)

    if acc > best_val_acc:
        best_val_acc = acc
        best_model_state = model.state_dict()
        patience = 0
    else:
        patience += 1
        if patience >= patience_limit:
            print("✅ Early stopping triggered.")
            break

# --- Step 5: Load best model and predict on test set ---
model.load_state_dict(best_model_state)
model.eval()

# Enable MC Dropout during prediction for robustness
def predict_mc_dropout(model, x_tensor, n=10):
    model.train()  # force dropout ON
    preds = [model(x_tensor).detach().numpy().flatten() for _ in range(n)]
    return np.mean(preds, axis=0)

with torch.no_grad():
    test_probs = predict_mc_dropout(model, test_tensor, n=10)

# --- Step 6: Threshold tuning ---
threshold = 0.46  # tune this!
test_labels = ['Yes' if p > threshold else 'No' for p in test_probs]

# --- Step 7: Create submission.csv ---
submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],
    'Flip_Result': test_labels
})

submission.to_csv('submission.csv', index=False)
print("✅ Neural network submission.csv created successfully!")
print(submission['Flip_Result'].value_counts())




Epoch 1: Val Accuracy = 0.9729
Epoch 2: Val Accuracy = 0.9688
Epoch 3: Val Accuracy = 0.9667
Epoch 4: Val Accuracy = 0.9354
Epoch 5: Val Accuracy = 0.9896
Epoch 6: Val Accuracy = 0.9875
Epoch 7: Val Accuracy = 0.9875
Epoch 8: Val Accuracy = 0.9875
Epoch 9: Val Accuracy = 0.9250
Epoch 10: Val Accuracy = 0.9896
Epoch 11: Val Accuracy = 0.9771
Epoch 12: Val Accuracy = 0.9333
Epoch 13: Val Accuracy = 0.9875
Epoch 14: Val Accuracy = 0.9875
Epoch 15: Val Accuracy = 0.9896
Epoch 16: Val Accuracy = 0.9896
Epoch 17: Val Accuracy = 0.9875
Epoch 18: Val Accuracy = 0.9854
Epoch 19: Val Accuracy = 0.9854
Epoch 20: Val Accuracy = 0.9833
✅ Early stopping triggered.
✅ Neural network submission.csv created successfully!
Flip_Result
Yes    472
No     328
Name: count, dtype: int64


### XGBOOST

In [220]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Prepare XGBoost DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Set parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1,
    'alpha': 0,
    'seed': 42,
    
}

# Train the model
booster = xgb.train(params, dtrain, num_boost_round=200)

# Predict
test_probs = booster.predict(dtest)
test_preds = ['Yes' if p > 0.5 else 'No' for p in test_probs]

# Create submission
submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],
    'Flip_Result': test_preds
})
submission.to_csv('submission_xgboost.csv', index=False)
print("✅ XGBoost submission created!")
print(submission['Flip_Result'].value_counts())


✅ XGBoost submission created!
Flip_Result
Yes    465
No     335
Name: count, dtype: int64


In [221]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

#############################################
#           XGBoost Model Section           #
#############################################

# Prepare data for XGBoost
X_train_np = X_train.values.astype(np.float32)
# Fix: Convert y_train to a NumPy array before reshaping
y_train_np = y_train.values.astype(np.float32).reshape(-1, 1)
X_test_np = X_test.values.astype(np.float32)

dtrain = xgb.DMatrix(X_train_np, label=y_train_np)
dtest = xgb.DMatrix(X_test_np)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1,
    'alpha': 0,
    'seed': 42,
}
booster = xgb.train(params, dtrain, num_boost_round=200)
xgb_probs = booster.predict(dtest)

#############################################
#        Neural Network Model Section       #
#############################################

# --- Focal Loss ---
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.15, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    def forward(self, inputs, targets):
        inputs = inputs.clamp(min=1e-7, max=1 - 1e-7)
        BCE = F.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.where(targets == 1, inputs, 1 - inputs)
        loss = self.alpha * (1 - pt) ** self.gamma * BCE
        return loss.mean()

# --- Neural Network Architecture ---
class FlipNet(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.drop1 = nn.Dropout(0.3)
        
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.drop2 = nn.Dropout(0.3)
        
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.drop3 = nn.Dropout(0.2)
        
        self.out = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.drop1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.drop2(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.drop3(x)
        return torch.sigmoid(self.out(x))

# Prepare data for the NN using the same NumPy arrays from above
X_train_nn = X_train_np.copy()
y_train_nn = y_train_np.copy()
X_test_nn = X_test_np.copy()

# Split training data into train/validation sets
X_tr, X_val, y_tr, y_val = train_test_split(X_train_nn, y_train_nn, test_size=0.15, random_state=42)

# Create PyTorch datasets and loaders
train_ds = TensorDataset(torch.tensor(X_tr), torch.tensor(y_tr))
val_ds = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
test_tensor = torch.tensor(X_test_nn)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# Initialize model, optimizer, loss function, and scheduler
model = FlipNet(X_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
loss_fn = FocalLoss(alpha=0.15, gamma=2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.5, verbose=True)

best_val_acc = 0
patience = 0
patience_limit = 18

# Training loop
for epoch in range(200):
    model.train()
    for xb, yb in train_loader:
        # Add a small amount of noise for regularization
        noise = torch.randn_like(xb) * 0.01
        preds = model(xb + noise)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Validation phase
    model.eval()
    with torch.no_grad():
        val_preds = model(torch.tensor(X_val))
        val_pred_labels = (val_preds.numpy() > 0.5).astype(int)
        acc = accuracy_score(y_val, val_pred_labels)
    print(f"Epoch {epoch+1}: NN Val Accuracy = {acc:.4f}")
    scheduler.step(acc)
    
    if acc > best_val_acc:
        best_val_acc = acc
        best_model_state = model.state_dict()
        patience = 0
    else:
        patience += 1
        if patience >= patience_limit:
            print("NN Early stopping triggered.")
            break

model.load_state_dict(best_model_state)
model.eval()

# Enable MC Dropout for robust NN inference
def predict_mc_dropout(model, x_tensor, n=10):
    model.train()  # force dropout active during prediction
    preds = [model(x_tensor).detach().numpy().flatten() for _ in range(n)]
    return np.mean(preds, axis=0)

with torch.no_grad():
    nn_probs = predict_mc_dropout(model, test_tensor, n=10)

#############################################
#         Ensemble Predictions Section      #
#############################################

# Average the probabilities from XGBoost and the NN
ensemble_probs = (xgb_probs*(0.95) + nn_probs*(1.05)) / 2
threshold = 0.47  # Adjust the threshold as needed
ensemble_preds = ['Yes' if p > threshold else 'No' for p in ensemble_probs]

#############################################
#         Submission CSV Generation         #
#############################################

submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],  # Ensure test_encoded has the 'Throw_IDs' column
    'Flip_Result': ensemble_preds
})
submission.to_csv('submission_ensemble.csv', index=False)
print("✅ Ensemble submission created!")
print(submission['Flip_Result'].value_counts())




Epoch 1: NN Val Accuracy = 0.9771
Epoch 2: NN Val Accuracy = 0.9667
Epoch 3: NN Val Accuracy = 0.9646
Epoch 4: NN Val Accuracy = 0.9250
Epoch 5: NN Val Accuracy = 0.9563
Epoch 6: NN Val Accuracy = 0.9458
Epoch 7: NN Val Accuracy = 0.9833
Epoch 8: NN Val Accuracy = 0.9875
Epoch 9: NN Val Accuracy = 0.9667
Epoch 10: NN Val Accuracy = 0.9896
Epoch 11: NN Val Accuracy = 0.9667
Epoch 12: NN Val Accuracy = 0.9375
Epoch 13: NN Val Accuracy = 0.9125
Epoch 14: NN Val Accuracy = 0.9167
Epoch 15: NN Val Accuracy = 0.9292
Epoch 16: NN Val Accuracy = 0.9854
Epoch 17: NN Val Accuracy = 0.9854
Epoch 18: NN Val Accuracy = 0.9688
Epoch 19: NN Val Accuracy = 0.9833
Epoch 20: NN Val Accuracy = 0.9854
Epoch 21: NN Val Accuracy = 0.9833
Epoch 22: NN Val Accuracy = 0.9854
Epoch 23: NN Val Accuracy = 0.9875
Epoch 24: NN Val Accuracy = 0.9833
Epoch 25: NN Val Accuracy = 0.9854
Epoch 26: NN Val Accuracy = 0.9854
Epoch 27: NN Val Accuracy = 0.9854
Epoch 28: NN Val Accuracy = 0.9875
NN Early stopping triggered.


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

#############################################
#           Data Preparation              #
#############################################

# Convert data to NumPy arrays (ensuring float32 for features and proper shape for target)
X_train_np = X_train.values.astype(np.float32)
y_train_np = y_train.values.astype(np.float32).reshape(-1, 1)  # Fixed: use .values then reshape
X_test_np  = X_test.values.astype(np.float32)

# XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1,
    'alpha': 0,
    'seed': 42,
}

#############################################
#         Neural Network Setup            #
#############################################

# Focal Loss definition
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.15, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    def forward(self, inputs, targets):
        inputs = inputs.clamp(min=1e-7, max=1 - 1e-7)
        BCE = F.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.where(targets == 1, inputs, 1 - inputs)
        loss = self.alpha * (1 - pt) ** self.gamma * BCE
        return loss.mean()

# Neural network architecture (FlipNet)
class FlipNet(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.drop1 = nn.Dropout(0.3)
        
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.drop2 = nn.Dropout(0.3)
        
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.drop3 = nn.Dropout(0.2)
        
        self.out = nn.Linear(32, 1)
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.drop1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.drop2(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.drop3(x)
        return torch.sigmoid(self.out(x))

# Function to train the NN on a given fold
def train_nn_on_fold(X_train_fold, y_train_fold, input_size, epochs=50):
    X_train_fold_tensor = torch.tensor(X_train_fold)
    y_train_fold_tensor = torch.tensor(y_train_fold)
    dataset = TensorDataset(X_train_fold_tensor, y_train_fold_tensor)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    
    model = FlipNet(input_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    loss_fn = FocalLoss(alpha=0.15, gamma=2)
    
    best_loss = np.inf
    best_model_state = None
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for xb, yb in loader:
            noise = torch.randn_like(xb) * 0.01
            preds = model(xb + noise)
            loss = loss_fn(preds, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(loader)
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_model_state = model.state_dict()
    model.load_state_dict(best_model_state)
    return model

# Function for Monte Carlo Dropout predictions
def predict_mc_dropout(model, x_tensor, n=10):
    model.train()  # force dropout to be active
    preds = [model(x_tensor).detach().numpy().flatten() for _ in range(n)]
    return np.mean(preds, axis=0)

#############################################
#       Out-of-Fold Stacking Setup        #
#############################################

n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
# Prepare an array to hold out-of-fold predictions for both models
oof_preds = np.zeros((X_train_np.shape[0], 2))  # column 0: XGBoost; column 1: NN

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_np, y_train_np.ravel())):
    print(f"Fold {fold+1}")
    X_tr_fold = X_train_np[train_idx]
    y_tr_fold = y_train_np[train_idx]
    X_val_fold = X_train_np[val_idx]
    y_val_fold = y_train_np[val_idx]
    
    # XGBoost for current fold
    dtrain_fold = xgb.DMatrix(X_tr_fold, label=y_tr_fold)
    booster_fold = xgb.train(params, dtrain_fold, num_boost_round=200)
    oof_preds[val_idx, 0] = booster_fold.predict(xgb.DMatrix(X_val_fold))
    
    # Neural Network for current fold
    nn_model_fold = train_nn_on_fold(X_tr_fold, y_tr_fold, input_size=X_train_np.shape[1], epochs=75)
    X_val_fold_tensor = torch.tensor(X_val_fold)
    oof_preds[val_idx, 1] = predict_mc_dropout(nn_model_fold, X_val_fold_tensor, n=10)

# Train a meta-learner (logistic regression) on the out-of-fold predictions
meta_model = LogisticRegression(max_iter=1000)
meta_model.fit(oof_preds, y_train_np.ravel())

#############################################
#    Train Base Models on Full Training   #
#############################################

# XGBoost on full training data
booster_full = xgb.train(params, xgb.DMatrix(X_train_np, label=y_train_np), num_boost_round=200)
xgb_test_probs = booster_full.predict(xgb.DMatrix(X_test_np))

# Neural Network on full training data
nn_model_full = train_nn_on_fold(X_train_np, y_train_np, input_size=X_train_np.shape[1], epochs=75)
nn_test_probs = predict_mc_dropout(nn_model_full, torch.tensor(X_test_np), n=10)

# Stack the test predictions as features for the meta-learner
stacked_test = np.column_stack((xgb_test_probs, nn_test_probs))
meta_test_probs = meta_model.predict_proba(stacked_test)[:, 1]

threshold = 0.48  # Adjust the threshold as needed
final_preds = ['Yes' if p > threshold else 'No' for p in meta_test_probs]

#############################################
#       Generate Final Submission CSV       #
#############################################

submission = pd.DataFrame({
    'Throw_IDs': test_encoded['Throw_IDs'],  # Ensure this DataFrame has the 'Throw_IDs'
    'Flip_Result': final_preds
})
submission.to_csv('submission_stacked_ensemble.csv', index=False)
print("✅ Stacked ensemble submission created!")
print(submission['Flip_Result'].value_counts())


Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10
✅ Stacked ensemble submission created!
Flip_Result
Yes    466
No     334
Name: count, dtype: int64
