# Create and Test Models

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("../data/scaled_final_data.csv")

X = df.drop(columns=["is_hit"])
y = df["is_hit"]

X = X.fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_log))


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.78      0.99      0.88       846
           1       0.17      0.00      0.01       235

    accuracy                           0.78      1081
   macro avg       0.47      0.50      0.44      1081
weighted avg       0.65      0.78      0.69      1081



In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Report:
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       846
           1       0.30      0.25      0.27       235

    accuracy                           0.71      1081
   macro avg       0.55      0.54      0.54      1081
weighted avg       0.69      0.71      0.70      1081



In [17]:
from sklearn.metrics import accuracy_score

log_accuracy = accuracy_score(y_test, y_pred_log)

print("Logistic Regression Accuracy:", log_accuracy)

rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)

Logistic Regression Accuracy: 0.7789084181313598
Random Forest Accuracy: 0.7095282146160962


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Convert data to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Create dataset and dataloader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the model
class HitPredictor(nn.Module):
    def __init__(self, input_dim):
        super(HitPredictor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

model = HitPredictor(input_dim=X_train.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    for xb, yb in train_loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluate
with torch.no_grad():
    preds = model(X_test_tensor).round()
    acc = (preds.eq(y_test_tensor)).float().mean()
    print(f"Neural Net Accuracy: {acc.item():.4f}")


Epoch 1/10, Loss: 0.3767
Epoch 2/10, Loss: 0.2923
Epoch 3/10, Loss: 0.4636
Epoch 4/10, Loss: 0.2993
Epoch 5/10, Loss: 0.3523
Epoch 6/10, Loss: 0.3946
Epoch 7/10, Loss: 0.5327
Epoch 8/10, Loss: 0.3252
Epoch 9/10, Loss: 0.3681
Epoch 10/10, Loss: 0.3731
Neural Net Accuracy: 0.7669
