In [1]:
import numpy as np
import pandas as pd
import csv
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [37]:
#Open the df of single file since it is very large (2,000,000 games)
tsv_filename = "/Users/jojod/Desktop/MAIS202/data/S18/raw_data/battlesStaging_12272020_WL_tagged.csv"
chunk_size = 500000

text_file_reader = pd.read_csv(tsv_filename, engine='python',encoding='utf-8-sig', quoting=csv.QUOTE_MINIMAL, chunksize = chunk_size)

dfList = []
counter = 0

for df in text_file_reader:
    dfList.append(df)
    counter= counter +1
    print("Max rows read: " + str(chunk_size * counter) )

df = pd.concat(dfList,sort=False)

Max rows read: 500000
Max rows read: 1000000
Max rows read: 1500000
Max rows read: 2000000


In [2]:
###Obtain win margin

#New columns to create:
#winner.hp
#loser.hp
#hp.difference
#crown.difference
#win.margin using sigmoid function and normalized value

# Define the function to handle NaN and sum the values in the list
def sum_princess_hp(princess_hp):
    # If it's NaN, return 0; otherwise, sum the values in the list
    if isinstance(princess_hp, list):
        return sum([float(i) for i in princess_hp])  # sum the values in the list
    return 0  # Return 0 for NaN or other invalid cases

#Replace NaN by 0 HP
df['winner.kingTowerHitPoints'] = df['winner.kingTowerHitPoints'].fillna(0)
df['loser.kingTowerHitPoints'] = df['loser.kingTowerHitPoints'].fillna(0)
df['winner.princessTowersHitPoints'] = df['winner.princessTowersHitPoints'].fillna(0)
df['loser.princessTowersHitPoints'] = df['loser.princessTowersHitPoints'].fillna(0)

df['winner.hp'] = df.apply(lambda row: float(row['winner.kingTowerHitPoints']) + sum_princess_hp(row['winner.princessTowersHitPoints']), axis=1)
df['loser.hp'] = df.apply(lambda row: float(row['loser.kingTowerHitPoints']) + sum_princess_hp(row['loser.princessTowersHitPoints']), axis=1)
df['hp.difference'] = df['winner.hp'] - df['loser.hp'] #Float, can be negative if loser finish with more HP
df['crown.difference'] = df['winner.crowns'] - df['loser.crowns'] #Float

hp_mean = df['hp.difference'].mean()
hp_sd = df['hp.difference'].std()

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def win_margin(df, crown_dif, hp_dif, w=0.7):
    # Normalize crown difference using min-max
    norm_crown_dif = (crown_dif - 1) / 2  # Can be 0, 0.5, or 1
    # Normalize hp difference using z-score
    norm_hp_dif = sigmoid((hp_dif - hp_mean) / hp_sd)  # Normal distribution around 0
    #Define sigmoid function
    return w*norm_crown_dif + (1-w)*norm_hp_dif  # This is an example; adjust logic as needed.

df['win.margin'] = df.apply(lambda row: win_margin(df, row['crown.difference'], row['hp.difference']), axis=1) #Float

# Example output check
print(df[['winner.hp', 'loser.hp', 'hp.difference', 'crown.difference', 'win.margin']].sample(n=30))

df.to_csv('/Users/jojod/Desktop/MAIS202/data/S18/win_margin_battlesStaging_12272020_WL_tagged.csv', index=False)

NameError: name 'df' is not defined

In [39]:
#Pre-processing by shuffling winner/loser order and adding winner_value column to be predicted
df_mod = df[[
 'winner.card1.id',
 'winner.card2.id',
 'winner.card3.id',
 'winner.card4.id',
 'winner.card5.id',
 'winner.card6.id',
 'winner.card7.id',
 'winner.card8.id',
 'winner.totalcard.level',
 'winner.elixir.average',
 'loser.card1.id',
 'loser.card2.id',
 'loser.card3.id',
 'loser.card4.id',
 'loser.card5.id',
 'loser.card6.id',
 'loser.card7.id',
 'loser.card8.id',
 'loser.totalcard.level',
 'loser.elixir.average']].copy()

#Rename winner and loser by player 1 and 2 
df_mod.rename(columns={col: col.replace('winner', 'player1') for col in df_mod.columns}, inplace=True)
df_mod.rename(columns={col: col.replace('loser', 'player2') for col in df_mod.columns}, inplace=True)

#Add predicted value
df_mod['winner.value'] = np.random.choice([0, 1], size=len(df_mod))

column_pairs = [
        ('player1.card1.id', 'player2.card1.id'),
        ('player1.card2.id', 'player2.card2.id'),
        ('player1.card3.id', 'player2.card3.id'),
        ('player1.card4.id', 'player2.card4.id'),
        ('player1.card5.id', 'player2.card5.id'),
        ('player1.card6.id', 'player2.card6.id'),
        ('player1.card7.id', 'player2.card7.id'),
        ('player1.card8.id', 'player2.card8.id'),
        ('player1.totalcard.level', 'player2.totalcard.level'),
        ('player1.elixir.average', 'player2.elixir.average')
        ]

# Create a mask for when 'winner.value' is 1 (indicating player 2 is the winner)
mask = df_mod['winner.value'] == 1

# Efficiently swap the columns for the rows where 'winner.value' is 1
for col1, col2 in column_pairs:
    df_mod.loc[mask, col1], df_mod.loc[mask, col2] = df_mod.loc[mask, col2], df_mod.loc[mask, col1]

print(df_mod.sample(n=10))
df_mod.to_csv('/Users/jojod/Desktop/MAIS202/data/S18/pre_prep_battlesStaging_12272020_WL_tagged.csv', index=False)

         player1.card1.id  player1.card2.id  player1.card3.id  \
1623486          26000055          26000051          26000017   
11754            26000007          26000006          26000020   
296545           28000008          26000021          26000041   
1414766          26000060          26000019          26000041   
1804906          26000083          28000002          26000064   
1410933          26000033          26000016          27000007   
1472231          28000004          28000011          28000003   
1250270          26000009          26000022          28000001   
1262250          28000008          26000021          26000011   
1392718          26000041          28000009          26000004   

         player1.card4.id  player1.card5.id  player1.card6.id  \
1623486          26000011          28000000          28000008   
11754            28000002          26000018          26000017   
296545           26000064          26000014          26000055   
1414766          2600004

In [5]:
#Open the pre-processed data
#Open the df of single file since it is very large (2,000,000 games)
tsv_filename = "/Users/jojod/Desktop/MAIS202/data/S18/pre_prep_battlesStaging_12272020_WL_tagged.csv"
chunk_size = 500000

text_file_reader = pd.read_csv(tsv_filename, engine='python',encoding='utf-8-sig', quoting=csv.QUOTE_MINIMAL, chunksize = chunk_size)

dfList = []
counter = 0

for df in text_file_reader:
    dfList.append(df)
    counter= counter +1
    print("Max rows read: " + str(chunk_size * counter) )

df_prep = pd.concat(dfList,sort=False)
print(df_prep.columns.tolist())
print(df_prep.sample(10))

Max rows read: 500000
Max rows read: 1000000
Max rows read: 1500000
Max rows read: 2000000
['player1.card1.id', 'player1.card2.id', 'player1.card3.id', 'player1.card4.id', 'player1.card5.id', 'player1.card6.id', 'player1.card7.id', 'player1.card8.id', 'player1.totalcard.level', 'player1.elixir.average', 'player2.card1.id', 'player2.card2.id', 'player2.card3.id', 'player2.card4.id', 'player2.card5.id', 'player2.card6.id', 'player2.card7.id', 'player2.card8.id', 'player2.totalcard.level', 'player2.elixir.average', 'winner.value']
         player1.card1.id  player1.card2.id  player1.card3.id  \
872099           26000017          26000055          26000000   
1689945          26000022          26000042          26000020   
1881175          26000043          26000000          26000017   
1332118          27000005          26000044          26000022   
98962            26000011          26000037          26000010   
907825           26000023          28000008          26000008   
1529191    

In [12]:
#Implement logistic regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA

# Define features (X) and target (y)
X = df_prep.drop(columns=['winner.value'])
y = df_prep['winner.value']

#Separate categorical values of card.id from numerical values of average.elixir and total.level
card_columns = [
    'player1.card1.id', 'player1.card2.id', 'player1.card3.id', 'player1.card4.id', 
    'player1.card5.id', 'player1.card6.id', 'player1.card7.id', 'player1.card8.id',
    'player2.card1.id', 'player2.card2.id', 'player2.card3.id', 'player2.card4.id', 
    'player2.card5.id', 'player2.card6.id', 'player2.card7.id', 'player2.card8.id'
    ]
numerical_columns = ['player1.totalcard.level', 'player1.elixir.average', 'player2.totalcard.level', 'player2.elixir.average']

# Preprocessing pipeline
# Step 1: OneHotEncode card columns
# Step 2: Normalize continuous columns
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(), card_columns),  # One-hot encode card columns
        ('continuous', StandardScaler(), numerical_columns)  # Normalize continuous columns
    ])

# Split the data into training and testing sets (e.g., 80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing and logistic regression in a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))



Accuracy: 0.577951034264507
Confusion Matrix:
[[109449  81825]
 [ 79545 111530]]
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.57      0.58    191274
           1       0.58      0.58      0.58    191075

    accuracy                           0.58    382349
   macro avg       0.58      0.58      0.58    382349
weighted avg       0.58      0.58      0.58    382349



In [15]:
##ChatGPT implementation of FNN

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# ----------------------
# 1. Dataset Preparation
# ----------------------
class CardDataset(Dataset):
    def __init__(self, df, card_columns, numerical_columns):
        self.card_columns = card_columns
        self.numerical_columns = numerical_columns
        
        # Convert card ID columns using ordinal encoding
        encoder = OrdinalEncoder(dtype=np.int64)
        self.card_data = encoder.fit_transform(df[card_columns])
        self.num_cards = int(self.card_data.max()) + 1  # Total unique cards
        
        # Normalize numerical columns
        scaler = StandardScaler()
        self.numerical_data = scaler.fit_transform(df[numerical_columns])
        
        # Target variable
        self.y = df['winner.value'].values.astype(np.float32)
        
        # Convert to tensors
        self.card_data = torch.tensor(self.card_data, dtype=torch.long)
        self.numerical_data = torch.tensor(self.numerical_data, dtype=torch.float32)
        self.y = torch.tensor(self.y, dtype=torch.float32).unsqueeze(1)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.card_data[idx], self.numerical_data[idx], self.y[idx]

# ----------------------
# 2. FNN Model Definition
# ----------------------
class FNNClassifier(nn.Module):
    def __init__(self, num_cards, embed_dim, num_numerical, hidden_dim):
        super(FNNClassifier, self).__init__()
        
        # Embedding layer for card IDs
        self.embedding = nn.Embedding(num_cards, embed_dim)
        
        # Fully connected layers
        self.fc1 = nn.Linear(embed_dim * 16 + num_numerical, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)  # Binary classification output
        
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, card_ids, numerical_data):
        embedded = self.embedding(card_ids)  # Shape: (batch, 16, embed_dim)
        embedded = embedded.view(embedded.size(0), -1)  # Flatten embeddings
        
        x = torch.cat([embedded, numerical_data], dim=1)  # Concatenate with numerical features
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return torch.sigmoid(x)

# ----------------------
# 3. Training & Evaluation
# ----------------------

def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        correct, total = 0, 0
        
        for card_data, num_data, labels in train_loader:
            card_data, num_data, labels = card_data.to(device), num_data.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(card_data, num_data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        
        train_acc = correct / total
        print(f'Epoch {epoch+1}: Loss = {train_loss/len(train_loader):.4f}, Accuracy = {train_acc:.4f}')
    
    return model

# ----------------------
# 4. Data Loading & Execution
# ----------------------

def main():
    df = df_prep  # Load your dataset
    
    card_columns = [
        'player1.card1.id', 'player1.card2.id', 'player1.card3.id', 'player1.card4.id',
        'player1.card5.id', 'player1.card6.id', 'player1.card7.id', 'player1.card8.id',
        'player2.card1.id', 'player2.card2.id', 'player2.card3.id', 'player2.card4.id',
        'player2.card5.id', 'player2.card6.id', 'player2.card7.id', 'player2.card8.id'
    ]
    numerical_columns = ['player1.totalcard.level', 'player1.elixir.average', 'player2.totalcard.level', 'player2.elixir.average']
    
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    
    train_dataset = CardDataset(train_df, card_columns, numerical_columns)
    val_dataset = CardDataset(val_df, card_columns, numerical_columns)
    
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    
    model = FNNClassifier(num_cards=train_dataset.num_cards, embed_dim=10, num_numerical=len(numerical_columns), hidden_dim=128)
    trained_model = train_model(model, train_loader, val_loader, epochs=10, lr=0.001)
    
if __name__ == "__main__":
    main()

Epoch 1: Loss = 0.6674, Accuracy = 0.5749
Epoch 2: Loss = 0.6589, Accuracy = 0.5917
Epoch 3: Loss = 0.6558, Accuracy = 0.5977
Epoch 4: Loss = 0.6536, Accuracy = 0.6022
Epoch 5: Loss = 0.6519, Accuracy = 0.6054
Epoch 6: Loss = 0.6508, Accuracy = 0.6068
Epoch 7: Loss = 0.6500, Accuracy = 0.6083
Epoch 8: Loss = 0.6494, Accuracy = 0.6090
Epoch 9: Loss = 0.6487, Accuracy = 0.6103
Epoch 10: Loss = 0.6484, Accuracy = 0.6104
