In [1]:
#Libraries

import json
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Getting the training data:
train_data = []
with open('train.jsonl', 'r') as file:
    for line in file:
        train_data.append(json.loads(line))

print(f"Successfully loaded {len(train_data)} battles")

# Get the test data:
test_data = []
with open('test.jsonl', 'r') as f:
    for line in f:
        test_data.append(json.loads(line))
print(f"Successfully loaded {len(test_data)} battles")

# Looking into the data:
first_battle = train_data[0]    #first entire row

# Copy the first battle and truncate the timeline for better display of data
battle_for_display = first_battle.copy()
battle_for_display['battle_timeline'] = battle_for_display.get('battle_timeline', [])[:4] #first 4 turns

# json.dumps for cleaner printing
print(json.dumps(battle_for_display, indent=4))

Successfully loaded 10000 battles
Successfully loaded 5000 battles
{
    "player_won": true,
    "p1_team_details": [
        {
            "name": "starmie",
            "level": 100,
            "types": [
                "psychic",
                "water"
            ],
            "base_hp": 60,
            "base_atk": 75,
            "base_def": 85,
            "base_spa": 100,
            "base_spd": 100,
            "base_spe": 115
        },
        {
            "name": "exeggutor",
            "level": 100,
            "types": [
                "grass",
                "psychic"
            ],
            "base_hp": 95,
            "base_atk": 95,
            "base_def": 85,
            "base_spa": 125,
            "base_spd": 125,
            "base_spe": 55
        },
        {
            "name": "chansey",
            "level": 100,
            "types": [
                "normal",
                "notype"
            ],
            "base_hp": 250,
            "base_atk": 5

In [3]:
# Feature selection method
def feature_extraction(data:list[dict]) -> pd.DataFrame:
    
    """
    from the baseline (static features):
    a). Critical hits!

    from the timeline(dynamic features): 
    1. p1_team mean damage and p2_team mean damage 
    2. p1 and p2 team mean hp during the 30 rounds
    3. p1_team usage of induced change of status in the opponent's pokemon
    """

    winning_features = []
    for battle in data:
        features = {}   # dictionary type
        
        # a) --- Critical hits! ---
        # We calculate critical hits potential based on p1 pokemon speed stats
        p1_team = battle.get('p1_team_details', [])
        if p1_team:
            team_speeds = [s.get('base_spd', 0) for s in p1_team]
            # Gen 1 critical hits calculation: T = base_speed/2, critical hits occur if random(0,255) < T
            crit_rates = []
            for speed in team_speeds:
                T = speed/2
                crit_probability = min(T/256, 1.0)  # probability = T/256
                crit_rates.append(crit_probability)
            
            # critical hit features:
            features['team_max_crit_rate'] = max(crit_rates) if crit_rates else 0  # because a critical hit is usually a rare event!
            features['team_avg_crit_rate'] = sum(crit_rates)/len(crit_rates) if crit_rates else 0

            # Critical hit by a pokemon by percentage (0-100)%
            features['best_crit_chance_%'] = max(crit_rates)*100 if crit_rates else 0
            

        # --- DYNAMIC FEATURES --- 
        timeline = battle.get('battle_timeline', [])

        #
        q1=0
        q2=0
        p1_names = []
        p2_names = []
        
        for i in range(len(timeline)):
            x = timeline[i].get('p1_move_details')
            if (x):
                q1+=(x.get("base_power",0)*x.get("accuracy",0))
            else:
                q1+=0

            x = timeline[i].get('p2_move_details')
            if (x):
                q2+=(x.get("base_power",0)*x.get("accuracy",0))
            else:
                q2+=0

            x = timeline[i].get('p1_pokemon_state')
            if (x):
                p1name = x.get("name")
                
            x = timeline[i].get('p2_pokemon_state')
            if (x):
                p2name = x.get("name")
            p1_names.append(p1name)
            p2_names.append(p2name)
        
        p1_names = list(set(p1_names))
        p2_names = list(set(p2_names))
        
        p1hp = {}
        p2hp = {}
        for el in p1_names:
            p1hp[el] = 100
        for el in p2_names:
            p2hp[el] = 100
        
        # 1. --- p1_team mean damage and p2_team mean damage ---
        for i in range(len(timeline)):
            x = timeline[i].get('p1_pokemon_state')
            if (x):
                p1hp[x.get("name")] = x.get("hp_pct")*100
            x = timeline[i].get('p2_pokemon_state')
            if (x):
                p2hp[x.get("name")] = x.get("hp_pct")*100
            
        q1/=len(timeline)
        features['p1_mean_damage'] = q1

        q2/=len(timeline)
        features['p2_mean_damage'] = q2 

        # 2. --- p1 and p2 team mean hp during the 30 rounds --- 
        vl1 = 0
        for el in p1hp.keys():
            vl1 += p1hp[el] 

        vl2 = 0
        for el in p2hp.keys():
            vl2 += p2hp[el] 

        vl1 = vl1 + (6-len(p1hp.keys()))*100
        vl2 = vl2 + (6-len(p2hp.keys()))*100

        vl1/=6
        vl2/=6

        features['p1_r30_mean_hpt'] = vl1
        features['p2_r30_mean_hpt'] = vl2

        # initialize counter:
        change_of_status = 0
        # 3. --- p1_team usage of induced change of status in the opponent's pokemon ---
        for event in timeline:
            if isinstance(event, dict):
                if 'p2_pokemon_state' in event:
                    p2_state = event.get('p2_pokemon_state')
                    if isinstance(p2_state, dict):
                        current_status = p2_state.get('status', '')
                        if current_status != 'nostatus':
                            change_of_status += 1
        features['n_change_of_status_induced'] = change_of_status

        winning_features.append(features)
        # Battle id and target variable:
        features['battle_id'] = battle.get('battle_id')
        if 'player_won' in battle:
            features['player_won'] = int(battle['player_won'])
    
    return pd.DataFrame(winning_features).fillna(0)

In [4]:
train_df = feature_extraction(train_data)
test_df = feature_extraction(test_data)
display(train_df.head(10))

Unnamed: 0,team_max_crit_rate,team_avg_crit_rate,best_crit_chance_%,p1_mean_damage,p2_mean_damage,p1_r30_mean_hpt,p2_r30_mean_hpt,n_change_of_status_induced,battle_id,player_won
0,0.263672,0.195312,26.367188,51.833333,35.866667,64.5469,44.125036,17,0,1
1,0.244141,0.175781,24.414062,64.7,47.666667,26.333333,42.833333,5,1,1
2,0.253906,0.175781,25.390625,31.7,38.033333,69.666667,69.333333,14,2,1
3,0.253906,0.201823,25.390625,51.833333,71.241667,34.0,47.666667,5,3,1
4,0.263672,0.19043,26.367188,30.3,38.833333,62.666667,52.5,22,4,1
5,0.263672,0.195312,26.367188,45.166667,55.9,71.166667,54.666667,6,5,1
6,0.263672,0.193685,26.367188,59.766667,48.0,26.5,48.333333,13,6,1
7,0.263672,0.201823,26.367188,10.166667,40.7,56.833333,88.5,2,7,1
8,0.263672,0.193685,26.367188,44.0,64.133333,27.666667,59.166667,20,8,1
9,0.263672,0.203451,26.367188,44.066667,45.033333,43.326529,56.47691,7,9,1


In [52]:
#spliting train file into parts to preview accuracy
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]
X_features = train_df[features]
Y_target = train_df['player_won']

X_train, X_validation, y_train, y_validation = train_test_split(
    X_features, Y_target, 
    test_size = 0.25, 
    random_state = 42,
    stratify = Y_target
)

model = RandomForestClassifier(
    n_estimators = 140,
    max_depth= 9,
    min_samples_split = 10,
    min_samples_leaf = 5,
    random_state = 42,
    n_jobs =-1
)

print("Training model...")
model.fit(X_train, y_train)

validation_predictions = model.predict(X_validation)
validation_accuracy = accuracy_score(y_validation, validation_predictions)

print(f"Validation Accuracy: {validation_accuracy:.4f}")

Training model...
Validation Accuracy: 0.8224


In [53]:
print("Generating predictions on the test set...")
X_test = test_df[features]
test_predictions = model.predict(X_test)

q = len(test_predictions)

ids = [i for i in range(q)]

submission_df = pd.DataFrame({
    'battle_id': ids,
    'player_won': test_predictions
})

#our submission file 
submission_df.to_csv('submission.csv', index=False)

print("\n'submission.csv' file created successfully!")
display(submission_df.head())

Generating predictions on the test set...

'submission.csv' file created successfully!


Unnamed: 0,battle_id,player_won
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
