In [86]:
import json
import pandas as pd
import os
from tqdm.notebook import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [87]:
def load_jsonl(path: str) -> list[dict]:
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the file; every non-blank line must hold one JSON
            document.

    Returns:
        The decoded records, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit encoding so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A stray or trailing blank line would make json.loads raise.
            if line.strip():
                records.append(json.loads(line))
    return records


# Both datasets start out empty so the names exist even when a file is missing.
train_data = []

print("loading train data...")

try:
    train_data = load_jsonl("train.jsonl")
    print(f"Successfully loaded {len(train_data)} battles.")

except FileNotFoundError:
    print("train file not found")

test_data = []

print("loading test data...")

try:
    test_data = load_jsonl("test.jsonl")
    print(f"Successfully loaded {len(test_data)} test-battles.")
except FileNotFoundError:
    print("test file not found")

loading train data...
Successfully loaded 10000 battles.
loading test data...
Successfully loaded 5000 test-battles.


In [63]:
def _move_power(move) -> float:
    """Return ``base_power * accuracy`` for one move record, or 0 when there is none."""
    if not move:
        return 0
    # `or 0` also covers keys that are present but hold a null value.
    return (move.get("base_power") or 0) * (move.get("accuracy") or 0)


def _record_hp(hp_by_name: dict, state) -> None:
    """Store the latest HP (``hp_pct * 100``) reported by one pokemon-state record."""
    if not state:
        return
    name = state.get("name")
    # A pokemon that showed up counts as full HP until a reading says otherwise.
    hp_by_name.setdefault(name, 100)
    hp_pct = state.get("hp_pct")
    if hp_pct is not None:
        hp_by_name[name] = hp_pct * 100


def _team_mean_hp(hp_by_name: dict, team_size: int) -> float:
    """Average HP over the whole team, counting never-seen members as 100."""
    # Never divide by fewer slots than the number of pokemon actually observed.
    slots = max(team_size, len(hp_by_name))
    unseen = slots - len(hp_by_name)
    return (sum(hp_by_name.values()) + unseen * 100) / slots


def create_simple_features(data: list[dict], team_size: int = 6) -> pd.DataFrame:
    """Turn raw battle records into one row of numeric features per battle.

    For every battle the row holds:

    * ``p1_mean_damage`` / ``p2_mean_damage``: sum of ``base_power * accuracy``
      over the moves in ``battle_timeline``, divided by the number of turns
      (0 when the timeline is missing or empty).
    * ``p1_r30_mean_hpt`` / ``p2_r30_mean_hpt``: mean of the last ``hp_pct * 100``
      seen for each pokemon in the timeline, with team members that never
      appeared counted as 100. Presumably ``hp_pct`` is a 0-1 fraction; verify
      against the data.
    * ``p1_mean_hp|spe|atk|def``: mean base stats over ``p1_team_details``.
    * ``p2_lead_hp|spe|atk|def``: base stats from ``p2_lead_details``.
    * ``player_won``: the label as 0/1, only for battles that carry it.

    Args:
        data: Battle records (dicts) as loaded from the JSON Lines files.
        team_size: Number of pokemon per team, used to account for members
            that never appear in the timeline. Must be positive.

    Returns:
        A DataFrame with one row per battle; values that are missing for a
        battle are filled with 0.
    """
    feature_list = []
    for battle in tqdm(data, desc="Extracting features"):
        row = {}

        # A missing timeline is handled like an empty one instead of crashing.
        timeline = battle.get('battle_timeline') or []

        p1_power = 0
        p2_power = 0
        # Latest HP per pokemon name; kept per battle so nothing leaks between battles.
        p1_hp = {}
        p2_hp = {}

        for turn in timeline:
            p1_power += _move_power(turn.get('p1_move_details'))
            p2_power += _move_power(turn.get('p2_move_details'))
            # Pokemon that are never used stay at 100; note that for the game
            # 0/0/100 might be better than 10/10/80, which a mean cannot tell apart.
            _record_hp(p1_hp, turn.get('p1_pokemon_state'))
            _record_hp(p2_hp, turn.get('p2_pokemon_state'))

        n_turns = len(timeline)
        row['p1_mean_damage'] = p1_power / n_turns if n_turns else 0.0
        row['p2_mean_damage'] = p2_power / n_turns if n_turns else 0.0

        row['p1_r30_mean_hpt'] = _team_mean_hp(p1_hp, team_size)
        row['p2_r30_mean_hpt'] = _team_mean_hp(p2_hp, team_size)

        p1_team = battle.get('p1_team_details', [])
        if p1_team:
            row['p1_mean_hp'] = np.mean([p.get('base_hp', 0) for p in p1_team])
            row['p1_mean_spe'] = np.mean([p.get('base_spe', 0) for p in p1_team])
            row['p1_mean_atk'] = np.mean([p.get('base_atk', 0) for p in p1_team])
            row['p1_mean_def'] = np.mean([p.get('base_def', 0) for p in p1_team])

        p2_lead = battle.get('p2_lead_details')
        if p2_lead:
            row['p2_lead_hp'] = p2_lead.get('base_hp', 0)
            row['p2_lead_spe'] = p2_lead.get('base_spe', 0)
            row['p2_lead_atk'] = p2_lead.get('base_atk', 0)
            row['p2_lead_def'] = p2_lead.get('base_def', 0)

        # Only labelled battles carry the target.
        if 'player_won' in battle:
            row['player_won'] = int(battle['player_won'])

        feature_list.append(row)

    return pd.DataFrame(feature_list).fillna(0)

# Create feature DataFrames for both training and test sets
print("Processing training data...")
train_df = create_simple_features(train_data)

print("\nProcessing test data...")
test_df = create_simple_features(test_data)

# Fail here with a clear message rather than with a KeyError in a later cell:
# the model needs the label in the training frame and every training feature
# column in the test frame.
assert 'player_won' in train_df.columns, "training data produced no 'player_won' column"
missing_in_test = [col for col in train_df.columns
                   if col != 'player_won' and col not in test_df.columns]
assert not missing_in_test, f"test features are missing columns: {missing_in_test}"

Processing training data...


Extracting features:   0%|          | 0/10000 [00:00<?, ?it/s]


Processing test data...


Extracting features:   0%|          | 0/5000 [00:00<?, ?it/s]

In [64]:
# Show the first ten engineered rows as a sanity check on the feature values.
preview_rows = train_df.head(10)
print("\nTraining features preview:")
display(preview_rows)


Training features preview:


Unnamed: 0,p1_mean_damage,p2_mean_damage,p1_r30_mean_hpt,p2_r30_mean_hpt,p1_mean_hp,p1_mean_spe,p1_mean_atk,p1_mean_def,p2_lead_hp,p2_lead_spe,p2_lead_atk,p2_lead_def,player_won
0,51.833333,35.866667,64.5469,44.125036,115.833333,80.0,72.5,63.333333,60,115,75,85,1
1,64.7,47.666667,26.333333,42.833333,123.333333,61.666667,72.5,65.833333,55,120,50,45,1
2,31.7,38.033333,69.666667,69.333333,124.166667,65.833333,84.166667,71.666667,250,50,5,5,1
3,51.833333,71.241667,34.0,47.666667,121.666667,75.833333,77.5,65.833333,75,110,100,95,1
4,30.3,38.833333,62.666667,52.5,114.166667,72.5,75.833333,79.166667,60,115,75,85,1
5,45.166667,55.9,71.166667,54.666667,103.333333,85.0,70.833333,70.0,55,120,50,45,1
6,59.766667,48.0,26.5,48.333333,74.166667,80.833333,89.166667,105.833333,60,115,75,85,1
7,10.166667,40.7,56.833333,88.5,89.166667,88.333333,86.666667,76.666667,65,130,65,60,1
8,44.0,64.133333,27.666667,59.166667,74.166667,80.833333,89.166667,105.833333,65,95,50,35,1
9,44.066667,45.033333,43.326529,56.47691,120.833333,77.5,75.0,63.333333,55,120,50,45,1


Training a simple Logistic Regression model...
Model training complete.


In [91]:
# Splitting the training frame into train/hold-out parts to preview accuracy

# Build the feature list in this cell: it is otherwise only defined in a later
# cell, so a fresh "Restart & Run All" would stop here with a NameError.
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]

X_train, X_test, y_train, y_test = train_test_split(
  train_df[features], train_df['player_won'] , random_state=104,test_size=0.25, shuffle=True)

print("Training a simple Logistic Regression model...")
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Accuracy and confusion matrix on the 25% hold-out split
print("ACC", model.score(X_test,y_test))
cm = confusion_matrix(y_test, model.predict(X_test))
cm

Training a simple Logistic Regression model...
ACC 0.814


array([[1033,  226],
       [ 239, 1002]])

In [92]:
# Refit on every labelled battle (no hold-out) before predicting the real test set.
# Every column except the identifier and the label is used as a model input.
features = [name for name in train_df.columns
            if name != 'battle_id' and name != 'player_won']

y_train = train_df['player_won']
X_train = train_df[features]
X_test = test_df[features]

# Build the classifier and fit it in one step (fit returns the estimator itself)
print("Training a simple Logistic Regression model...")
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
print("Model training complete.")

Training a simple Logistic Regression model...
Model training complete.


In [93]:
print("Generating predictions on the test set...")
test_predictions = model.predict(X_test)

q = len(test_predictions)

# Label each prediction with the battle's own 'battle_id' when the record
# carries one; otherwise fall back to its row position, which is what every
# id used to be. NOTE(review): assumes test records expose 'battle_id' under
# that key - confirm against test.jsonl.
ids = [battle.get('battle_id', position) for position, battle in enumerate(test_data)]
assert len(ids) == q, "number of test battles and predictions differ"

submission_df = pd.DataFrame({
    'battle_id': ids,
    'player_won': test_predictions
})

# Our submission file
submission_df.to_csv('submission.csv', index=False)

print("\n'submission.csv' file created successfully!")
display(submission_df.head())

Generating predictions on the test set...

'submission.csv' file created successfully!


Unnamed: 0,battle_id,player_won
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
