In [2]:
import pandas as pd
import numpy as np
from Feature_eng import *
import os

# Understanding the dataset

### Load the dataset

In [3]:
# Folder holding the dataset, given relative to the current working directory.
DATA_PATH = 'Data'

# Locations of the train / test files inside DATA_PATH.
train_file_path, test_file_path = (
    os.path.join(DATA_PATH, file_name) for file_name in ('train.jsonl', 'test.jsonl')
)

# Load both splits through the project helper get_dict_from_json.
train_data = get_dict_from_json(train_file_path)
test_data = get_dict_from_json(test_file_path)

## How many different Pokémon?

### Train_data 

All the distinct Pokémon observed in the P1 team and as the P2 lead (`p2_lead`).

In [4]:
# Distinct rows of the pokedex built from train_data, sorted by name, with a fresh index.
(
    pokedex(train_data)
    .drop_duplicates()
    .sort_values('name')
    .reset_index(drop=True)
)

Unnamed: 0,name,level,type1,type2,base_hp,base_atk,base_def,base_spa,base_spd,base_spe
0,alakazam,100,notype,psychic,55,50,45,135,135,120
1,articuno,100,flying,ice,90,85,100,125,125,85
2,chansey,100,normal,notype,250,5,5,105,105,50
3,charizard,100,fire,flying,78,84,78,85,85,100
4,cloyster,100,ice,water,50,95,180,85,85,70
5,dragonite,100,dragon,flying,91,134,95,100,100,80
6,exeggutor,100,grass,psychic,95,95,85,125,125,55
7,gengar,55,ghost,poison,60,65,60,130,130,110
8,gengar,100,ghost,poison,60,65,60,130,130,110
9,golem,100,ground,rock,80,110,130,55,55,45


Let's check whether the opponents use the same Pokémon.

In [5]:
# Names of the Pokémon listed by pokedex() for the training battles.
# set() already removes duplicates and ignores order, so the former
# drop_duplicates / sort_values / reset_index chain was redundant work.
A = set(pokedex(train_data)['name'])

# Names of the Pokémon listed by opponents_pokemon() for the same battles.
B = set(opponents_pokemon(train_data)['name'])

# True when every opponent Pokémon name also appears in A.
B.issubset(A)


True

So we can look up the stats of an opponent's Pokémon from its name alone.

### Test_data

Let's check whether the test set also has this property.

In [6]:
# Distinct rows of the pokedex built from test_data, sorted by name, with a fresh index.
(
    pokedex(test_data)
    .drop_duplicates()
    .sort_values('name')
    .reset_index(drop=True)
)

Unnamed: 0,name,level,type1,type2,base_hp,base_atk,base_def,base_spa,base_spd,base_spe
0,alakazam,100,notype,psychic,55,50,45,135,135,120
1,alakazam,55,notype,psychic,55,50,45,135,135,120
2,articuno,100,flying,ice,90,85,100,125,125,85
3,chansey,100,normal,notype,250,5,5,105,105,50
4,charizard,100,fire,flying,78,84,78,85,85,100
5,cloyster,100,ice,water,50,95,180,85,85,70
6,dragonite,100,dragon,flying,91,134,95,100,100,80
7,exeggutor,100,grass,psychic,95,95,85,125,125,55
8,gengar,100,ghost,poison,60,65,60,130,130,110
9,golem,100,ground,rock,80,110,130,55,55,45


In [7]:
# Names of the Pokémon listed by pokedex() for the test battles.
# set() already removes duplicates and ignores order, so the former
# drop_duplicates / sort_values / reset_index chain was redundant work.
A = set(pokedex(test_data)['name'])

# Names of the Pokémon listed by opponents_pokemon() for the same battles.
B = set(opponents_pokemon(test_data)['name'])

# True when every opponent Pokémon name also appears in A.
B.issubset(A)

True

We can definitely obtain the base stats and defensive types of an opponent's Pokémon from its name alone: it is a deterministic map. \
Moreover, the opponents always use Pokémon that appear in `p2_lead` or in `p1_team`.

In [8]:
# Distinct pokedex rows (P1 team and P2 lead) for each split, sorted by name.
Tr = (
    pokedex(train_data)
    .drop_duplicates()
    .sort_values('name')
    .reset_index(drop=True)
)

Te = (
    pokedex(test_data)
    .drop_duplicates()
    .sort_values('name')
    .reset_index(drop=True)
)

# Union of both dataframes, reduced to one sorted entry per Pokémon name.
All_pokemons = (
    pd.concat([Tr, Te])
    .drop_duplicates()
    .sort_values('name')['name']
    .drop_duplicates()
    .reset_index(drop=True)
)

# If the union equals the train-only name list, every test-set Pokémon also
# appears in the train set, i.e. there are no unseen Pokémon at test time.
All_pokemons.equals(Tr['name'].drop_duplicates().reset_index(drop=True))

True

## What about Pokémon levels?

In [9]:
# Number of pokedex rows per level in the train dataset, ordered by level.
(
    pokedex(train_data)['level']
    .value_counts()
    .sort_index()
)

level
55        2
100    9998
Name: count, dtype: int64

In [10]:
# Number of pokedex rows per level in the test dataset, ordered by level.
(
    pokedex(test_data)['level']
    .value_counts()
    .sort_index()
)

level
55        1
100    4999
Name: count, dtype: int64

We can simply drop the level as a feature: the levels are basically all 100.

# Feature engineering

Lets build up our features

## avg_effectiveness (types)

Each Pokémon move has a multiplier that depends on:
- type of the move (move_type) from pokemon P1
- types of the opponent pokemon P2 (defense_types) 

So each pair (move_type, defense_types) has a multiplier; this feature computes the average of this multiplier for each player and then takes the difference.

In [11]:
# Per-battle average-effectiveness difference (avg_diff), shown next to the outcome.
avg_effectiveness_1_1(train_data, difference=True)

Unnamed: 0,battle_id,avg_diff,player_won
0,0,0.400000,True
1,1,-0.050000,True
2,2,0.366667,True
3,3,-0.116667,True
4,4,-0.133333,True
...,...,...,...
9995,9995,-0.283333,False
9996,9996,0.200000,False
9997,9997,-0.050000,False
9998,9998,0.116667,False


How strongly is this difference linked to the battle outcome?

In [12]:
# How often P1 wins when avg_diff > 0 (true vs. false positives).
# The feature table is built once and filtered with a callable mask; the
# previous version called avg_effectiveness_1_1 twice on the same data
# (once for the frame, once for the mask), doubling the computation.
(
    avg_effectiveness_1_1(train_data, difference=True)
    .loc[lambda feats: feats["avg_diff"] > 0.0, "player_won"]
    .value_counts(normalize=True)
)

player_won
True     0.701985
False    0.298015
Name: proportion, dtype: float64

In [13]:
# How often P1 does not win when avg_diff < 0 (true vs. false negatives).
# The feature table is built once and filtered with a callable mask; the
# previous version called avg_effectiveness_1_1 twice on the same data
# (once for the frame, once for the mask), doubling the computation.
(
    avg_effectiveness_1_1(train_data, difference=True)
    .loc[lambda feats: feats["avg_diff"] < 0.0, "player_won"]
    .value_counts(normalize=True)
)

player_won
False    0.695954
True     0.304046
Name: proportion, dtype: float64

We obtain about 70% correct predictions using only this one feature.

### Bonus: do the last 10 turns matter more?

We have also implemented an extended version with turn segmentation, so that the model can learn which turns matter more (start, middle, end).

In [14]:
# FIXME(review): the recorded output of this cell is
# "NameError: name 'pokemon_def_types' is not defined". That name is not used in
# this cell, so it is presumably referenced inside avg_effectiveness2 (pulled in
# by `from Feature_eng import *`) without being defined there — confirm and fix
# in the module so the notebook survives Restart & Run All.
avg_effectiveness2(train_data,difference=True,divide_turns=True)

NameError: name 'pokemon_def_types' is not defined

## Category_Impact_Score

In [None]:
# Per-battle category impact difference (cat_impact_diff), shown next to the outcome.
category_impact_score(train_data, difference=True)

Unnamed: 0,battle_id,cat_impact_diff,player_won
0,0,0.957172,True
1,1,0.054818,True
2,2,0.135543,True
3,3,-6.463175,True
4,4,0.589761,True
...,...,...,...
9995,9995,1.130936,False
9996,9996,-2.527752,False
9997,9997,1.884878,False
9998,9998,-1.468128,False


In [None]:
# How often P1 wins when cat_impact_diff > 0 (true vs. false positives).
# The feature table is built once and filtered with a callable mask; the
# previous version called category_impact_score twice on the same data
# (once for the frame, once for the mask), doubling the computation.
(
    category_impact_score(train_data, difference=True)
    .loc[lambda feats: feats["cat_impact_diff"] > 0, "player_won"]
    .value_counts(normalize=True)
)


player_won
True     0.575228
False    0.424772
Name: proportion, dtype: float64

On its own it is not a good predictor because it doesn't take the base_power into account, but that is not a problem for our model because we consider the base_power in other features.

## avg_STAB

Same-Type Attack Bonus (STAB) means that if a Pokémon uses an attacking move of the same type as the Pokémon using it, that move gets 1.5× its usual base power. This is a huge bonus, so most Pokémon carry at least one STAB move.

In [16]:
# FIXME(review): the recorded output of this cell is
# "NameError: name 'avg_STAB_multiplier' is not defined". Presumably the function
# is missing from Feature_eng or is exposed there under a different name —
# confirm and fix so the notebook survives Restart & Run All.
avg_STAB_multiplier(train_data,difference=True)

NameError: name 'avg_STAB_multiplier' is not defined