In [103]:
import pandas as pd
import numpy as np
import requests
import json

### 1. We start off by getting the matches (DO NOT RUN THIS UNLESS YOU ARE SURE)

In [131]:
### get_matches returns a dataframe from the opendota API with given parameters
### don't run this too often, as there is a request limit of one per second and 50000 pr. month
def get_matches(matches=10000, start_time_less_than=1594771200):
    """Fetch match/player rows from the OpenDota explorer API as a DataFrame.

    The SQL sent to the API (percent-encoded into the request URL) is:

        SELECT *
        FROM public_matches
        LEFT JOIN public_player_matches
        ON public_matches.match_id = public_player_matches.match_id
        WHERE lobby_type=7 AND game_mode=22 AND avg_mmr!=0
              AND start_time<{start_time_less_than}
        ORDER BY start_time DESC
        limit {matches * 10}

    Parameters
    ----------
    matches : int
        Number of matches wanted. The SQL limit is ``matches * 10`` because the
        join returns one row per player of a match.
    start_time_less_than : int
        Only matches whose ``start_time`` is below this value are returned.

    Returns
    -------
    pandas.DataFrame
        Built from the ``"rows"`` entry of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # Coerce to int so only plain numbers are ever interpolated into the SQL text.
    row_limit = int(matches) * 10
    start_time_less_than = int(start_time_less_than)

    # Base url of the opendota explorer endpoint.
    base = 'https://api.opendota.com/api/explorer'
    # Percent-encoded query, split per SQL clause for readability.
    matches_request = (
        '?sql=SELECT%20*'
        '%0AFROM%20public_matches'
        '%0ALEFT%20JOIN%20public_player_matches'
        '%0AON%20public_matches.match_id%20%3D%20public_player_matches.match_id'
        '%0AWHERE%20lobby_type%3D7%20AND%20game_mode%3D22%20AND%20avg_mmr!%3D0'
        f'%20AND%20start_time%3C{start_time_less_than}'
        '%0AORDER%20BY%20start_time%20DESC'
        f'%0Alimit%20{row_limit}'
    )

    url = base + matches_request

    result = requests.get(url)
    print(result)
    # Fail loudly on an HTTP error (e.g. when the request limit is hit) instead of
    # surfacing later as a confusing JSON decode error or KeyError.
    result.raise_for_status()
    j_matches = result.json()

    return pd.DataFrame(j_matches["rows"])

In [132]:
# Live call to the OpenDota API: matches=100000 asks for up to 1,000,000 rows,
# because get_matches multiplies the match count by 10 for the SQL limit.
df_matches_response = get_matches(matches=100000, start_time_less_than=1594771200)

<Response [200]>


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### 2. Let's start inspecting our dataframe

**Keep in mind that every match appears in 10 rows, since there is one row for each picked hero in each match**

In [134]:
# Work on a copy so the raw API response stays intact if df_matches is modified
# later on; re-fetching it would cost another rate-limited API request.
df_matches = df_matches_response.copy()
df_matches

Unnamed: 0,match_id,match_seq_num,radiant_win,start_time,duration,avg_mmr,num_mmr,lobby_type,game_mode,avg_rank_tier,num_rank_tier,cluster,player_slot,hero_id
0,5513437212,4623088705,True,1594771194,2467,3265,4,7,22,61,6,154,132,84
1,5513437212,4623088705,True,1594771194,2467,3265,4,7,22,61,6,154,131,8
2,5513437212,4623088705,True,1594771194,2467,3265,4,7,22,61,6,154,130,39
3,5513437212,4623088705,True,1594771194,2467,3265,4,7,22,61,6,154,129,29
4,5513437212,4623088705,True,1594771194,2467,3265,4,7,22,61,6,154,128,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,5510478900,4620408517,True,1594586685,2029,2501,1,7,22,43,5,151,4,108
999996,5510478900,4620408517,True,1594586685,2029,2501,1,7,22,43,5,151,3,27
999997,5510478900,4620408517,True,1594586685,2029,2501,1,7,22,43,5,151,2,121
999998,5510478900,4620408517,True,1594586685,2029,2501,1,7,22,43,5,151,1,93


**As seen below, all the datatypes seem to come in the right format**

In [135]:
# Show the dtype pandas inferred for each column of the API response.
df_matches.dtypes

match_id         int64
match_seq_num    int64
radiant_win       bool
start_time       int64
duration         int64
avg_mmr          int64
num_mmr          int64
lobby_type       int64
game_mode        int64
avg_rank_tier    int64
num_rank_tier    int64
cluster          int64
player_slot      int64
hero_id          int64
dtype: object

**There are zero null values, as this was handled in the SQL query to the API**

In [136]:
# Count the missing values in each column.
df_matches.isna().sum()

match_id         0
match_seq_num    0
radiant_win      0
start_time       0
duration         0
avg_mmr          0
num_mmr          0
lobby_type       0
game_mode        0
avg_rank_tier    0
num_rank_tier    0
cluster          0
player_slot      0
hero_id          0
dtype: int64

## 3. We now want to clean our dataframe to a usable format

**We want to go from this table:**

| match_id | ... | player_slot | hero_id |
|------|------|------|------|
|5513437212|...|Dire|84|
|5513437212|...|Dire|8|
|5513437212|...|Dire|39|
|5513437212|...|Dire|29|
|5513437212|...|Dire|80|
|5513437212|...|Radiant|46|
|5513437212|...|Radiant|75|
|5513437212|...|Radiant|96|
|5513437212|...|Radiant|4|
|5513437212|...|Radiant|25|

**To this table:**

| match_id | ... | radiant_1 | radiant_2 | radiant_3 | radiant_4 | radiant_5 | dire_1 | dire_2 | dire_3 | dire_4 | dire_5 |
|------|------|------|------|------|------|------|------|------|------|------|------|
|5513437212|...|46|75|96|4|25|84|8|39|29|80|

**We write a function to do this:**

In [137]:
def table_cleaner(dataframe):
    """Reshape per-player match rows into one row per match.

    The input holds one row per (match, player) with that player's
    ``player_slot`` and ``hero_id``. The output holds one row per match, with
    the hero id of every slot spread over ``radiant_1`` .. ``radiant_5``
    (player slots 0-4) and ``dire_1`` .. ``dire_5`` (player slots 128-132).
    A slot without a row for a match is left as 0.

    The input frame is not modified, so the function can safely be re-run on
    the same raw data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``match_id``, ``match_seq_num``, ``radiant_win``,
        ``start_time``, ``duration``, ``avg_mmr``, ``player_slot`` and
        ``hero_id``. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per match, sorted by the match key columns, with integer
        hero-id columns and a boolean ``radiant_win``.
    """
    # 0-4: Radiant team, 128-132: Dire team
    slot_dict = {0: "radiant_1",
                 1: "radiant_2",
                 2: "radiant_3",
                 3: "radiant_4",
                 4: "radiant_5",
                 128: "dire_1",
                 129: "dire_2",
                 130: "dire_3",
                 131: "dire_4",
                 132: "dire_5"}
    match_keys = ["match_id", "match_seq_num", "radiant_win",
                  "start_time", "duration", "avg_mmr"]

    # Selecting the needed columns yields a new frame, so the caller's data is
    # never touched. Missing values become 0 and everything is cast to int
    # (radiant_win included; it is turned back into a bool at the end).
    rows = dataframe[match_keys + ["player_slot", "hero_id"]].fillna(0).astype(int)

    # One column per slot: the hero id on the row of that slot, 0 on every other row.
    for slot, column in slot_dict.items():
        rows[column] = rows["hero_id"].where(rows["player_slot"] == slot, 0)

    # Each slot column has a single non-zero row per match, so summing within a
    # match collapses the ten player rows into one row of hero ids.
    grouped = rows.groupby(match_keys)[list(slot_dict.values())].sum().reset_index()

    # We want "radiant_win" to stay a boolean and not an int
    grouped["radiant_win"] = grouped["radiant_win"].astype(bool)

    return grouped

In [147]:
# NOTE(review): df_matches_grouped is only assigned further down, in the cell that
# calls table_cleaner, so this cell raises NameError on a fresh top-to-bottom run.
df_matches_grouped

Unnamed: 0,match_id,match_seq_num,radiant_win,start_time,duration,avg_mmr,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,dire_1,dire_2,dire_3,dire_4,dire_5
0,5510478612,4620417806,True,1594586690,2721,3174,7,70,8,30,105,119,73,90,31,18
1,5510478615,4620399961,False,1594586691,1155,3623,86,49,84,75,57,40,120,30,1,60
2,5510478700,4620425526,False,1594586691,3281,3243,108,71,8,26,47,59,74,30,21,14
3,5510478702,4620421211,True,1594586695,2988,2405,34,8,10,86,53,48,18,25,85,75
4,5510478708,4620414900,True,1594586700,2277,4464,67,22,37,107,14,74,7,84,45,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5513436913,4623081152,False,1594771169,2254,3031,69,63,84,7,44,2,45,22,27,73
99996,5513436918,4623070259,True,1594771180,1688,5019,86,53,1,19,84,104,39,113,26,54
99997,5513437003,4623069709,False,1594771184,1790,864,15,108,21,30,53,126,104,110,4,84
99998,5513437208,4623072527,True,1594771189,1928,4327,27,93,6,84,81,10,67,120,101,104


## 4. Creating winrate pivot tables for synergy and counter


**We want to create pivot tables that can compute the winrate of synergy and counter in each hero pair**
**The synergy pivot table would look like this:**

| hero_id | 1 | 2 | 3 | 4 | ... | 128 | 129 |
|------|------|------|------|------|------|------|------|
| **1** |NaN|0.53|0.42|0.52|...|0.63|0.57|
| **2** |0.53|NaN|0.51|0.53|...|0.53|0.53|
| **3** |0.42|0.51|NaN|0.55|...|0.53|0.53|
| **4** |0.52|0.53|0.55|NaN|...|0.53|0.51|
| **5** |0.56|0.53|0.53|0.53|...|0.53|0.58|
|...|...|...|...|...|...|...|...|
| **128** |0.63|0.53|0.53|0.53|...|NaN|0.51|
| **129** |0.57|0.53|0.51|0.53|...|0.51|NaN|

In this case, the pivot table is mirrored as it is equally good/bad when the heroes are on the same team. 

**The counter pivot table would look like this:**

| hero_id | 1 | 2 | 3 | 4 | ... | 128 | 129 |
|------|------|------|------|------|------|------|------|
| **1** |NaN|0.47|0.58|0.48|...|0.37|0.43|
| **2** |0.53|NaN|0.49|0.47|...|0.47|0.47|
| **3** |0.42|0.51|NaN|0.45|...|0.47|0.49|
| **4** |0.52|0.53|0.55|NaN|...|0.47|0.47|
| **5** |0.56|0.53|0.53|0.53|...|0.53|0.58|
|...|...|...|...|...|...|...|...|
| **128** |0.63|0.53|0.53|0.53|...|NaN|0.49|
| **129** |0.57|0.53|0.51|0.53|...|0.51|NaN|

When it comes to the counter pivot table, the winrate of a pair is always 1 minus the winrate of the reversed pair. 
For instance, hero 128 has a winrate of 0.63 vs hero 1. This means that hero 1 has a winrate of 0.37 vs hero 128.

**Now we make a function that splits the dataset in radiant wins and dire wins**

In [138]:
def winning_side_splitter(dataframe):
    """Split the matches into (radiant wins, dire wins).

    Both returned frames get a fresh 0..n-1 index; the previous index is kept
    as an ``index`` column.
    """
    radiant_won = dataframe["radiant_win"] == 1
    radiant_lost = dataframe["radiant_win"] == 0
    return (dataframe.loc[radiant_won].reset_index(),
            dataframe.loc[radiant_lost].reset_index())

We want to create two pivot tables:

* Synergy wins (contains the winrate-synergy between two heroes)
* Counter wins (contains the winrate-counter between two heroes)

To get that, we need the following pivot tables:

* radiant_synergy_win
* radiant_synergy_loss
* radiant_opponent_win
* radiant_opponent_loss
* dire_synergy_win
* dire_synergy_loss
* dire_opponent_win
* dire_opponent_loss

### We start off by creating the functions for the synergy pivot table
**Now we create the function that creates the 20 combined synergy pivot tables, to later add together**

In [139]:
def pair_pivots_synergy(matches_dataframe, permutations):
    """Build one co-occurrence pivot per ordered pair of distinct slot columns.

    For every ordered pair (first, second) of different column names in
    ``permutations``, count how many rows of ``matches_dataframe`` hold each
    (hero in first, hero in second) combination, and lay the counts out with
    the first column's heroes as index (``hero_1``) and the second column's
    heroes as columns (``hero_2``). Combinations that never occur are 0.

    Returns the pivots as a list, in the order the pairs are visited.
    """
    pivots = []
    for first_slot in permutations:
        for second_slot in permutations:
            # A slot is never paired with itself.
            if first_slot == second_slot:
                continue
            # Same numbering as a running counter: count1, count2, ...
            count_name = f"count{len(pivots) + 1}"
            pair_counts = (
                matches_dataframe
                .groupby([first_slot, second_slot])
                .agg(count=(second_slot, "count"))
                .reset_index()
                .rename(columns={first_slot: "hero_1",
                                 second_slot: "hero_2",
                                 "count": count_name})
            )
            pivots.append(pd.pivot_table(pair_counts,
                                         index="hero_1",
                                         columns="hero_2",
                                         values=count_name,
                                         dropna=False,
                                         fill_value=0))
    return pivots

**Now we combine the pivot tables into one**

In [140]:
def pivot_combiner(list_of_pivots):
    """Add a list of count pivots together cell by cell.

    The pivots are aligned on their index and columns. A hero that is missing
    from one pivot's index or columns is treated as a count of 0 there, so it
    no longer turns the whole row or column of the total into NaN. A cell that
    is absent from every pivot stays NaN.

    Parameters
    ----------
    list_of_pivots : list of pandas.DataFrame
        Must contain at least one frame.

    Returns
    -------
    pandas.DataFrame
        The cell-wise sum of all pivots.

    Raises
    ------
    IndexError
        If ``list_of_pivots`` is empty.
    """
    combined = list_of_pivots[0]
    for pivot in list_of_pivots[1:]:
        # fill_value=0: a cell present in only one of the two frames keeps its count.
        combined = combined.add(pivot, fill_value=0)
    return combined

**Now we create a wrapper to calculate the synergy winrate for each hero**

In [141]:
def synergy_wrapper(dataframe):
    """Compute the win rate of every pair of heroes playing on the same team.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per match with ``radiant_win`` and the hero-id columns
        ``radiant_1`` .. ``radiant_5`` and ``dire_1`` .. ``dire_5``.

    Returns
    -------
    pandas.DataFrame
        Hero-by-hero table of wins / (wins + losses), rounded to 2 decimals.
        Pairs that never played together are NaN.
    """
    permutations1 = ["radiant_1", "radiant_2", "radiant_3", "radiant_4", "radiant_5"]
    permutations2 = ["dire_1", "dire_2", "dire_3", "dire_4", "dire_5"]
    df_matches_radiant, df_matches_dire = winning_side_splitter(dataframe)

    # A team's pairs count as wins in the matches that team won, losses otherwise.
    dire_synergy_wins = pivot_combiner(pair_pivots_synergy(df_matches_dire, permutations2))
    dire_synergy_losses = pivot_combiner(pair_pivots_synergy(df_matches_radiant, permutations2))
    radiant_synergy_wins = pivot_combiner(pair_pivots_synergy(df_matches_radiant, permutations1))
    radiant_synergy_losses = pivot_combiner(pair_pivots_synergy(df_matches_dire, permutations1))

    # fill_value=0: a pair seen in only one of the two tables counts as 0 in the
    # other, instead of making the sum (and the win rate) NaN.
    synergy_wins = dire_synergy_wins.add(radiant_synergy_wins, fill_value=0)
    synergy_losses = dire_synergy_losses.add(radiant_synergy_losses, fill_value=0)
    synergy_games = synergy_wins.add(synergy_losses, fill_value=0)

    # A pair with games but no recorded wins gets 0.0; a pair with no games is 0/0 -> NaN.
    synergy_winrate = synergy_wins.div(synergy_games, fill_value=0)
    return synergy_winrate.round(2)

### We will now create the functions to calculate the counter-winrate of the heroes on each team

**We now have the function to compute the synergy of two heroes on the same team.**

We want to create the same pivot tables as seen in the previous section

In [142]:
def pair_pivots_counter(matches_dataframe, permutations1, permutations2):
    """Build one match-up pivot per (column of permutations1, column of permutations2).

    For every combination of a column name from ``permutations1`` and one from
    ``permutations2``, count how many rows of ``matches_dataframe`` hold each
    (hero in first, hero in second) combination, and lay the counts out with
    the first column's heroes as index (``hero_1``) and the second column's
    heroes as columns (``hero_2``). Combinations that never occur are 0.

    Returns the pivots as a list, in the order the combinations are visited.
    """
    pivots = []
    for own_slot in permutations1:
        for opposing_slot in permutations2:
            # Same numbering as a running counter: count1, count2, ...
            count_name = f"count{len(pivots) + 1}"
            pair_counts = (
                matches_dataframe
                .groupby([own_slot, opposing_slot])
                .agg(count=(opposing_slot, "count"))
                .reset_index()
                .rename(columns={own_slot: "hero_1",
                                 opposing_slot: "hero_2",
                                 "count": count_name})
            )
            pivots.append(pd.pivot_table(pair_counts,
                                         index="hero_1",
                                         columns="hero_2",
                                         values=count_name,
                                         dropna=False,
                                         fill_value=0))
    return pivots

**Now we wrap this in a `counter_wrapper` to get our counter-pivot**

In [143]:
def counter_wrapper(dataframe):
    """Compute the win rate of every hero against every opposing hero.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per match with ``radiant_win`` and the hero-id columns
        ``radiant_1`` .. ``radiant_5`` and ``dire_1`` .. ``dire_5``.

    Returns
    -------
    pandas.DataFrame
        Table indexed by hero with one column per opposing hero, holding
        wins / (wins + losses) of the index hero against the column hero,
        rounded to 2 decimals. Match-ups that never occurred are NaN.
    """
    permutations1 = ["radiant_1", "radiant_2", "radiant_3", "radiant_4", "radiant_5"]
    permutations2 = ["dire_1", "dire_2", "dire_3", "dire_4", "dire_5"]
    df_matches_radiant, df_matches_dire = winning_side_splitter(dataframe)

    # The first list passed to pair_pivots_counter is the side whose heroes form the
    # index; its match-ups are wins in the matches that side won, losses otherwise.
    dire_counter_wins = pivot_combiner(pair_pivots_counter(df_matches_dire, permutations2, permutations1))
    dire_counter_losses = pivot_combiner(pair_pivots_counter(df_matches_radiant, permutations2, permutations1))
    radiant_counter_wins = pivot_combiner(pair_pivots_counter(df_matches_radiant, permutations1, permutations2))
    radiant_counter_losses = pivot_combiner(pair_pivots_counter(df_matches_dire, permutations1, permutations2))

    # fill_value=0: a match-up seen in only one of the two tables counts as 0 in
    # the other, instead of making the sum (and the win rate) NaN.
    counter_wins = dire_counter_wins.add(radiant_counter_wins, fill_value=0)
    counter_losses = dire_counter_losses.add(radiant_counter_losses, fill_value=0)
    counter_games = counter_wins.add(counter_losses, fill_value=0)

    # A match-up with games but no recorded wins gets 0.0; one with no games is 0/0 -> NaN.
    counter_winrate = counter_wins.div(counter_games, fill_value=0)
    return counter_winrate.round(2)

In [144]:
# Collapse the per-player rows into one row per match, then build the two
# hero-pair win-rate tables from that per-match frame.
df_matches_grouped = table_cleaner(df_matches)
synergy_pivot = synergy_wrapper(df_matches_grouped)
counter_pivot = counter_wrapper(df_matches_grouped)

In [145]:
# Persist the per-match frame and both win-rate pivots as pickles under ../data.
df_matches_grouped.to_pickle("../data/df_matches_grouped.pkl")
synergy_pivot.to_pickle("../data/synergy_pivot.pkl")
counter_pivot.to_pickle("../data/counter_pivot.pkl")

In [148]:
# Read the saved per-match frame back from disk and display it.
pd.read_pickle("../data/df_matches_grouped.pkl")

Unnamed: 0,match_id,match_seq_num,radiant_win,start_time,duration,avg_mmr,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,dire_1,dire_2,dire_3,dire_4,dire_5
0,5510478612,4620417806,True,1594586690,2721,3174,7,70,8,30,105,119,73,90,31,18
1,5510478615,4620399961,False,1594586691,1155,3623,86,49,84,75,57,40,120,30,1,60
2,5510478700,4620425526,False,1594586691,3281,3243,108,71,8,26,47,59,74,30,21,14
3,5510478702,4620421211,True,1594586695,2988,2405,34,8,10,86,53,48,18,25,85,75
4,5510478708,4620414900,True,1594586700,2277,4464,67,22,37,107,14,74,7,84,45,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5513436913,4623081152,False,1594771169,2254,3031,69,63,84,7,44,2,45,22,27,73
99996,5513436918,4623070259,True,1594771180,1688,5019,86,53,1,19,84,104,39,113,26,54
99997,5513437003,4623069709,False,1594771184,1790,864,15,108,21,30,53,126,104,110,4,84
99998,5513437208,4623072527,True,1594771189,1928,4327,27,93,6,84,81,10,67,120,101,104


---------------------------
## Let's make a LogisticRegression

**Didn't work very well**

In [494]:
# Inputs: the ten hero-slot columns (match_id is kept alongside them here).
# Target: whether the radiant side won the match.
model_df = df_matches_grouped[["match_id", "radiant_1", "radiant_2", "radiant_3", "radiant_4", "radiant_5", "dire_1", "dire_2", "dire_3", "dire_4", "dire_5"]]
target_df = df_matches_grouped["radiant_win"]
model_df

Unnamed: 0,match_id,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,dire_1,dire_2,dire_3,dire_4,dire_5
0,5510478612,7,70,8,30,105,119,73,90,31,18
1,5510478615,86,49,84,75,57,40,120,30,1,60
2,5510478700,108,71,8,26,47,59,74,30,21,14
3,5510478702,34,8,10,86,53,48,18,25,85,75
4,5510478708,67,22,37,107,14,74,7,84,45,44
...,...,...,...,...,...,...,...,...,...,...,...
99995,5513436913,69,63,84,7,44,2,45,22,27,73
99996,5513436918,86,53,1,19,84,104,39,113,26,54
99997,5513437003,15,108,21,30,53,126,104,110,4,84
99998,5513437208,27,93,6,84,81,10,67,120,101,104


In [499]:
radiant_slots = ["radiant_1", "radiant_2", "radiant_3", "radiant_4", "radiant_5"]
dire_slots = ["dire_1", "dire_2", "dire_3", "dire_4", "dire_5"]
feature_set = radiant_slots + dire_slots


def team_hero_features(matches, slot_columns, prefix):
    """Return one column per hero, counting how many of the team's slots hold it.

    The count is 0 or 1 as long as a hero appears at most once per team.
    Columns are named ``{prefix}_{hero_id}``.
    """
    combined = None
    for slot in slot_columns:
        dummies = pd.get_dummies(matches[slot], prefix=prefix, dtype=int)
        # fill_value=0: a hero that never appears in one slot would otherwise
        # turn its whole column into NaN when the dummies are added.
        combined = dummies if combined is None else combined.add(dummies, fill_value=0)
    return combined.astype(int)


# Read the hero ids from df_matches_grouped rather than model_df: this cell
# overwrites model_df with columns that are also named radiant_1, dire_1, ...,
# so reading from model_df would silently use those on a re-run.
radiant_features = team_hero_features(df_matches_grouped, radiant_slots, "radiant")
dire_features = team_hero_features(df_matches_grouped, dire_slots, "dire")

model_df = pd.concat([radiant_features, dire_features], axis=1)

In [500]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and therefore every score computed from it, is the
# same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(model_df, target_df, test_size=0.1, random_state=42)

In [501]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with scikit-learn's default settings on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [502]:
# Predicted radiant_win labels for the training and the test split.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

In [527]:
# Attach the predictions and the true labels to the feature frames for inspection.
# The prediction lists are assigned by position; y_train / y_test align on the index.
# NOTE(review): X_train / X_test now carry two non-feature columns, so re-running the
# fit/predict cells after this one would feed those columns to the model as features.
X_train["predictions"] = train_predictions.tolist()
X_train["target"] = y_train

X_test["predictions"] = test_predictions.tolist()
X_test["target"] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["predictions"] = train_predictions.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["target"] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["predictions"] = test_predictions.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

In [514]:
# Show the first 100 training rows, including the predictions and target columns.
X_train.head(100)

Unnamed: 0,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,radiant_6,radiant_7,radiant_8,radiant_9,radiant_10,radiant_11,radiant_12,radiant_13,radiant_14,radiant_15,radiant_16,radiant_17,radiant_18,radiant_19,radiant_20,radiant_21,radiant_22,radiant_23,radiant_25,radiant_26,radiant_27,radiant_28,radiant_29,radiant_30,radiant_31,radiant_32,radiant_33,radiant_34,radiant_35,radiant_36,radiant_37,radiant_38,radiant_39,radiant_40,radiant_41,radiant_42,radiant_43,radiant_44,radiant_45,radiant_46,radiant_47,radiant_48,radiant_49,radiant_50,radiant_51,radiant_52,radiant_53,radiant_54,radiant_55,radiant_56,radiant_57,radiant_58,radiant_59,radiant_60,radiant_61,radiant_62,radiant_63,radiant_64,radiant_65,radiant_66,radiant_67,radiant_68,radiant_69,radiant_70,radiant_71,radiant_72,radiant_73,radiant_74,radiant_75,radiant_76,radiant_77,radiant_78,radiant_79,radiant_80,radiant_81,radiant_82,radiant_83,radiant_84,radiant_85,radiant_86,radiant_87,radiant_88,radiant_89,radiant_90,radiant_91,radiant_92,radiant_93,radiant_94,radiant_95,radiant_96,radiant_97,radiant_98,radiant_99,radiant_100,radiant_101,radiant_102,radiant_103,radiant_104,radiant_105,radiant_106,radiant_107,radiant_108,radiant_109,radiant_110,radiant_111,radiant_112,radiant_113,radiant_114,radiant_119,radiant_120,radiant_121,radiant_126,radiant_128,radiant_129,dire_1,dire_2,dire_3,dire_4,dire_5,dire_6,dire_7,dire_8,dire_9,dire_10,dire_11,dire_12,dire_13,dire_14,dire_15,dire_16,dire_17,dire_18,dire_19,dire_20,dire_21,dire_22,dire_23,dire_25,dire_26,dire_27,dire_28,dire_29,dire_30,dire_31,dire_32,dire_33,dire_34,dire_35,dire_36,dire_37,dire_38,dire_39,dire_40,dire_41,dire_42,dire_43,dire_44,dire_45,dire_46,dire_47,dire_48,dire_49,dire_50,dire_51,dire_52,dire_53,dire_54,dire_55,dire_56,dire_57,dire_58,dire_59,dire_60,dire_61,dire_62,dire_63,dire_64,dire_65,dire_66,dire_67,dire_68,dire_69,dire_70,dire_71,dire_72,dire_73,dire_74,dire_75,dire_76,dire_77,dire_78,dire_79,dire_80,dire_81,dire_82,dire_83,dire_84,dire_85,dire_
86,dire_87,dire_88,dire_89,dire_90,dire_91,dire_92,dire_93,dire_94,dire_95,dire_96,dire_97,dire_98,dire_99,dire_100,dire_101,dire_102,dire_103,dire_104,dire_105,dire_106,dire_107,dire_108,dire_109,dire_110,dire_111,dire_112,dire_113,dire_114,dire_119,dire_120,dire_121,dire_126,dire_128,dire_129,predictions,target
96031,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
43674,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
82232,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
85000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
10592,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1
88429,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
6798,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
44157,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5786,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
16671,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [523]:
# Training accuracy: flag each row whose prediction equals the target (1/0),
# then take the share of flagged rows.
X_train["prediction_correct"] = np.where(X_train["predictions"] == X_train["target"], 1, 0)

X_train["prediction_correct"].sum() / len(X_train["prediction_correct"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["prediction_correct"] = np.where(X_train["predictions"] == X_train["target"], 1, 0)


0.5775333333333333

In [528]:
# Test accuracy: flag each row whose prediction equals the target (1/0),
# then take the share of flagged rows.
X_test["prediction_correct"] = np.where(X_test["predictions"] == X_test["target"], 1, 0)

X_test["prediction_correct"].sum() / len(X_test["prediction_correct"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["prediction_correct"] = np.where(X_test["predictions"] == X_test["target"], 1, 0)


0.564

In [503]:
from sklearn.metrics import r2_score

# NOTE(review): r2_score is a regression metric, while these predictions are the class
# labels returned by model.predict, so the values printed here are hard to interpret.
# The prediction_correct cells compute plain accuracy for the same predictions.
print("Training:", r2_score(y_train, train_predictions))
print("Testing:", r2_score(y_test, test_predictions))

Training: -0.697135455919792
Testing: -0.7514454647283788
