# Goal: 
## Predict the outcome of a game based on what heroes have been picked 

In [6]:
# import helper_functions.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# %matplotlib inline
plt.style.use('ggplot')

import requests
from bs4 import BeautifulSoup
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw match log. The file has no header row, so the column names are
# supplied here: five Sentinal picks, five Scourge picks, then the "Won" column.
column_names = (
    [f"Sentinal_{slot}" for slot in range(1, 6)]
    + [f"Scourge_{slot}" for slot in range(1, 6)]
    + ["Won"]
)
df = pd.read_csv('dota_games.txt', sep=",", header=None, names=column_names)
df.head(8)

In [None]:
# report, per column, whether any value is missing
df.isna().any()

In [None]:
# Reshape the per-game table into one row per team lineup:
# five hero columns (Hero_1..Hero_5) plus a 0/1 "Won" flag.
hero_cols = [f'Hero_{slot}' for slot in range(1, 6)]
sentinal_cols = [f'Sentinal_{slot}' for slot in range(1, 6)]
scourge_cols = [f'Scourge_{slot}' for slot in range(1, 6)]

sentinal_df = df[sentinal_cols + ['Won']]
scourge_df = df[scourge_cols + ['Won']]


def _team_rows(team_df, team_cols, winner_code, won_flag):
    """Return one team's lineups for the games where ``df.Won == winner_code``.

    The team-specific columns are renamed to Hero_1..Hero_5 and "Won" is set to
    ``won_flag``. ``rename`` and ``assign`` both return new frames, so nothing
    is written into a filtered view of ``df`` (the original in-place edits
    triggered pandas' SettingWithCopy warning and are unreliable).
    """
    rows = team_df.loc[df.Won == winner_code]
    rows = rows.rename(columns=dict(zip(team_cols, hero_cols)))
    return rows.assign(Won=won_flag)


# df.Won == 1 selects the games Sentinal won, df.Won == 2 the games Scourge won
sentinal_winners = _team_rows(sentinal_df, sentinal_cols, winner_code=1, won_flag=1)
scourge_winners = _team_rows(scourge_df, scourge_cols, winner_code=2, won_flag=1)

# the losing side of each game
sentinal_losers = _team_rows(sentinal_df, sentinal_cols, winner_code=2, won_flag=0)
scourge_losers = _team_rows(scourge_df, scourge_cols, winner_code=1, won_flag=0)

# merge the dataframes to create a df that contains all the lineups and if they won or not
lineups = pd.concat([sentinal_winners, scourge_winners, sentinal_losers, scourge_losers])
lineups.reset_index(drop=True, inplace=True)
lineups.head()

In [None]:
# Separate the five hero columns (model inputs) from the game outcome.
features_df = lineups.drop(columns='Won')
# features_df.head()

# Commented-out reference implementation; presumably mirrors hlf.populate_df
# (one indicator column per distinct hero) — TODO confirm against the helper module.
# def populate_df(df):
#     uniques = pd.unique(df.values.ravel('K'))
#     zeros = np.zeros(len(uniques))
#
#     all_dummies = []
#     for row in df.itertuples():
#         i = 1
#         uniques_dic = dict(zip(uniques, zeros))
#         while i < 6:
#             uniques_dic[row[i]] = 1
#             i += 1
#
#         all_dummies.append(uniques_dic)
#     return pd.DataFrame(all_dummies, columns=uniques)

# NOTE(review): `hlf` is not imported anywhere in the visible part of this file
# (the helper_functions import is commented out) — confirm it is in scope.
dummies_df = hlf.populate_df(features_df)

# hero indicators side by side with the outcome column
no_features_df = pd.concat([dummies_df, lineups['Won']], axis=1)
# dummies_csv = no_features_df.to_csv ('dota2_dummies.csv')

In [None]:
# check for class imbalance
sentinal_win_count = len(sentinal_winners)
# counted from the data instead of the hard-coded 15000 - len(sentinal_winners)
scourge_win_count = len(scourge_winners)

outcome_df = pd.get_dummies(lineups.Won)
# NOTE(review): lineups.Won is a per-lineup 0/1 flag, so labelling its dummies
# 'Sentinal'/'Scourge' looks off; names kept because the column is overwritten below.
outcome_df = outcome_df.rename(columns={0: 'Sentinal', 1: 'Scourge'})
sum_wins_by_team = pd.concat([dummies_df, outcome_df], axis=1)

sum_wins_by_team = sum_wins_by_team.groupby(['Sentinal', 'Scourge']).sum()
sum_wins_by_team.reset_index(inplace=True)

# Write the win counts with a single .loc call; the original chained
# `frame['Sentinal'][1] = ...` assigns into a temporary and may not stick.
sum_wins_by_team['Sentinal'] = sum_wins_by_team['Sentinal'].astype(int)
sum_wins_by_team.loc[1, 'Sentinal'] = sentinal_win_count
sum_wins_by_team.loc[0, 'Sentinal'] = scourge_win_count
sum_wins_by_team.drop('Scourge', axis=1, inplace=True)

# Plot the bar chart to show how many times each team won
fig, ax = plt.subplots(figsize=(10, 7.5))
ax.bar(['Sentinal', 'Scourge'], [sentinal_win_count, scourge_win_count], alpha=0.4)

# Switch the grid off explicitly (the old `b=` keyword no longer exists in
# current matplotlib, and calling it twice toggled the grid back on).
ax.grid(False)

# Remove the plot frame lines.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# set labels
ax.set_title("Winning Teams Histograms")
ax.set_xlabel("Team (Sentinal/Scourge)", fontsize=16)
ax.set_ylabel("Matches Won", fontsize=16)

# Save before show(): with non-inline backends the figure is gone after show()
# returns, so saving afterwards writes an empty image.
fig.savefig('classes_hist.png')
plt.show()

In [None]:
# def character_by_attributes(attribute):
#     '''Scrape the names of all heroes that share the given main attribute
#        from the Dota 2 wiki site.'''
#     # Make a get request to retrieve the page
#     html = requests.get(f'https://dota2.gamepedia.com/{attribute}') 

#     # Pass the page contents to beautiful soup for parsing
#     soup = BeautifulSoup(html.content, 'html.parser')

#     # search for the characters by attribute
#     char_raw = soup.findAll('td',{'style':"white-space:nowrap;"})
#     char_list = np.array([item.find('a').attrs['href'].replace('/', '') for item in char_raw])
#     return char_list

In [None]:
# Sort characters based on their main attribute (Strength, Agility, Intelligence)
def _clean_hero_names(raw_names):
    """Turn scraped names into display names: '_' -> ' ' and '%27' -> apostrophe."""
    return [name.replace('_', ' ').replace('%27', "'") for name in raw_names]


# NOTE(review): `hlf` is not imported anywhere in the visible part of this file
# (the helper_functions import is commented out) — confirm it is in scope.
# All three lists get the same cleanup; previously the strength list was left
# raw, so multi-word strength heroes could never match the lineup table.
str_heroes = _clean_hero_names(hlf.character_by_attributes('Strength'))
agi_heroes = _clean_hero_names(hlf.character_by_attributes('Agility'))
int_heroes = _clean_hero_names(hlf.character_by_attributes('Intelligence'))

In [None]:
# new feature: share of the five picks that are strength heroes
# (blank out every strength pick, then count the blanked cells per lineup)
strength_hits = features_df.mask(features_df.isin(str_heroes), None).isnull().sum(axis=1)
str_df = strength_hits.to_frame('Strength') / 5
# str_df.head()

In [None]:
# new feature: share of the five picks that are agility heroes
# (blank out every agility pick, then count the blanked cells per lineup)
agility_hits = features_df.mask(features_df.isin(agi_heroes), None).isnull().sum(axis=1)
agi_df = agility_hits.to_frame('Agility') / 5
# agi_df.head()

In [None]:
# new feature: share of the five picks that are intelligence heroes
# (blank out every intelligence pick, then count the blanked cells per lineup)
intelligence_hits = features_df.mask(features_df.isin(int_heroes), None).isnull().sum(axis=1)
int_df = intelligence_hits.to_frame('Intelligence') / 5
# int_df.head()

In [None]:
# ranged heroes
ranged = np.array([
    'Techies', 'Ancient Apparition', 'Lina', 'Chen',
    'Enchantress', 'Mirana', 'Pugna', 'Arc Warden',
    'Disruptor', 'Drow Ranger', 'Skywrath Mage', 'Oracle',
    'Crystal Maiden', 'Death Prophet', 'Invoker', 'Keeper of the Light',
    'Leshrac', 'Lion', 'Medusa', "Nature's Prophet",
    'Silencer', 'Visage', 'Warlock', 'Windrunner',
    'Witch Doctor', 'Clinkz', 'Viper', 'Dazzle',
    'Lich', 'Lone Druid', 'Puck', 'Queen of Pain',
    'Rubick', 'Sniper', 'Enigma', 'Wisp',
    'Necrophos', 'Phoenix', 'Shadow Demon', 'Shadow Fiend',
    'Tinker', 'Troll Warlord', 'Storm Spirit', 'Razor',
    'Outworld Devourer', 'Venomancer', 'Weaver', 'Winter Wyvern',
    'Bane', 'Huskar', 'Jakiro', 'Shadow Shaman',
    'Vengeful Spirit', 'Zeus', 'Batrider', 'Gyrocopter',
    'Morphling', 'Luna',
])

# new feature: share of the five picks that are ranged heroes.
# Despite its name, melee_df holds the *ranged* share (column 'Ranged').
ranged_hits = features_df.mask(features_df.isin(ranged), None).isnull().sum(axis=1)
melee_df = ranged_hits.to_frame('Ranged') / 5

# melee_df.tail()

In [None]:
# tier list
# NOTE(review): 'Enchan' looks truncated and matches no hero name used elsewhere
# in this file — confirm the intended entry ('Enchantress' is already in tier3).
tier1 = np.array(['Alchemist', 'Enchan', 'Lifestealer', 'Visage',
                  'Wisp', 'Outworld Devourer', 'Weaver',  # was misspelled 'Outerworld Devourer'
                  "Nature's Prophet", 'Rubick'])

tier2 = np.array(['Nyx Assassin', 'Gyrocopter',
                  'Alchemist', 'Puck', 'Chen'])

# A missing comma used to fuse 'Keeper of the Light' and 'Bane' into one
# string, so neither hero was ever matched.
tier3 = np.array(['Shadow Demon', 'Lone Druid', 'Clockwerk', 'Queen of Pain',
                  'Dragon Knight', 'Treant Protector', 'Keeper of the Light',
                  'Bane', 'Jakiro', 'Anti-Mage', 'Timbersaw', 'Storm Spirit',
                  'Razor', 'Enchantress', 'Bounty Hunter', 'Templar Assassin',
                  'Abaddon', 'Crystal Maiden', 'Windrunner', 'Magnus',
                  'Shadow Fiend', 'Leshrac'])

# new feature: share of the five picks that are tier 1 or tier 2 heroes
in_tier12 = features_df.isin(tier1) | features_df.isin(tier2)
op_tiers12_df = in_tier12.sum(axis=1).to_frame('Tiers_12') / 5
# op_tiers12_df.head()

# new feature: share of the five picks that are tier 3 heroes
op_tiers3_df = features_df.isin(tier3).sum(axis=1).to_frame('Tier_3') / 5
# op_tiers3_df.head()

In [None]:
# Safe lane carry characters
# Missing commas used to fuse 'Clinkz'+'Chaos Knight' and 'Lycan'+'Medusa' into
# single strings, so those four heroes were never counted.
carry = np.array(['Alchemist', 'Anti-Mage', 'Arc Warden', 'Bloodseeker', 'Clinkz',
                  'Chaos Knight', 'Drow Ranger', 'Faceless Void', 'Gyrocopter',
                  'Huskar', 'Juggernaut', 'Kunkka', 'Lifestealer', 'Lone Druid',
                  'Luna', 'Lycan', 'Medusa', 'Meepo', 'Morphling', 'Naga Siren',
                  'Phantom Assassin', 'Phantom Lancer', 'Slark', 'Spectre',
                  'Sven', 'Terrorblade', 'Timbersaw', 'Troll Warlord', 'Tiny',
                  'Ursa', 'Vengeful Spirit', 'Venomancer', 'Viper', 'Weaver'])

# new feature: share of the five picks that are carry heroes
carry_df = features_df.isin(carry).sum(axis=1).to_frame('Carry') / 5
# carry_df.head()

In [None]:
# support characters
# A missing comma used to fuse 'Leshrac' and 'Keeper of the Light' into one
# string, so neither hero was counted. The duplicate 'Oracle' entry is dropped.
support = np.array(['Ancient Apparition', 'Bristleback', 'Clockwerk', 'Chen',
                    'Bane', 'Crystal Maiden', 'Dazzle', 'Disruptor', 'Doom',
                    'Earthshaker', 'Enchantress', 'Wisp', 'Jakiro', 'Leshrac',
                    'Keeper of the Light', 'Lich', 'Lina', 'Lion', 'Nyx Assassin',
                    'Ogre Magi', 'Omniknight', 'Oracle', 'Phoenix', 'Rubick',
                    'Shadow Demon', 'Shadow Shaman', 'Silencer', 'Treant Protector',
                    'Undying', 'Visage', 'Warlock', 'Weaver', 'Windrunner', 'Winter Wyvern',
                    'Witch Doctor'])

# new feature: share of the five picks that are support heroes
support_df = features_df.isin(support).sum(axis=1).to_frame('Support') / 5
# support_df.head()

In [None]:
# offlane characters
offlane = np.array(['Abaddon', 'Axe', 'Batrider', 'Beastmaster',  # was misspelled 'Beasmaster'
                    'Brewmaster', 'Bristleback', 'Broodmother',
                    'Centaur Warrunner', 'Clockwerk', 'Dark Seer',
                    'Doom', 'Earthshaker', 'Enchantress', 'Enigma',
                    'Legion Commander', 'Magnus', "Nature's Prophet",
                    'Night Stalker', 'Nyx Assassin', 'Puck',
                    'Sand King', 'Slardar', 'Spirit Breaker',
                    'Tidehunter', 'Undying', 'Underlord'])

# new feature: share of the five picks that are offlane heroes
offlane_df = features_df.isin(offlane).sum(axis=1).to_frame('Offlane') / 5
# offlane_df.head()

In [None]:
# middle lane characters
# A missing comma used to fuse 'Zeus' and 'Skywrath Mage' into one string,
# so neither hero was counted.
mid = np.array(['Alchemist', 'Arc Warden', 'Bloodseeker', 'Bristleback',
                'Broodmother', 'Death Prophet', 'Dragon Knight', 'Ember Spirit',
                'Faceless Void', 'Invoker', 'Wisp', 'Juggernaut',
                'Lina', 'Leshrac', 'Lone Druid', 'Magnus', 'Medusa', 'Meepo',
                'Mirana', "Nature's Prophet", 'Necrophos', 'Outworld Devourer',
                'Puck', 'Pugna', 'Queen of Pain', 'Razor', 'Shadow Fiend', 'Zeus',
                'Skywrath Mage', 'Sniper', 'Storm Spirit', 'Templar Assassin',
                'Tinker', 'Tiny', 'Venomancer', 'Viper', 'Visage', 'Windrunner'])

# new feature: share of the five picks that suit the mid lane
mid_df = features_df.isin(mid).sum(axis=1).to_frame('Mid') / 5
# mid_df.head()

In [None]:
# roaming support characters
# A missing comma used to fuse 'Bane' and 'Night Stalker' into one string,
# so neither hero was counted.
roaming_support = np.array(['Bounty Hunter', 'Earth Spirit', 'Mirana', "Nature's Prophet",
                            'Pudge', 'Riki', 'Tusk', 'Techies', 'Spirit Breaker', 'Bane',
                            'Night Stalker', 'Windrunner', 'Lion', 'Earthshaker', 'Rubick'])

# new feature: share of the five picks that are roaming supports
roam_df = features_df.isin(roaming_support).sum(axis=1).to_frame('Roamer') / 5
# roam_df.head()

In [None]:
# nuke/disable characters
nuke_disable = np.array([
    'Crystal Maiden', 'Lina', 'Lion', 'Oracle', 'Skywrath Mage',
    'Witch Doctor', 'Earthshaker', 'Death Prophet', 'Lich', 'Ancient Apparition',
    'Shadow Shaman', 'Puck', 'Jakiro', 'Necrophos', 'Leshrac',
    'Disruptor', 'Outworld Devourer', 'Bane', 'Ogre Magi', 'Zeus',
    'Doom', 'Dazzle', 'Morphling', 'Nyx Assassin', 'Rubick',
    'Queen of Pain', 'Tinker',
])

# new feature: share of the five picks that are nukers/disablers
# (blank out every matching pick, then count the blanked cells per lineup)
nuke_hits = features_df.mask(features_df.isin(nuke_disable), None).isnull().sum(axis=1)
nuke_disable_df = nuke_hits.to_frame('Nuke/Disable') / 5
# nuke_disable_df.head()

In [None]:
# see which heroes were picked in the largest share of games
# Each game contributes two lineups, so this replaces the hard-coded 15000.
games_played = len(lineups) // 2

sum_picked = (
    dummies_df.sum()
    .sort_values(ascending=False)
    .div(games_played)
    .rename_axis('Hero')
    .reset_index(name='Picked')
)

top15_picked = sum_picked.head(15)

# plot the most picked heroes
# sns.barplot draws on the figure created here; the figure-level sns.catplot
# opened its own figure and left this one empty.
plt.figure(figsize=(12, 8))
sns.barplot(data=top15_picked, x='Hero', y='Picked')
plt.title('Times Picked')
plt.ylabel('Picked (%)')
plt.xticks(rotation=45, ha='right')

# new feature: how many of the 15 most picked heroes are in each lineup.
# isin() needs a plain list of names: given a DataFrame (or Series) it matches
# on index/column labels, so the old call never matched a hero.
# The column is 'Top_Picked' so it no longer collides with top_wins_df's 'Winners'.
top_picked_df = (
    features_df.isin(top15_picked['Hero'].tolist())
    .sum(axis=1)
    .to_frame('Top_Picked')
)
# top_picked_df.head()

In [None]:
# see which heroes won the most games
# .drop() returns a new frame, so nothing is modified in place on a slice.
winners_df = no_features_df.loc[no_features_df.Won == 1].drop(columns='Won')

# wins per hero, most wins first (index = hero name, single column 'Win')
sum_wins = winners_df.sum().sort_values(ascending=False).to_frame('Win')

# same data with the hero name as a regular column, for plotting
sort_sum_wins = sum_wins.rename_axis('Hero').reset_index()

top15_wins = sort_sum_wins.head(15)

# plot the top winners
# sns.barplot draws on the figure created here; the figure-level sns.catplot
# opened its own figure and left this one empty.
plt.figure(figsize=(12, 8))
sns.barplot(data=top15_wins, x='Hero', y='Win')
plt.title('Times Won')
plt.ylabel('Matches Won')
plt.xticks(rotation=45, ha='right')

# new feature: how many of the 15 heroes with the most wins are in each lineup.
# isin() needs a plain list of names: given a DataFrame it matches on
# index/column labels, so the old call never matched a hero.
top_wins_df = (
    features_df.isin(top15_wins['Hero'].tolist())
    .sum(axis=1)
    .to_frame('Winners')
)
# top_wins_df.head()

In [None]:
# see which heroes lost the most games
# .drop() returns a new frame, so nothing is modified in place on a slice.
losers_df = no_features_df.loc[no_features_df.Won == 0].drop(columns='Won')
sum_lose = pd.DataFrame(losers_df.sum(), columns=["Lose"])

# Keep the sorted result: the original discarded it and then read
# `sort_sum_lose` before it was ever assigned (NameError).
sort_sum_lose = (
    sum_lose.sort_values(by=['Lose'], ascending=False)
    .rename_axis('Hero')
    .reset_index()
)

top15_lose = sort_sum_lose.head(15)

# plot the heroes with the most losses
# sns.barplot draws on the figure created here; the figure-level sns.catplot
# opened its own figure and left this one empty.
plt.figure(figsize=(12, 8))
sns.barplot(data=top15_lose, x='Hero', y='Lose')
plt.title('Times Lost')
plt.ylabel('Matches Lost')
plt.xticks(rotation=45, ha='right')

# new feature: how many of the 15 heroes with the most losses are in each lineup.
# isin() needs a plain list of names: given a DataFrame it matches on
# index/column labels, so the old call never matched a hero.
top_lose_df = (
    features_df.isin(top15_lose['Hero'].tolist())
    .sum(axis=1)
    .to_frame('Lost')
)
# top_lose_df.head()

In [None]:
# wins and losses per hero side by side (aligned on the hero index)
win_lose_df = pd.concat([sum_wins, sum_lose], axis=1).rename_axis('Hero').reset_index()
# NOTE(review): a hero with zero losses gets an infinite ratio and sorts first —
# confirm this cannot happen in the data or handle it explicitly.
win_lose_df['Ratio'] = win_lose_df['Win'] / win_lose_df['Lose']

# sort the characters by win/lose ratio
win_lose_df.sort_values(by=['Ratio'], ascending=False, inplace=True)
sort_ratio = win_lose_df[['Hero', 'Ratio']]

# save the win lose df as csv for future use.
ratio_to_csv = win_lose_df.to_csv('dota_win_ratio.csv')

# extract the characters with highest/lowest win to lose ratio
top15_ratio = sort_ratio.head(15)
bot15_ratio = sort_ratio.tail(15)

# plot the highest win to lose ratio
# sns.barplot draws on the figure created here; the figure-level sns.catplot
# opened its own figure and left this one empty.
plt.figure(figsize=(12, 8))
sns.barplot(data=top15_ratio, x='Hero', y='Ratio')
plt.title('Win Lose Ratio')
plt.ylabel('Win/Lose Ratio')
plt.xticks(rotation=45, ha='right')

# two new features: how many of the 15 best / 15 worst win-to-lose-ratio heroes
# are in each lineup.
# isin() needs a plain list of names: given a DataFrame it matches on
# index/column labels, so the old calls never matched a hero.
# Both columns were named 'Lost', colliding with each other and with
# top_lose_df; they now carry distinct names.
high_ratio_df = (
    features_df.isin(top15_ratio['Hero'].tolist())
    .sum(axis=1)
    .to_frame('High_Ratio')
)
# high_ratio_df.head()

low_ratio_df = (
    features_df.isin(bot15_ratio['Hero'].tolist())
    .sum(axis=1)
    .to_frame('Low_Ratio')
)
# low_ratio_df.head()

In [None]:
# 2018 most used characters - patch 7.19
picked_719 = np.array([
    'Mirana', 'Earthshaker', 'Tiny', 'Necrophos', 'Weaver',
    'Winter Wyvern', 'Phoenix', 'Lina', 'Bane', 'Phantom Lancer',
])

# new feature: share of the five picks that are on the patch 7.19 list
# (blank out every matching pick, then count the blanked cells per lineup)
hits_719 = features_df.mask(features_df.isin(picked_719), None).isnull().sum(axis=1)
picked_719_df = hits_719.to_frame('Most_Picked_719') / 5
# picked_719_df.head()

In [None]:
# 2018 most used characters - patch 7.20
picked_720 = np.array([
    'Rubick', 'Lich', 'Phantom Assassin', 'Brewmaster', 'Centaur Warrunner',
    'Dazzle', 'Sand King', 'Tusk', 'Terrorblade',
])

# new feature: share of the five picks that are on the patch 7.20 list
# (blank out every matching pick, then count the blanked cells per lineup)
hits_720 = features_df.mask(features_df.isin(picked_720), None).isnull().sum(axis=1)
picked_720_df = hits_720.to_frame('Most_Picked_720') / 5
# picked_720_df.head()

In [None]:
# concatenate all the features together
# (`high_ratio_dftop_lose_df` was two names run together; high_ratio_df is
# already listed, so the missing entry is top_lose_df)
cleaned_df = pd.concat([dummies_df, melee_df, str_df, agi_df, int_df,
                        op_tiers12_df, op_tiers3_df, carry_df, support_df,
                        offlane_df, mid_df, roam_df, nuke_disable_df, top_wins_df,
                        high_ratio_df, low_ratio_df, top_lose_df,
                        top_picked_df, picked_719_df, picked_720_df,
                        lineups['Won']], axis=1)

# scaling the dataset
# NOTE(review): the 'Won' label is standardised together with the features —
# confirm downstream code expects a scaled target rather than 0/1.
cols = cleaned_df.columns
scaler = StandardScaler()
scaled_features = scaler.fit_transform(cleaned_df)

scaled_features_df = pd.DataFrame(scaled_features, columns=cols, index=cleaned_df.index)

# save the new dataframe as a csv file
# export_csv = cleaned_df.to_csv (r'C:\Users\GILOR\Desktop\data_science\Flatiron_projects\Dota_Victory_Classification\dota2_cleaned.csv', 
#                                  index = None, header=True)

# the closing quote was missing here, which made the whole cell a SyntaxError
scaled_csv = scaled_features_df.to_csv('dota2_scaled.csv')

scaled_features_df.tail()

In [None]:
# test_df = pd.concat([dummies_df, melee_df, str_df, agi_df, int_df, 
#                      carry_df, support_df, offlane_df, mid_df, roam_df, 
#                      nuke_disable_df, top_wins_df, lineups['Won']], axis=1)
# debug_csv = test_df.to_csv (r'C:\Users\GILOR\Desktop\data_science\Flatiron_projects\Dota_Victory_Classification\dota2_debugging.csv', 
#                                  index = None, header=True)

In [None]:
def create_hist(df, column, save=None):
    """Plot a 5-bin density histogram of ``df[column]`` and optionally save it.

    Parameters
    ----------
    df : mapping of column name to values (presumably a pandas DataFrame)
        Source of the data; only ``df[column]`` is read.
    column : str
        Column to plot; also used in the title, x label and file name.
    save : truthy or None, optional
        When truthy, the figure is written to ``'<column>_Histogram.png'``
        in the working directory.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(figsize=(8, 5))

    # Remove the plot frame lines.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # set labels
    ax.set_title(f"{column} Heroes in Team Histogram")
    ax.set_xlabel(f"{column} Per Team", fontsize=16)
    ax.set_ylabel("Matches Won", fontsize=16)

    # Plot the histogram
    ax.hist(df[column], bins=5, alpha=0.7, density=True)

    if save:
        # A '/' in the column name (e.g. 'Nuke/Disable') would be read as a
        # directory separator, so it is replaced in the file name.
        safe_name = str(column).replace('/', '_')
        # Save before show(): with non-inline backends the figure is gone
        # after show() returns, so saving afterwards writes an empty image.
        fig.savefig(f'{safe_name}_Histogram.png')
    plt.show()

In [None]:
# evaluate how the winning teams were set up
winners_df = cleaned_df.loc[cleaned_df.Won == 1]

# only the engineered features: skip the per-hero indicator columns and the label
engineered_columns = [
    column for column in winners_df.columns
    if column not in dummies_df.columns and column != 'Won'
]

# create_hist is defined in this file; the previous `hlf.create_hist` call
# relied on a module that is never imported here.
for column in engineered_columns:
    create_hist(winners_df, column=column)