In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
%matplotlib notebook

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/esports-earnings-for-players-teams-by-game/highest_earning_teams.csv
/kaggle/input/esports-earnings-for-players-teams-by-game/highest_earning_players.csv
/kaggle/input/esports-earnings-for-players-teams-by-game/country-and-continent-codes-list.csv


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px



**My goal for this project will be to create graphics that display which demographics in esports are the most profitable, from the perspective of both a potential investor/team organizer but also a player.**

# Data Processing

In [3]:
countries = pd.read_csv('/kaggle/input/esports-earnings-for-players-teams-by-game/country-and-continent-codes-list.csv')
players = pd.read_csv('/kaggle/input/esports-earnings-for-players-teams-by-game/highest_earning_players.csv')
teams = pd.read_csv('/kaggle/input/esports-earnings-for-players-teams-by-game/highest_earning_teams.csv')

In [4]:
countries = countries.rename(columns={'Two_Letter_Country_Code': 'CountryCode'}) # matching country code column names
#countries = countries.drop("Three_Letter_Country_Code", axis=1)
countries['Country_Name'] = countries['Country_Name'].replace('United Kingdom of Great Britain & Northern Ireland', 'United Kingdom') #resizing a big country name for better chart visualization
countries = countries[['Continent_Name', 'Country_Name', 'CountryCode']]
countries['Country_Name'] = countries['Country_Name'].str.split(',').str[0] # removing all names after the comma
countries.head()

Unnamed: 0,Continent_Name,Country_Name,CountryCode
0,Asia,Afghanistan,AF
1,Europe,Albania,AL
2,Antarctica,Antarctica (the territory South of 60 deg S),AQ
3,Africa,Algeria,DZ
4,Oceania,American Samoa,AS


In [5]:
countries = countries.drop_duplicates() #removing duplicate values
countries

Unnamed: 0,Continent_Name,Country_Name,CountryCode
0,Asia,Afghanistan,AF
1,Europe,Albania,AL
2,Antarctica,Antarctica (the territory South of 60 deg S),AQ
3,Africa,Algeria,DZ
4,Oceania,American Samoa,AS
...,...,...,...
257,Africa,Zambia,ZM
258,Oceania,Disputed Territory,XX
259,Asia,Iraq-Saudi Arabia Neutral Zone,XE
260,Asia,United Nations Neutral Zone,XD


In [6]:
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PlayerId       1000 non-null   int64  
 1   NameFirst      1000 non-null   object 
 2   NameLast       1000 non-null   object 
 3   CurrentHandle  1000 non-null   object 
 4   CountryCode    1000 non-null   object 
 5   TotalUSDPrize  1000 non-null   float64
 6   Game           1000 non-null   object 
 7   Genre          1000 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [7]:
players['CountryCode'] = players['CountryCode'].str.upper() # capitalizing the countries codes
players = players.merge(countries, on='CountryCode', how='left') # merging players and countries datasets by CountryCode
players['TotalUSDPrize'] = players['TotalUSDPrize'].astype('int64') # converting prize values from float to integer
players = players.sort_values('TotalUSDPrize', ascending=False).reset_index(drop=True)
players

Unnamed: 0,PlayerId,NameFirst,NameLast,CurrentHandle,CountryCode,TotalUSDPrize,Game,Genre,Continent_Name,Country_Name
0,3304,Johan,Sundstein,N0tail,DK,6952596,Dota 2,Multiplayer Online Battle Arena,Europe,Denmark
1,3822,Jesse,Vainikka,JerAx,FI,6470000,Dota 2,Multiplayer Online Battle Arena,Europe,Finland
2,30451,Anathan,Pham,ana,AU,6000411,Dota 2,Multiplayer Online Battle Arena,Oceania,Australia
3,2811,Sébastien,Debs,Ceb,FR,5554297,Dota 2,Multiplayer Online Battle Arena,Europe,France
4,18897,Topias,Taavitsainen,Topson,FI,5470902,Dota 2,Multiplayer Online Battle Arena,Europe,Finland
...,...,...,...,...,...,...,...,...,...,...
1027,83085,En Hao,Chen,Gua,TW,26666,Arena of Valor,Multiplayer Online Battle Arena,Asia,Taiwan
1028,83083,Yu-Yan,Su,GaDuo,TW,26666,Arena of Valor,Multiplayer Online Battle Arena,Asia,Taiwan
1029,70343,-,-,Rain,TW,26645,Arena of Valor,Multiplayer Online Battle Arena,Asia,Taiwan
1030,12241,Komklit,Wongsawat,O7T-V1,TH,25941,Arena of Valor,Multiplayer Online Battle Arena,Asia,Thailand


In [8]:
relevant_countries = players.groupby('Country_Name').size().reset_index()# Filtering out Countries with less than 10 players
relevant_countries = relevant_countries[relevant_countries[0]>=10]
relevant_countries = players[players["Country_Name"].isin(relevant_countries["Country_Name"])]
relevant_countries

Unnamed: 0,PlayerId,NameFirst,NameLast,CurrentHandle,CountryCode,TotalUSDPrize,Game,Genre,Continent_Name,Country_Name
0,3304,Johan,Sundstein,N0tail,DK,6952596,Dota 2,Multiplayer Online Battle Arena,Europe,Denmark
1,3822,Jesse,Vainikka,JerAx,FI,6470000,Dota 2,Multiplayer Online Battle Arena,Europe,Finland
3,2811,Sébastien,Debs,Ceb,FR,5554297,Dota 2,Multiplayer Online Battle Arena,Europe,France
4,18897,Topias,Taavitsainen,Topson,FI,5470902,Dota 2,Multiplayer Online Battle Arena,Europe,Finland
5,3145,Kuro,Takhasomi,KuroKy,DE,5193382,Dota 2,Multiplayer Online Battle Arena,Europe,Germany
...,...,...,...,...,...,...,...,...,...,...
1027,83085,En Hao,Chen,Gua,TW,26666,Arena of Valor,Multiplayer Online Battle Arena,Asia,Taiwan
1028,83083,Yu-Yan,Su,GaDuo,TW,26666,Arena of Valor,Multiplayer Online Battle Arena,Asia,Taiwan
1029,70343,-,-,Rain,TW,26645,Arena of Valor,Multiplayer Online Battle Arena,Asia,Taiwan
1030,12241,Komklit,Wongsawat,O7T-V1,TH,25941,Arena of Valor,Multiplayer Online Battle Arena,Asia,Thailand


In [9]:
teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 928 entries, 0 to 927
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TeamId            928 non-null    int64  
 1   TeamName          928 non-null    object 
 2   TotalUSDPrize     928 non-null    float64
 3   TotalTournaments  928 non-null    int64  
 4   Game              928 non-null    object 
 5   Genre             928 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 43.6+ KB


In [10]:
teams['TotalUSDPrize'] = teams['TotalUSDPrize'].astype('int64')
teams

Unnamed: 0,TeamId,TeamName,TotalUSDPrize,TotalTournaments,Game,Genre
0,760,San Francisco Shock,3105000,7,Overwatch,First-Person Shooter
1,776,London Spitfire,1591136,13,Overwatch,First-Person Shooter
2,768,New York Excelsior,1572618,18,Overwatch,First-Person Shooter
3,773,Philadelphia Fusion,1186278,15,Overwatch,First-Person Shooter
4,766,Seoul Dynasty,1130000,6,Overwatch,First-Person Shooter
...,...,...,...,...,...,...
923,24781,Rex Regum Qeon,6286,2,Arena of Valor,Multiplayer Online Battle Arena
924,261,Alliance,4000,1,Arena of Valor,Multiplayer Online Battle Arena
925,713,Marines Esports,3429,1,Arena of Valor,Multiplayer Online Battle Arena
926,608,British National Team,2500,1,Arena of Valor,Multiplayer Online Battle Arena


# Top Players

In [11]:
top_players = players[['NameFirst', 'CurrentHandle', 'TotalUSDPrize', 'Game', 'Country_Name']].head(20).sort_values('TotalUSDPrize', ascending=True)
top_players = top_players[['CurrentHandle', 'TotalUSDPrize', 'Game']]
top_players['CurrentHandle'] = top_players['CurrentHandle'].replace('Somnus丶M', 'Somnus')
fig = px.bar(top_players, y='CurrentHandle', x='TotalUSDPrize', color='Game', title = 'Top 20 Players', barmode='group',color_discrete_sequence=px.colors.qualitative.Pastel, orientation='h')
fig.update_layout(legend_title='Game')
fig.update_xaxes(title_text='Total Prize (in millions USD)')  # Set x-axis label
fig.update_yaxes(title_text='In-Game Tag')
fig.show()

It appears that nearly every player in the top 20 players plays DOTA 2.

# Players count by country

In [12]:
ranking = relevant_countries['Country_Name'].value_counts().sort_values(ascending=True)
fig = px.bar(ranking, y= ranking.index, x= ranking.values, orientation = 'h', title = 'Players by Country', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_xaxes(title_text='Player Counts')  # Set x-axis label
fig.update_yaxes(title_text='Country Name')
fig.show()

Looks like Korea and China have far and away the most top players - they must be doing something right!

# Average Prize by Country

In [13]:
earnings_per_country = relevant_countries.groupby('Country_Name').mean()
earnings_per_country = earnings_per_country.sort_values(ascending=True, by='TotalUSDPrize')
fig = px.bar(earnings_per_country, y= earnings_per_country.index, x= ['TotalUSDPrize'], orientation = 'h', title = 'Average Prize by Country', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_xaxes(title_text='Total Prize (in millions USD)')  # Set x-axis label
fig.update_yaxes(title_text='Countries') #set y-axis label
fig.update_layout(legend_title='TotalUSDPrize', showlegend=False)
fig.show()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Looks like the many of the most successful countries at esports are European, with the Nordic countries Finland and Denmark leading the pack, and Russia/Ukraine in the top 5 as well. 

# Prize by genre

In [14]:
genre = relevant_countries.groupby(['Genre']).sum().sort_values(ascending=True, by='TotalUSDPrize')
fig = px.bar(genre, y= genre.index, x= 'TotalUSDPrize', orientation = 'h', title = 'Prize by Genre', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_xaxes(title_text='Total Prize (in millions USD)')  # Set x-axis label
fig.update_yaxes(title_text='Genre') #set y-axis label
fig.update_layout(legend_title='TotalUSDPrize', showlegend=False)
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



MOBA games seem to have the most money invested into tournament prizes - although the other genres like FPS have a substantial earning capability as well.

# Prize by teams

In [15]:
prize_team = teams.groupby(['TeamName'])['TotalUSDPrize'].sum().sort_values(ascending=True).tail(20)
fig = px.bar(prize_team, y= prize_team.index, x= 'TotalUSDPrize', orientation = 'h', title = 'Prize by Teams', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_xaxes(title_text='Total Prize (in millions USD)')  # Set x-axis label
fig.update_yaxes(title_text='Genre') #set y-axis label
fig.update_layout(legend_title='TotalUSDPrize', showlegend=False)
fig.show()


If you are looking invest into any given team, looks like OG (which focuses on DOTA 2) and Team Liquid (which has various games) are signing players with the most cash.

# Prize by Game

In [16]:
games_earnings = teams.groupby(['Game'])['TotalUSDPrize'].sum().sort_values(ascending=True)
fig = px.bar(games_earnings, y= games_earnings.index, x= 'TotalUSDPrize', orientation = 'h', title = 'Prize by Game', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_xaxes(title_text='Total Prize (in millions USD)')  # Set x-axis label
fig.update_yaxes(title_text='Game') #set y-axis label
fig.update_layout(legend_title='TotalUSDPrize', showlegend=False)
fig.show()


To no-one's surprise, it looks like DOTA 2 has the biggest prizes surrounding its scene, with the runner-ups CSGO and League of Legends not even coming close.

# Average Tournament Prize by Game

In [17]:
games_earnings_index = teams.groupby(['Game'])['TotalUSDPrize'].sum().sort_index()
tournaments_count_index = teams.groupby('Game')['TotalTournaments'].sum().sort_index()
avrg = (games_earnings_index / tournaments_count_index).astype('int64')
avrg = avrg.sort_values(ascending=True)
fig = px.bar(avrg, y= avrg.index, x= avrg.values, orientation = 'h', title = 'Average Prize by Game', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_xaxes(title_text='Average Tournament Prize (in millions USD)')  # Set x-axis label
fig.update_yaxes(title_text='Game') #set y-axis label
fig.update_layout(legend_title='TotalUSDPrize', showlegend=False)
fig.show()

While DOTA 2 still takes the first place when it comes to average tournament prize, Arena of Valor, the game with the lowest total prize pool actually comes a close second for the highest average. It looks like MOBA's are the way to go if you want to strike it rich.

# **What are the chances you'll earn at least a million dollars depending on the esport you play?**

Let's develop a logistic regression model to figure this out.

In [18]:
#one-hot-encode the data
one_hot_encoded = pd.get_dummies(players['Game'])
one_hot_encoded
encoded_data = pd.concat([players, one_hot_encoded], axis=1)
encoded_data

#classify dependent variable
encoded_data['TotalUSDPrize'] = encoded_data['TotalUSDPrize'].apply(lambda x: 1 if x > 1000000 else 0)

In [19]:
# Define our features/design matrix X
X = encoded_data[["Counter-Strike: Global Offensive", "Dota 2", "Fortnite", "Hearthstone", "Heroes of the Storm", "League of Legends", "Overwatch", "PUBG", "Starcraft II"]]
Y = encoded_data['TotalUSDPrize']

In [20]:
#split our data into training and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=42)
print(f"Training Data Size: {len(X_train)}")
print(f"Test Data Size: {len(X_test)}")
X_train

Training Data Size: 774
Test Data Size: 258


Unnamed: 0,Counter-Strike: Global Offensive,Dota 2,Fortnite,Hearthstone,Heroes of the Storm,League of Legends,Overwatch,PUBG,Starcraft II
305,1,0,0,0,0,0,0,0,0
221,0,0,0,0,0,1,0,0,0
235,1,0,0,0,0,0,0,0,0
697,0,0,0,0,1,0,0,0,0
525,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
87,0,1,0,0,0,0,0,0,0
330,0,0,0,0,0,0,0,0,1
466,0,0,1,0,0,0,0,0,0
121,0,0,0,0,0,1,0,0,0


In [21]:
#fit our model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression(fit_intercept=True,penalty='l2')
lr.fit(X_train, Y_train)
lr.intercept_, lr.coef_
lr.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,

Let's test our model on League of Legends!

In [22]:
#Testing Accuracy, Precision, and Recall of our model
train_accuracy = np.sum(lr.predict(X_train) == Y_train)/len(Y_train)
test_accuracy = np.sum(lr.predict(X_test) == Y_test)/len(Y_test)

print(f"Train accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")
Y_test_pred = lr.predict(X_test)

precision = sum((Y_test_pred == Y_test) & (Y_test_pred == 1))/ sum(Y_test_pred)
recall = sum((Y_test_pred == Y_test) & (Y_test_pred == 1))/ sum(Y_test)

print(f'precision = {precision:.4f}')
print(f'recall = {recall:.4f}')

Train accuracy: 0.9406
Test accuracy: 0.9535
precision = 0.7600
recall = 0.7600


Let's test out our model on a hypothetical League of Legends player.

In [23]:
lol = X_test.loc[314].values.reshape(1,-1)
predicted_probabilities = lr.predict_proba(lol)
predicted_probabilities


X does not have valid feature names, but LogisticRegression was fitted with feature names



array([[0.98918396, 0.01081604]])

The probability of you making over a million dollars in your career is 1 percent if you play League of Legends. Not looking good for LoL fans out there. What about DOTA 2?

In [24]:
dota2 = X_test.loc[3].values.reshape(1,-1)
predicted_probabilities = lr.predict_proba(dota2)
predicted_probabilities


X does not have valid feature names, but LogisticRegression was fitted with feature names



array([[0.41817199, 0.58182801]])

Dota players really have it made. Top players have a 58 percent chance of making over a million dollars over their career. 

# What did we learn?

If you are an aspiring top player, an esports organization looking to specialize, or a potential investor, its clear that MOBA games, specifically DOTA 2, is the hottest and most profitable venture to invest in. With a majority of top players being millionares, the average tournament prizes being the highest, and the teams earning the most, its clear this game is the most economically profitable.