## PSEUDOCODE STRUCTURE

1. API Call #1 to get all tournament IDs (or at least roughly the top 1,000)
2. API Call #2 to get all the data for each tournament ID we scraped in step 1
3. Histogram to show year by year total earnings potential in these tournaments (maybe by month/year if available)
4. Visualize the highest-earning games, the highest-earning players, and the games with the highest average prize per tournament
5. LDA to generate fake tournament names for fun

In [4]:
import requests
import pandas as pd
import json
import csv
import time
import datetime as dt

In [5]:
import plotly as py
import matplotlib as plt
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly_express as px
import cufflinks as cf
import urllib3

# Silence urllib3's InsecureRequestWarning; the API calls in this notebook pass verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
py.offline.init_notebook_mode() #this sets plotly to offline mode (aka no cloud comms)
# Switch cufflinks (the DataFrame.iplot helper used below) to offline plotting.
# NOTE(review): connected=True presumably loads plotly.js from a CDN -- confirm.
cf.go_offline(connected=True)

ModuleNotFoundError: No module named 'plotly'

In [None]:
# This cell calls the API and collects the tournament records into a DataFrame.
# Start from an empty frame that already has the expected columns, and begin
# paging at offset 0.
tournament_columns = [
    'TournamentId',
    'GameId',
    'TournamentName',
    'StartDate',
    'EndDate',
    'Location',
    'Teamplay',
    'TotalUSDPrize',
]
tidjson = pd.DataFrame(columns=tournament_columns)
offsetiter = 0
"""
Richie:
Could use: for offset in range(0, 30_000, 100) if you want, up to you

Put your API public key into environment variables.
There's probably a file at ~/.bash_profile or something like that
Hackers type the following to edit the bash profile:
nano ~/.bash_profile
Go to the end of the file and type
export ESPORTS_API_KEY="<your key>"
Save and close.

Close your terminal and reopen (or use source command for bonus points)
Now you can access the environment variable like:
os.environ["ESPORTS_API_KEY"]

Don't let people steal your api keys!
"""

import os

# The API key is read from the environment so it is never committed with the
# notebook. Set ESPORTS_API_KEY before running this cell; the key that used to
# be embedded here should be treated as leaked and rotated.
api_key = os.environ["ESPORTS_API_KEY"]

# Request up to 300 pages of 100 tournaments each. Query parameters go through
# `params=` so the URL is built correctly; the previous backslash-continued
# f-string leaked the next line's indentation into the query string, mangling
# the `offset` parameter name.
tournament_pages = []
for _ in range(300):
    response = requests.get(
        "http://api.esportsearnings.com/v0/LookupRecentTournaments",
        params={"apikey": api_key, "offset": offsetiter},
        timeout=30,
    )
    response.raise_for_status()
    page = response.json()
    if not page:
        # An empty page means there is nothing further to fetch.
        break
    tournament_pages.append(pd.DataFrame(page))
    offsetiter += 100
    time.sleep(1)  # API restriction - calls must be 1 second apart

# DataFrame.append was removed in pandas 2.0; concatenate all pages once.
tidjson = pd.concat([tidjson, *tournament_pages], ignore_index=True, sort=False)

In [None]:
# Keep one row per tournament, then coerce the columns used later to fixed dtypes.
tournament_dtypes = {
    "TournamentId": "int64",
    "GameId": "int64",
    "TournamentName": "str",
    "TotalUSDPrize": "float",
}
tidjson = tidjson.drop_duplicates(subset="TournamentId")
tidjson = tidjson.astype(tournament_dtypes)
tidjson

In [None]:
"""
Richie:
Use set() instead of list() to get unique elements in list.
Once you do that, you won't even need a function.
New version below.

Another thought: this data is already in a dataframe so just use .unique()
tidjson['GameId'].unique().tolist()
"""

from typing import Hashable, Iterable, List


def getUniqueGameIds(listinput: Iterable[Hashable]) -> List[Hashable]:
    """Return the distinct values of ``listinput`` in first-seen order.

    Args:
        listinput: Any iterable of hashable values (a list, a generator, a
            pandas Series of game ids, ...).

    Returns:
        A new list holding each distinct value once, ordered by first
        occurrence. An empty iterable yields an empty list.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # a single pass instead of rescanning a growing list for every element.
    return list(dict.fromkeys(listinput))

In [None]:
print(getUniqueGameIds(tidjson['GameId']))

In [None]:
# This cell calls the API once per distinct game id and collects the game
# records into a DataFrame.
import os

gamejson = pd.DataFrame(columns = ['GameName', 'TotalUSDPrize', 'TotalTournaments', 'TotalPlayers'])

# The API key is read from the environment so it is never committed with the
# notebook. Set ESPORTS_API_KEY before running this cell.
api_key = os.environ["ESPORTS_API_KEY"]

# Query parameters go through `params=` so the URL is built correctly; the
# previous backslash-continued f-string leaked the next line's indentation
# into the query string, mangling the `gameid` parameter name.
game_records = []
for gameid in getUniqueGameIds(tidjson['GameId']):
    response = requests.get(
        "http://api.esportsearnings.com/v0/LookupGameById",
        params={"apikey": api_key, "gameid": gameid},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    # NOTE(review): a lookup presumably returns one JSON object per game; a
    # list payload is accepted as well, matching what DataFrame.append allowed.
    game_records.extend(payload if isinstance(payload, list) else [payload])
    time.sleep(1)  # API restriction - calls must be 1 second apart

# DataFrame.append was removed in pandas 2.0; build the frame in one step.
gamejson = pd.concat([gamejson, pd.DataFrame(game_records)], ignore_index=True, sort=False)

In [None]:
# Discard rows with missing values, then coerce each column to its final dtype.
game_dtypes = {
    "GameName": "str",
    "TotalUSDPrize": "float",
    "TotalTournaments": "int64",
    "TotalPlayers": "int64",
}
gamejson = gamejson.dropna()
gamejson = gamejson.astype(game_dtypes)

In [None]:
gamejson

In [6]:
# Ten games with the largest total prize money, biggest first.
games_by_total_prize = gamejson.sort_values('TotalUSDPrize', ascending=False)
gamejsonsorted = games_by_total_prize.iloc[:10]
gamejsonsorted

NameError: name 'gamejson' is not defined

In [None]:
# Bar chart of the ten games with the most total prize money.
gamejsonsorted.iplot(
    kind='bar',
    x='GameName',
    y='TotalUSDPrize',
    yTitle='Tournament Earnings in USD',
    title='Most Lucrative ESports Games',
)

In [None]:
# Which games pay the most per tournament on average?
# Adds an AverageUSDPrize column to gamejson and keeps the ten highest rows.
gamejson['AverageUSDPrize'] = gamejson['TotalUSDPrize'].div(gamejson['TotalTournaments'])
games_by_average_prize = gamejson.sort_values('AverageUSDPrize', ascending=False)
gameavgsorted = games_by_average_prize.iloc[:10]

In [None]:
# Bar chart of average winnings per event for the ten best-paying games.
gameavgsorted.iplot(
    kind='bar',
    x='GameName',
    y='AverageUSDPrize',
    xTitle='Game Name',
    yTitle='Average Winnings per Event',
    color='blue',
)

In [None]:
# Games with the largest professional player bases (ten biggest, largest first).
games_by_player_count = gamejson.sort_values('TotalPlayers', ascending=False)
gamejsonplayer = games_by_player_count.iloc[:10]
gamejsonplayer.iplot(
    kind='bar',
    x='GameName',
    y='TotalPlayers',
    xTitle='Game Name',
    yTitle='Number of Players',
    title='Games with Biggest Professional Player Base',
    color='red',
)

In [None]:
"""
Richie:
Replace whole for loop with tidjson['StartDate'].apply(funname)

Functional style of programming is (usually) easier to read and allows for 
    SIMD (single instruction, multiple data) operations that can be (theoretically)
    run on GPU or other parallelized hardware. It's also just 5x faster than looping by itself.
    
There might also be a pandas/numpy built-in for SIMD date manipulation, but I'll leave that googling to you
"""

# Bucket every tournament by the year and month of its start date ('YYYY-MM').
# Done as one vectorised column operation: the earlier element-wise loop wrote
# through chained indexing (`tidjson['FDates'][i] = ...`) using *labels* while
# reading with positional `.iloc[i]`, so rows were misaligned whenever the
# index had gaps (as it does after drop_duplicates).
tidjson['FDates'] = pd.to_datetime(tidjson['StartDate']).dt.strftime('%Y-%m')
tidjson['FDates']

In [None]:
# Total prize money per 'YYYY-MM' bucket, ordered by bucket.
timeseriesdf = tidjson[['FDates', 'TotalUSDPrize']].sort_values('FDates')
grpdf = timeseriesdf.groupby('FDates', as_index=False).agg({'TotalUSDPrize': 'sum'})

px.scatter(grpdf, x="FDates", y="TotalUSDPrize")
# Options tried but left disabled: animation_frame="FDates", log_x=True

In [None]:
# Cumulative histogram built with plotly graph_objects (go), not plotly express.
cumulative_trace = go.Histogram(
    x=grpdf['FDates'],
    y=grpdf['TotalUSDPrize'],
    histfunc="sum",
    cumulative_enabled=True,
)

# Callout anchored at the last date bucket and the overall prize total.
callout = go.layout.Annotation(
    x=grpdf['FDates'].max(),
    y=grpdf['TotalUSDPrize'].sum().max(),
    xref="x",
    yref="y",
    text='ESports is rapidly approaching <br>>$1B USD paid out in winnings alone',
    align="left",
    font=dict(size=13, color='red'),
)

fig = go.Figure()
fig.add_trace(cumulative_trace)
fig.add_annotation(callout)
fig.update_layout(
    title_text='Cumulative Total of ESports Prize Money by Year',
    xaxis_title_text="Year of Tournament",
    yaxis_title_text="Total Prize Money in USD",
)
fig.show()

In [None]:
# This cell calls the API's highest-earning-players endpoint.
# this grabs top 100 earners
import os

bigmoney = pd.DataFrame(columns = ['PlayerId', 'NameFirst', 'NameLast' , 'CurrentHandle', 'CountryCode', 'TotalUSDPrize'])

# The key comes from the environment (set ESPORTS_API_KEY) rather than being
# embedded in the notebook. Passing it via `params=` also fixes the previous
# URL, where the backslash continuation inserted spaces before `apikey`.
response = requests.get(
    "https://api.esportsearnings.com/v0/LookupHighestEarningPlayers",
    params={"apikey": os.environ["ESPORTS_API_KEY"]},
    # NOTE(review): certificate verification stays disabled as before; enable
    # it once the endpoint's certificate is confirmed valid.
    verify=False,
    timeout=30,
)
response.raise_for_status()
bigmoneydata = response.json()
time.sleep(1)  # API restriction - calls must be 1 second apart

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
bigmoney = pd.concat([bigmoney, pd.DataFrame(bigmoneydata)], ignore_index=True, sort=True)

In [None]:
bigmoney.head(15)

In [None]:
# Career winnings of the first 50 players in the earnings table.
top_fifty_earners = bigmoney.head(50)
top_fifty_earners.iplot(
    kind='bar',
    x='CurrentHandle',
    y='TotalUSDPrize',
    xTitle='Competitor Handle',
    yTitle='Total Career Winnings',
)

In [None]:
# Horizontal bar chart for the first three players in the earnings table.
top_three_earners = bigmoney.head(3)
top_three_earners.iplot(
    kind='barh',
    x='CurrentHandle',
    y='TotalUSDPrize',
    xTitle='Total Career Winnings',
    yTitle='Competitor Handle',
    title='The Highest Tournament Winning Earners in ESports',
    color='green',
)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [None]:
#throwing process into func in case we want to use different columns/data
"""
Richie: this works for lab code, but I'd recommend wrapping this in
    a class (or just use a param) to achieve the above goal.
"""
def plot_20_most_common_words(count_data, count_vectorizer):
    """Plot a bar chart of the 20 most frequent vocabulary words.

    Args:
        count_data: Sparse document-term count matrix, as returned by
            ``count_vectorizer.fit_transform`` (one row per document, one
            column per vocabulary word).
        count_vectorizer: The fitted vectorizer that produced ``count_data``;
            it supplies the word belonging to each column.

    Returns:
        None. The chart is rendered with ``matplotlib.pyplot.show()``.
    """
    import matplotlib.pyplot as plt

    # scikit-learn 1.2 removed get_feature_names(); prefer its replacement but
    # keep working on older releases that only have the old name.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # One column-wise sum instead of densifying the matrix row by row.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    # sorted() is stable, so words with equal counts keep vocabulary order.
    top_words = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[:20]
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    x_pos = np.arange(len(words))

    plt.bar(x_pos, counts, align='center')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('Word')
    plt.ylabel('Count')
    plt.title('20 Most Common Words in Tournament Names')
    plt.show()


In [None]:
# Bag-of-words counts over the tournament names, ignoring English stop words,
# followed by a chart of the most common words.
count_vectorizer = CountVectorizer(stop_words='english')
tournament_names = tidjson['TournamentName']
count_data = count_vectorizer.fit_transform(tournament_names)
plot_20_most_common_words(count_data=count_data, count_vectorizer=count_vectorizer)

In [None]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
def print_names(model, count_vectorizer, n_top_words):
    """Print each topic's highest-weighted words as a made-up tournament name.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            array of per-word weights per topic.
        count_vectorizer: Fitted vectorizer whose vocabulary is indexed by the
            positions in each ``components_`` row.
        n_top_words: How many of the highest-weighted words to print per topic.

    Returns:
        None. One heading line and one line of words are printed per topic.
    """
    # scikit-learn 1.2 removed get_feature_names(); prefer its replacement but
    # keep working on older releases that only have the old name.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to take the largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("\nImaginary Tournament Name #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))
        
number_topics = 10
number_words = 5

# Build the LDA model and fit it on the word counts (fit() returns the model).
lda = LDA(n_components=number_topics).fit(count_data)

# Show the top words of every topic the model found.
print("Generated Tournament Names Made by LDA:")
print_names(model=lda, count_vectorizer=count_vectorizer, n_top_words=number_words)

### Wow, none of these make any sense