# This file is used to scrape for wrestler data.

In [2]:
import sys
!{sys.executable} -m pip install lxml 
!{sys.executable} -m pip install html5lib  



In [3]:
import matplotlib as mpltlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime


import requests, re, json
from bs4 import BeautifulSoup
import math

import rank_helper
import kimarite_helper

In [4]:
def get_df_by_id(id):
    
    URL = f"http://sumodb.sumogames.de/Rikishi.aspx?r={id}"
    basepage = requests.get(URL)
    soup = BeautifulSoup(basepage.content, 'html.parser')
    
    try:
        highest = soup.find('table', class_='rikishidata')
        highest = highest.find('table', class_='rikishidata')#.find('tr').findAll('td')[1].contents[0]
        highest = highest.find('tr').findAll('td')[1].contents[0]
        highest = str(highest.split()[0]).replace('-', '')
    except:
        print(f"{id} skipped prelim")
        return
    
    
    if not rank_helper.rank_is_top(highest, True):
        print(f"{id} rejected")
        return 


    get_data = soup.find_all('tr')


    # texts that we have to convert, since it's an image in the table
    txt1 = '<img border="0" src="img/hoshi_shiro.gif"/>'     # win, white circle
    txt2 = '<img border="0" src="img/hoshi_yasumi.gif"/>'    # withdrawal, dash
    txt3 = '<img border="0" src="img/hoshi_kuro.gif"/>'      # loss, black circle
    txt4 = '<img border="0" src="img/hoshi_fusenpai.gif"/>'  # ff, black square
    txt5 = '<img border="0" src="img/hoshi_fusensho.gif"/>' #hoshi_fusensho # withdrawal, white square


    # convert the images to W(win) F(forfeit) L(loss)
    edited = str(get_data).replace(txt1, 'W')
    edited = edited.replace(txt2, 'F')
    edited = edited.replace(txt3, 'L')
    edited = edited.replace(txt4, 'A') # absence 
    edited = edited.replace(txt5, 'W') # enemy ff counts as win


    # put the record table into a dataframe
    test = pd.read_html(edited)
    df = test[2]

    # rename columns
    df.columns = ['Date','Rank', 'Series','W_L','Yusho_standing', 'Height/weight']
    #df['Series'] = df['Series'].replace(np.nan,"")
    # if there is no series, there is no relevant data here most likely (record too old, maezumo)
    df.dropna(subset = ["Series"], inplace=True)

    # drop any possible name banner columns
    df = df[df.W_L.apply(lambda x: '-' in x)]

    # convert dates to string, then datetime

    #df[:, 'Date'] = df["Date"].astype(str)
    df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')

    # we don't really care about yusho standing and height/weight. drop them
    # df['Yusho_standing'] = df['Yusho_standing'].replace(np.nan,"")
    df = df.drop(columns=['Yusho_standing', 'Height/weight'])



    # calculate streaks
    # divide games into 5 sections, if any of the sectios contains W, then
    #series = df['Series'].values

    forfeit_list = []
    win_list = []
    lose_list = []
    ab_list = []
    streak_list = []

    for index, row in df.iterrows():
        f_count = 0
        w_count = 0
        l_count = 0
        a_count = 0
        row_series = row['Series']


        # get W/L/F/A by counting
        for s in row_series:
            if(s == 'F'):
                f_count += 1 
            if(s == 'W'):
                w_count += 1
            if(s == 'L'):
                l_count += 1
            if(s == 'A'):
                a_count += 1

        forfeit_list.append(f_count)
        win_list.append(w_count)
        lose_list.append(l_count)
        ab_list.append(a_count)

        # game is a streak when you win 4 or more games in a row
        streak_list.append(sum(1 for x in re.finditer(r'W{4,}', row['Series'])))






    # make new columns
    df['Win'] = win_list
    df['Loss'] = lose_list
    df['Forfeit'] = forfeit_list
    df['Absence'] = ab_list



    # calculate win rate: (W/# games played), ignoring the games forfeited
    df['Winrate'] = (df['Win']/(15 - (df['Forfeit'] + df['Absence'])))*100
    # remove nans
    df['Winrate'] = df['Winrate'].replace(np.nan,0)
    # round the numbers to the 2 decimal place
    df['Winrate'] = df['Winrate'].round(2)


    # change streak column to Y or N
    # might be a good move to limit the winrate here as well?
    change = []
    for s in streak_list:
        if(s > 0):
            change.append('Y')
        else:
            change.append('N')


    df['Streak(Wins)'] = change
    df.insert(0, 'id', id)
        

    

    # only get the top division tournaments
    return df
        
        

In [7]:
df = get_df_by_id(2933)#.append(get_df_by_id(1), ignore_index=True) 
df.shape

(0, 11)

In [37]:
x = df.loc[df['Streak(Wins)'] == 'Y']
x.head()

mean_wr = x['Winrate'].mean()
mean_wr

# average win rate is 65.91%



test_mean = df['Streak(Wins)'].value_counts().Y
test_mean

# 53 tournaments are streaky



streaky_games = test_mean / len(df)
streaky_games

# 50.48% of the tournaments are streaky, which is good
# but we might have to consider analyzing amount of forfeit/absence numbers, 
# as it could be an interaction term for thrusters vs streakiness

0.6103896103896104

<br>

# DO NOT RUN THIS UNLESS YOU ARE SCRAPING!

In [16]:
# up to 12670
# 2933 empty test case

# for problematic pages
e = []

# go through 0, 13 for everyone
for batch in range(12, 13):
    
    aggregate_df = pd.DataFrame()
    
    for i in range(1 + 1000 * batch, 1000 * (batch+1)):
        print(f"Processing {i}...")
        next = None
        try:
            next = get_df_by_id(i)
        except:
            e.append(i)
        aggregate_df = aggregate_df.append(next, ignore_index=True)

        #df = df[df.Rank.apply(lambda x: rank_helper.rank_is_top(x))]
    
    aggregate_df.to_csv(f'tournaments_all_{batch}.csv', index = False)



Processing 12001...
12001 rejected
Processing 12002...
12002 rejected
Processing 12003...
12003 rejected
Processing 12004...
12004 rejected
Processing 12005...
12005 rejected
Processing 12006...
12006 rejected
Processing 12007...
12007 rejected
Processing 12008...
12008 rejected
Processing 12009...
12009 rejected
Processing 12010...
12010 rejected
Processing 12011...
12011 rejected
Processing 12012...
12012 rejected
Processing 12013...
12013 rejected
Processing 12014...
12014 rejected
Processing 12015...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12016...
12016 rejected
Processing 12017...
12017 rejected
Processing 12018...
12018 rejected
Processing 12019...
12019 rejected
Processing 12020...
12020 rejected
Processing 12021...
12021 rejected
Processing 12022...
12022 rejected
Processing 12023...
12023 rejected
Processing 12024...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12025...
12025 rejected
Processing 12026...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12027...
12027 rejected
Processing 12028...
12028 rejected
Processing 12029...
12029 rejected
Processing 12030...
12030 rejected
Processing 12031...
12031 rejected
Processing 12032...
12032 rejected
Processing 12033...
12033 rejected
Processing 12034...
12034 rejected
Processing 12035...
12035 rejected
Processing 12036...
12036 rejected
Processing 12037...
12037 rejected
Processing 12038...
12038 rejected
Processing 12039...
12039 rejected
Processing 12040...
Processing 12041...
12041 rejected
Processing 12042...
12042 rejected
Processing 12043...
Processing 12044...
12044 rejected
Processing 12045...
12045 rejected
Processing 12046...
12046 rejected
Processing 12047...
12047 rejected
Processing 12048...
12048 rejected
Processing 12049...
12049 rejected
Processing 12050...
12050 rejected
Processing 12051...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12052...
12052 rejected
Processing 12053...
12053 rejected
Processing 12054...
12054 rejected
Processing 12055...
Processing 12056...
12056 rejected
Processing 12057...
12057 rejected
Processing 12058...
12058 rejected
Processing 12059...
12059 rejected
Processing 12060...
12060 rejected
Processing 12061...
12061 rejected
Processing 12062...
12062 rejected
Processing 12063...
12063 rejected
Processing 12064...
12064 rejected
Processing 12065...
12065 rejected
Processing 12066...
12066 rejected
Processing 12067...
12067 rejected
Processing 12068...
12068 rejected
Processing 12069...
12069 rejected
Processing 12070...
12070 rejected
Processing 12071...
12071 rejected
Processing 12072...
12072 rejected
Processing 12073...
12073 rejected
Processing 12074...
12074 rejected
Processing 12075...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12076...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12077...
Processing 12078...
12078 rejected
Processing 12079...
12079 rejected
Processing 12080...
12080 rejected
Processing 12081...
12081 rejected
Processing 12082...
12082 rejected
Processing 12083...
12083 rejected
Processing 12084...
12084 rejected
Processing 12085...
12085 rejected
Processing 12086...
12086 rejected
Processing 12087...
12087 rejected
Processing 12088...
12088 rejected
Processing 12089...
12089 rejected
Processing 12090...
12090 rejected
Processing 12091...
12091 rejected
Processing 12092...
12092 rejected
Processing 12093...
12093 rejected
Processing 12094...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12095...
12095 rejected
Processing 12096...
12096 rejected
Processing 12097...
12097 rejected
Processing 12098...
12098 rejected
Processing 12099...
12099 rejected
Processing 12100...
12100 rejected
Processing 12101...
12101 rejected
Processing 12102...
12102 rejected
Processing 12103...
12103 rejected
Processing 12104...
12104 rejected
Processing 12105...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12106...
12106 rejected
Processing 12107...
Processing 12108...
12108 rejected
Processing 12109...
12109 rejected
Processing 12110...
12110 rejected
Processing 12111...
12111 rejected
Processing 12112...
12112 rejected
Processing 12113...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12114...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12115...
12115 rejected
Processing 12116...
12116 rejected
Processing 12117...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12118...
12118 rejected
Processing 12119...
12119 rejected
Processing 12120...
12120 rejected
Processing 12121...
12121 rejected
Processing 12122...
12122 rejected
Processing 12123...
12123 rejected
Processing 12124...
12124 rejected
Processing 12125...
12125 rejected
Processing 12126...
12126 rejected
Processing 12127...
12127 rejected
Processing 12128...
12128 rejected
Processing 12129...
12129 rejected
Processing 12130...
Processing 12131...
12131 rejected
Processing 12132...
12132 rejected
Processing 12133...
12133 rejected
Processing 12134...
12134 rejected
Processing 12135...
12135 rejected
Processing 12136...
12136 rejected
Processing 12137...
12137 rejected
Processing 12138...
12138 rejected
Processing 12139...
12139 rejected
Processing 12140...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12141...
12141 rejected
Processing 12142...
12142 rejected
Processing 12143...
12143 rejected
Processing 12144...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12145...
12145 rejected
Processing 12146...
12146 rejected
Processing 12147...
12147 rejected
Processing 12148...
12148 rejected
Processing 12149...
12149 rejected
Processing 12150...
12150 rejected
Processing 12151...
12151 rejected
Processing 12152...
12152 rejected
Processing 12153...
12153 rejected
Processing 12154...
12154 rejected
Processing 12155...
12155 rejected
Processing 12156...
12156 rejected
Processing 12157...
12157 rejected
Processing 12158...
12158 rejected
Processing 12159...
12159 rejected
Processing 12160...
12160 rejected
Processing 12161...
12161 rejected
Processing 12162...
12162 rejected
Processing 12163...
12163 rejected
Processing 12164...
12164 rejected
Processing 12165...
12165 rejected
Processing 12166...
12166 rejected
Processing 12167...
12167 rejected
Processing 12168...
12168 rejected
Processing 12169...
12169 rejected
Processing 12170...
12170 rejected
Processing 12171...
12171 rejected
Processing 12172...
12172 rejected
Processing 12173...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12192...
12192 rejected
Processing 12193...
12193 rejected
Processing 12194...
12194 rejected
Processing 12195...
12195 rejected
Processing 12196...
12196 rejected
Processing 12197...
12197 rejected
Processing 12198...
12198 rejected
Processing 12199...
12199 rejected
Processing 12200...
12200 rejected
Processing 12201...
12201 rejected
Processing 12202...
12202 rejected
Processing 12203...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12204...
12204 rejected
Processing 12205...
12205 rejected
Processing 12206...
12206 rejected
Processing 12207...
12207 rejected
Processing 12208...
12208 rejected
Processing 12209...
12209 rejected
Processing 12210...
Processing 12211...
12211 rejected
Processing 12212...
12212 rejected
Processing 12213...
12213 rejected
Processing 12214...
12214 rejected
Processing 12215...
12215 rejected
Processing 12216...
12216 rejected
Processing 12217...
12217 rejected
Processing 12218...
12218 rejected
Processing 12219...
12219 rejected
Processing 12220...
12220 rejected
Processing 12221...
12221 rejected
Processing 12222...
12222 rejected
Processing 12223...
12223 rejected
Processing 12224...
12224 rejected
Processing 12225...
12225 rejected
Processing 12226...
Processing 12227...
12227 rejected
Processing 12228...
12228 rejected
Processing 12229...
12229 rejected
Processing 12230...
12230 rejected
Processing 12231...
Processing 12232...
12232 rejected
Processing 12233...
12233 reje

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12240...
12240 rejected
Processing 12241...
12241 rejected
Processing 12242...
12242 rejected
Processing 12243...
12243 rejected
Processing 12244...
12244 rejected
Processing 12245...
12245 rejected
Processing 12246...
12246 rejected
Processing 12247...
12247 rejected
Processing 12248...
12248 rejected
Processing 12249...
12249 rejected
Processing 12250...
12250 rejected
Processing 12251...
12251 rejected
Processing 12252...
12252 rejected
Processing 12253...
12253 rejected
Processing 12254...
12254 rejected
Processing 12255...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12256...
12256 rejected
Processing 12257...
12257 rejected
Processing 12258...
12258 rejected
Processing 12259...
12259 rejected
Processing 12260...
12260 rejected
Processing 12261...
12261 rejected
Processing 12262...
12262 rejected
Processing 12263...
12263 rejected
Processing 12264...
12264 rejected
Processing 12265...
12265 rejected
Processing 12266...
12266 rejected
Processing 12267...
12267 rejected
Processing 12268...
12268 rejected
Processing 12269...
12269 rejected
Processing 12270...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12271...
12271 rejected
Processing 12272...
12272 rejected
Processing 12273...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12274...
12274 rejected
Processing 12275...
12275 rejected
Processing 12276...
12276 rejected
Processing 12277...
12277 rejected
Processing 12278...
12278 rejected
Processing 12279...
12279 rejected
Processing 12280...
12280 rejected
Processing 12281...
12281 rejected
Processing 12282...
12282 rejected
Processing 12283...
12283 rejected
Processing 12284...
12284 rejected
Processing 12285...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12286...
12286 rejected
Processing 12287...
12287 rejected
Processing 12288...
12288 rejected
Processing 12289...
12289 rejected
Processing 12290...
12290 rejected
Processing 12291...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12292...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12293...
12293 rejected
Processing 12294...
12294 rejected
Processing 12295...
12295 rejected
Processing 12296...
12296 rejected
Processing 12297...
12297 rejected
Processing 12298...
12298 rejected
Processing 12299...
12299 rejected
Processing 12300...
12300 rejected
Processing 12301...
12301 rejected
Processing 12302...
12302 rejected
Processing 12303...
12303 rejected
Processing 12304...
12304 rejected
Processing 12305...
12305 rejected
Processing 12306...
12306 rejected
Processing 12307...
12307 rejected
Processing 12308...
12308 rejected
Processing 12309...
12309 rejected
Processing 12310...
12310 rejected
Processing 12311...
12311 rejected
Processing 12312...
12312 rejected
Processing 12313...
12313 rejected
Processing 12314...
12314 rejected
Processing 12315...
12315 rejected
Processing 12316...
12316 rejected
Processing 12317...
12317 rejected
Processing 12318...
12318 rejected
Processing 12319...
12319 rejected
Processing 12320...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12321...
12321 rejected
Processing 12322...
12322 rejected
Processing 12323...
12323 rejected
Processing 12324...
12324 rejected
Processing 12325...
12325 rejected
Processing 12326...
12326 skipped prelim
Processing 12327...
12327 skipped prelim
Processing 12328...
12328 skipped prelim
Processing 12329...
12329 skipped prelim
Processing 12330...
12330 skipped prelim
Processing 12331...
12331 skipped prelim
Processing 12332...
12332 rejected
Processing 12333...
12333 rejected
Processing 12334...
12334 rejected
Processing 12335...
12335 rejected
Processing 12336...
12336 rejected
Processing 12337...
12337 rejected
Processing 12338...
12338 rejected
Processing 12339...
12339 rejected
Processing 12340...
12340 rejected
Processing 12341...
12341 rejected
Processing 12342...
12342 rejected
Processing 12343...
12343 rejected
Processing 12344...
12344 rejected
Processing 12345...
12345 rejected
Processing 12346...
12346 rejected
Processing 12347...
12347 rejected
Processing 12348...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12352...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12353...
12353 rejected
Processing 12354...
12354 rejected
Processing 12355...
12355 rejected
Processing 12356...
12356 rejected
Processing 12357...
12357 rejected
Processing 12358...
12358 rejected
Processing 12359...
12359 rejected
Processing 12360...
12360 rejected
Processing 12361...
12361 rejected
Processing 12362...
Processing 12363...
12363 rejected
Processing 12364...
12364 rejected
Processing 12365...
12365 rejected
Processing 12366...
12366 rejected
Processing 12367...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12368...
12368 rejected
Processing 12369...
12369 rejected
Processing 12370...
Processing 12371...
12371 rejected
Processing 12372...
12372 rejected
Processing 12373...
12373 rejected
Processing 12374...
12374 rejected
Processing 12375...
12375 rejected
Processing 12376...
12376 rejected
Processing 12377...
12377 rejected
Processing 12378...
12378 rejected
Processing 12379...
12379 rejected
Processing 12380...
12380 rejected
Processing 12381...
12381 rejected
Processing 12382...
12382 rejected
Processing 12383...
12383 rejected
Processing 12384...
12384 rejected
Processing 12385...
12385 rejected
Processing 12386...
12386 rejected
Processing 12387...
12387 rejected
Processing 12388...
12388 rejected
Processing 12389...
12389 rejected
Processing 12390...
12390 rejected
Processing 12391...
12391 rejected
Processing 12392...
12392 rejected
Processing 12393...
12393 rejected
Processing 12394...
12394 rejected
Processing 12395...
12395 rejected
Processing 12396...
12396 rejected


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12413...
12413 rejected
Processing 12414...
12414 rejected
Processing 12415...
12415 rejected
Processing 12416...
12416 rejected
Processing 12417...
12417 rejected
Processing 12418...
12418 rejected
Processing 12419...
12419 rejected
Processing 12420...
12420 rejected
Processing 12421...
12421 rejected
Processing 12422...
12422 rejected
Processing 12423...
12423 rejected
Processing 12424...
12424 rejected
Processing 12425...
Processing 12426...
12426 rejected
Processing 12427...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12428...
12428 rejected
Processing 12429...
12429 rejected
Processing 12430...
12430 rejected
Processing 12431...
12431 rejected
Processing 12432...
12432 rejected
Processing 12433...
12433 rejected
Processing 12434...
12434 rejected
Processing 12435...
12435 rejected
Processing 12436...
12436 rejected
Processing 12437...
12437 rejected
Processing 12438...
12438 rejected
Processing 12439...
12439 rejected
Processing 12440...
12440 rejected
Processing 12441...
12441 rejected
Processing 12442...
12442 rejected
Processing 12443...
12443 rejected
Processing 12444...
12444 rejected
Processing 12445...
12445 rejected
Processing 12446...
12446 rejected
Processing 12447...
12447 rejected
Processing 12448...
12448 rejected
Processing 12449...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12450...
12450 rejected
Processing 12451...
Processing 12452...
12452 rejected
Processing 12453...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12454...
12454 rejected
Processing 12455...
12455 rejected
Processing 12456...
12456 rejected
Processing 12457...
12457 rejected
Processing 12458...
12458 rejected
Processing 12459...
12459 rejected
Processing 12460...
12460 rejected
Processing 12461...
12461 rejected
Processing 12462...
12462 rejected
Processing 12463...
12463 rejected
Processing 12464...
12464 rejected
Processing 12465...
12465 rejected
Processing 12466...
12466 rejected
Processing 12467...
12467 rejected
Processing 12468...
12468 rejected
Processing 12469...
12469 rejected
Processing 12470...
Processing 12471...
12471 rejected
Processing 12472...
12472 rejected
Processing 12473...
12473 rejected
Processing 12474...
12474 rejected
Processing 12475...
12475 rejected
Processing 12476...
12476 rejected
Processing 12477...
12477 rejected
Processing 12478...
12478 rejected
Processing 12479...
12479 rejected
Processing 12480...
12480 rejected
Processing 12481...
12481 rejected
Processing 12482...
12482 rejected


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'].astype(str), format='%Y.%m')


Processing 12576...
12576 rejected
Processing 12577...
12577 rejected
Processing 12578...
12578 rejected
Processing 12579...
12579 rejected
Processing 12580...
12580 rejected
Processing 12581...
12581 rejected
Processing 12582...
12582 rejected
Processing 12583...
12583 rejected
Processing 12584...
12584 rejected
Processing 12585...
12585 rejected
Processing 12586...
12586 rejected
Processing 12587...
12587 rejected
Processing 12588...
12588 rejected
Processing 12589...
12589 rejected
Processing 12590...
12590 rejected
Processing 12591...
12591 rejected
Processing 12592...
12592 rejected
Processing 12593...
12593 rejected
Processing 12594...
12594 rejected
Processing 12595...
12595 rejected
Processing 12596...
12596 rejected
Processing 12597...
12597 rejected
Processing 12598...
12598 rejected
Processing 12599...
12599 rejected
Processing 12600...
12600 rejected
Processing 12601...
12601 rejected
Processing 12602...
12602 rejected
Processing 12603...
12603 rejected
Processing 12604...


12795 skipped prelim
Processing 12796...
12796 skipped prelim
Processing 12797...
12797 skipped prelim
Processing 12798...
12798 skipped prelim
Processing 12799...
12799 skipped prelim
Processing 12800...
12800 skipped prelim
Processing 12801...
12801 skipped prelim
Processing 12802...
12802 skipped prelim
Processing 12803...
12803 skipped prelim
Processing 12804...
12804 skipped prelim
Processing 12805...
12805 skipped prelim
Processing 12806...
12806 skipped prelim
Processing 12807...
12807 skipped prelim
Processing 12808...
12808 skipped prelim
Processing 12809...
12809 skipped prelim
Processing 12810...
12810 skipped prelim
Processing 12811...
12811 skipped prelim
Processing 12812...
12812 skipped prelim
Processing 12813...
12813 skipped prelim
Processing 12814...
12814 skipped prelim
Processing 12815...
12815 skipped prelim
Processing 12816...
12816 skipped prelim
Processing 12817...
12817 skipped prelim
Processing 12818...
12818 skipped prelim
Processing 12819...
12819 skipped pr

12995 skipped prelim
Processing 12996...
12996 skipped prelim
Processing 12997...
12997 skipped prelim
Processing 12998...
12998 skipped prelim
Processing 12999...
12999 skipped prelim


In [17]:
aggregate_df 

Unnamed: 0,id,Date,Rank,Series,W_L,Win,Loss,Forfeit,Absence,Winrate,Streak(Wins)
0,12015,2012-05-01,Jk8e,FWLFFLLFWFWFWFF,4-3,4,3,8,0,57.14,N
1,12015,2012-07-01,Jd76e,WFFWFWWFFLWFFWF,6-1,6,1,8,0,85.71,N
2,12015,2012-09-01,Jd5w,FLFWFWFWLFWFFLF,4-3,4,3,8,0,57.14,N
3,12015,2012-11-01,Sd86w,FFFFFFFFFFFFFFF,0-0-7,0,0,15,0,0.00,N
4,12015,2013-01-01,Jd47e,LFWFFWWFLFFWFWF,5-2,5,2,8,0,71.43,N
...,...,...,...,...,...,...,...,...,...,...,...
1586,12575,2020-07-01,Ms10w,WFLFFWFWWFWFFFW,6-1,6,1,8,0,85.71,N
1587,12575,2020-09-01,Ms2w,FFFFFFFFFFFFFFF,0-0-7,0,0,15,0,0.00,N
1588,12575,2020-11-01,Ms2w,FLWFWFWFLFLFWFF,4-3,4,3,8,0,57.14,N
1589,12575,2021-01-01,J14w,LWWLLWWLLWWLWWL,8-7,8,7,0,0,53.33,N


In [18]:
# if scraping occured, merge all files
aggregate_df = pd.DataFrame()
for i in range(0, 13):
    if i != 10: # there is nothing in this file.
        df = pd.read_csv(f'tournaments_all_{i}.csv')
        aggregate_df = aggregate_df.append(df)
        
#aggregate_df = aggregate_df.groupby('id').filter(lambda x : len(x) > 5)
aggregate_df.to_csv(f'tournaments_all.csv', index = False)

aggregate_df.shape

(67350, 11)

In [23]:
df1 = pd.read_csv(f'tournaments_all.csv')
df2 = pd.read_csv(f'tournaments_all_copy.csv')

df = df1.append(df2)
df = df.drop_duplicates()
df.to_csv(f'tournaments_all_final.csv', index = False)


***

In [28]:
# load the saved tournament df
df = pd.read_csv(f'tournaments_all_final.csv')
print(df.shape)


# example of selecting only the top two divisions
df = df[df.Rank.apply(lambda x: rank_helper.rank_is_top(x))]
print(df.shape)

(67351, 11)
(34995, 11)


In [29]:
# get the push score (m), push win% (p), and number of wins (n) of a given wrestler by their ID.
def get_m_by_id(id):
    
    kimarite_URL = f"http://sumodb.sumogames.de/Rikishi_kim.aspx?r={id}"
    print(f"processing {id}...")
    basepage = requests.get(kimarite_URL)
    soup = BeautifulSoup(basepage.content, 'html.parser')

    # get winning data table 
    kimarite_wins = soup.find('table', class_="ro_left")

    # this gets all winning moves in like [<tr><td class="rb_kim">oshidashi</td><td class="right"><a href="#0oshidashi">143</a></td></tr>, ...]
    cells = kimarite_wins.find_all('tr')

    kimarite_labels, kimarite_amts = [], []

    for cell in cells:
        win_name_tag, win_amt_tag = cell.find_all('td')
        kimarite_labels.append(win_name_tag.text.strip())
        kimarite_amts.append(int(win_amt_tag.text.strip()))


    # at this point we have all kimarite. reduce it so that only moves with >= 5% significance are displayed
    up_to_index = 0
    total_wins = sum(kimarite_amts)
    for i in range(0, len(kimarite_amts)):
        amt = kimarite_amts[i]
        # we find the most significant kimarite under 10%
        if amt / total_wins < 0.05:
            up_to_index = i
            break

    # we now slice off everything up to and past this point, label it as 'other'
    kimarite_labels = kimarite_labels[:up_to_index] + ["other"]
    kimarite_amts = kimarite_amts[:up_to_index] + [sum(kimarite_amts[up_to_index:])]

    #print(kimarite_labels)
    #print(kimarite_amts)

    #print((shikona, kimarite_amts, kimarite_labels))
    
    
    n = sum(kimarite_amts)
    if n == 0:
        return (None, None, None)
        
    w = 0 
    for i in range(0, len(kimarite_amts)):
        w += kimarite_amts[i] if kimarite_helper.is_oshi(kimarite_labels[i]) else 0
    
    p = w / n
    m = (0.5 - p)  * math.log10(n)
    return (m, p, n)



In [30]:
# filter out the wrestlers with less than five tournaments
#df = df.groupby('id').filter(lambda x : len(x) > 5)

stats_df = pd.DataFrame(columns = ["id", "m", "p", "n"])
d = []

for i in df.id.unique():
    
    m, p, n = get_m_by_id(i)
    if m:
        stats_df = stats_df.append({'id': i, 'm': m, 'p': p, 'n': n}, ignore_index=True)
    else:
        d.append(i)


stats_df

processing 1...
processing 2...
processing 3...
processing 4...
processing 5...
processing 6...
processing 7...
processing 8...
processing 9...
processing 10...
processing 11...
processing 12...
processing 13...
processing 14...
processing 15...
processing 16...
processing 17...
processing 18...
processing 19...
processing 20...
processing 21...
processing 22...
processing 23...
processing 24...
processing 25...
processing 26...
processing 27...
processing 28...
processing 29...
processing 30...
processing 31...
processing 32...
processing 33...
processing 34...
processing 35...
processing 36...
processing 37...
processing 38...
processing 39...
processing 40...
processing 41...
processing 42...
processing 43...
processing 44...
processing 45...
processing 46...
processing 47...
processing 48...
processing 49...
processing 50...
processing 51...
processing 52...
processing 53...
processing 54...
processing 55...
processing 56...
processing 57...
processing 58...
processing 59...
proces

processing 3690...
processing 3691...
processing 3692...
processing 3693...
processing 3694...
processing 3695...
processing 3696...
processing 3697...
processing 3698...
processing 3699...
processing 3700...
processing 3701...
processing 3702...
processing 3703...
processing 3704...
processing 3705...
processing 3706...
processing 3707...
processing 3708...
processing 3709...
processing 3710...
processing 3711...
processing 3712...
processing 3713...
processing 3714...
processing 3715...
processing 3716...
processing 3717...
processing 3718...
processing 3719...
processing 3720...
processing 3721...
processing 3722...
processing 3723...
processing 3724...
processing 3725...
processing 3726...
processing 3727...
processing 3728...
processing 3729...
processing 3730...
processing 3731...
processing 3732...
processing 3733...
processing 3734...
processing 3735...
processing 3736...
processing 3737...
processing 3738...
processing 3739...
processing 3740...
processing 3741...
processing 3

processing 4123...
processing 4124...
processing 4125...
processing 4126...
processing 4127...
processing 4128...
processing 4129...
processing 4130...
processing 4131...
processing 4132...
processing 4133...
processing 4134...
processing 4135...
processing 4136...
processing 4137...
processing 4138...
processing 4139...
processing 4140...
processing 4141...
processing 4142...
processing 4143...
processing 4144...
processing 4145...
processing 4146...
processing 4147...
processing 4148...
processing 4149...
processing 4150...
processing 4151...
processing 4152...
processing 4153...
processing 4154...
processing 4155...
processing 4156...
processing 4157...
processing 4158...
processing 4159...
processing 4160...
processing 4161...
processing 4162...
processing 4163...
processing 4164...
processing 4165...
processing 4166...
processing 4167...
processing 4168...
processing 4169...
processing 4170...
processing 4171...
processing 4172...
processing 4173...
processing 4174...
processing 4

processing 6410...
processing 6411...
processing 6419...
processing 6422...
processing 6427...
processing 6429...
processing 6432...
processing 6463...
processing 6467...
processing 6468...
processing 6473...
processing 6480...
processing 6491...
processing 6497...
processing 6512...
processing 6527...
processing 6531...
processing 6552...
processing 6559...
processing 6572...
processing 6594...
processing 6596...
processing 6599...
processing 6600...
processing 6614...
processing 6620...
processing 6622...
processing 6642...
processing 6654...
processing 6753...
processing 6766...
processing 6767...
processing 6769...
processing 6770...
processing 6771...
processing 6928...
processing 6931...
processing 6977...
processing 6983...
processing 7109...
processing 7111...
processing 7115...
processing 7121...
processing 7125...
processing 7127...
processing 7138...
processing 7143...
processing 7153...
processing 7239...
processing 7240...
processing 8154...
processing 8899...
processing 8

Unnamed: 0,id,m,p,n
0,1.0,0.027626,0.490066,604.0
1,2.0,1.220098,0.075798,752.0
2,3.0,0.536245,0.302682,522.0
3,4.0,0.160073,0.444142,734.0
4,5.0,1.253818,0.058140,688.0
...,...,...,...,...
1075,12449.0,-0.099929,0.548673,113.0
1076,12451.0,0.707805,0.156522,115.0
1077,12453.0,-0.389437,0.702381,84.0
1078,12470.0,-0.700176,0.865854,82.0


In [32]:
stats_df.to_csv(f'stats_by_id.csv', index = False)

things to note
* only analyzing those who made it to the top division
* not analyzing old records (no proof for push/non-push)
* classifying pushers only by winning techniques

things to explore
* bouts in certain time periods
* bouts in top divisions only (sanyaku, makuuchi, etc.)

big questions
* should we deal with wrestlers like akebono differently? is the m score representative?
* body weight/size of wrestler as opposed to their style