In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the bout table from the CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/1983.csv')

In [3]:
# Display only the rows whose `day` column equals 1.
df.loc[df['day'].eq(1)]

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win
0,1983.01,1,4140,J13w,Chikubayama,0-1 (7-8),False,yorikiri,4306,Ms1e,Ofuji,1-0 (6-1),True
1,1983.01,1,4306,Ms1e,Ofuji,1-0 (6-1),True,yorikiri,4140,J13w,Chikubayama,0-1 (7-8),False
2,1983.01,1,1337,J12w,Tochitsukasa,1-0 (9-6),True,oshidashi,4323,J13e,Shiraiwa,0-1 (3-12),False
3,1983.01,1,4323,J13e,Shiraiwa,0-1 (3-12),False,oshidashi,1337,J12w,Tochitsukasa,1-0 (9-6),True
4,1983.01,1,4097,J12e,Tamakiyama,0-1 (8-7),False,yorikiri,4319,J11w,Harunafuji,1-0 (5-10),True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4605,1983.11,1,4107,M3w,Kaiki,0-1 (4-11),False,yorikiri,4080,Y2eHD,Kitanoumi,1-0 (11-4),True
4606,1983.11,1,1350,M3e,Onokuni,1-0 (10-5),True,yoritaoshi,1354,Y1w,Chiyonofuji,0-1 (14-1),False
4607,1983.11,1,1354,Y1w,Chiyonofuji,0-1 (14-1),False,yoritaoshi,1350,M3e,Onokuni,1-0 (10-5),True
4608,1983.11,1,1342,K1w,Asahifuji,0-1 (6-9),False,yorikiri,4104,Y1e,Takanosato,1-0 (13-2),True


In [4]:
# Sample result strings used by the test cells below. The integer key of each
# entry is its position in the tuple, so the tests can refer to a sample by
# number.
TEST_RECORDS = dict(enumerate((
    '1-0 (8-7)',        # 0: two fields, then a parenthesised record
    '(12-3)',           # 1: parenthesised record only
    '0-9 (4-11)',       # 2
    '12-2 (13-2)',      # 3
    '3-12 (3-12)',      # 4
    '- (-)',            # 5: no digits at all
    '3-2',              # 6: two fields, no parenthesis
    '10-1-2',           # 7: three fields, no parenthesis
    '3-1 (3-1-12)',     # 8: three fields inside the parenthesis only
    '2-10-1 (2-10-3)',  # 9: three fields on both sides
)))

In [5]:
def absence_detect(record):
    """Return True when `record` contains more than one '-' character.

    A string with zero or one '-' yields False. The other parsers in this
    notebook use this to decide whether a third (absences) field is present.
    """
    return record.count('-') > 1

In [6]:
def test_absence():
    """Check absence_detect on a three-field and a two-field sample."""
    # TEST_RECORDS key -> expected absence_detect result
    expectations = {7: True, 6: False}
    for key, expected in expectations.items():
        assert absence_detect(TEST_RECORDS[key]) == expected
test_absence()

In [7]:
def get_start_index(record):
    """Return the index at which the record text starts.

    This is 1 when `record` begins with '(' (so the parenthesis is skipped)
    and 0 in every other case, including the empty string.
    """
    return 1 if record.startswith('(') else 0

In [8]:
def get_first_index(record):
    """Return the index of the first '-' in `record`, or -1 if there is none."""
    first_dash = record.find('-')
    return first_dash

In [9]:
def get_last_index(record):
    """Return the index of the last '-' in `record`, or -1 if there is none."""
    last_dash = record.rfind('-')
    return last_dash

In [10]:
def get_end_index(record):
    """Return the exclusive end index of the leading record inside `record`.

    Three shapes are handled:

    * '(' appears after index 0 (e.g. '1-0 (8-7)'): the end is the position
      just past the last non-whitespace character before the '('.
    * '(' is at index 0 (e.g. '(12-3)'), or is missing while ')' is present:
      the end is the index of the first ')' when that index is greater
      than 0.
    * otherwise (e.g. '3-2'): the end is len(record).

    Parameters
    ----------
    record : str
        Result string to inspect.

    Returns
    -------
    int
        Index suitable as the stop value of a slice over `record`.
    """
    open_par_index = record.find('(')
    if open_par_index > 0:
        # Strip whatever whitespace separates the record from '(' rather than
        # assuming exactly one space: with no space at all, subtracting 1
        # would cut off the final digit of the record.
        return len(record[:open_par_index].rstrip())
    close_par_index = record.find(')')  # Reached when '(' is at index 0 or absent
    if close_par_index > 0:
        return close_par_index
    return len(record)

In [11]:
def test_indexes():
    """Check the four index helpers against selected TEST_RECORDS samples."""
    # TEST_RECORDS key -> expected get_start_index result
    for key, expected in ((0, 0), (1, 1)):
        assert get_start_index(TEST_RECORDS[key]) == expected

    # A single '-' means first and last index coincide.
    assert get_first_index(TEST_RECORDS[6]) == get_last_index(TEST_RECORDS[6]) == 1
    assert get_first_index(TEST_RECORDS[7]) == 2
    assert get_last_index(TEST_RECORDS[7]) == 4

    # TEST_RECORDS key -> expected get_end_index result
    expected_end = {0: 3, 1: 5, 6: 3, 7: 6, 8: 3, 9: 6}
    for key, expected in expected_end.items():
        assert get_end_index(TEST_RECORDS[key]) == expected
test_indexes()

In [12]:
def get_post_fight(record):
    """Return the slice of `record` from get_start_index to get_end_index.

    For '1-0 (8-7)' this is the text before the parenthesis ('1-0'); for
    '(12-3)' it is the text inside the parentheses ('12-3').
    """
    return record[get_start_index(record):get_end_index(record)]

In [13]:
def test_post_fight():
    """Check get_post_fight against the expected slice of each sample."""
    # TEST_RECORDS key -> expected get_post_fight result
    expected_slices = {
        0: '1-0',
        1: '12-3',
        2: '0-9',
        3: '12-2',
        4: '3-12',
        6: '3-2',
        7: '10-1-2',
        8: '3-1',
        9: '2-10-1',
    }
    for key, expected in expected_slices.items():
        assert get_post_fight(TEST_RECORDS[key]) == expected
test_post_fight()

In [14]:
import functools

def nan_exception(func):
    """Decorator: call `func`, but return np.nan if it raises | func --> func

    Any exception derived from Exception is swallowed and replaced by np.nan;
    otherwise the wrapped function's return value is passed through unchanged.
    The wrapper keeps the metadata of `func` via functools.wraps.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            return np.nan
    return wrapper

In [15]:
@nan_exception
def get_post_wins(record):
    """Return the first field (wins) of the record as an int.

    The record is first reduced with get_post_fight, then everything before
    the first '-' is converted with int().

    Parameters
    ----------
    record : str
        Result string such as '1-0 (8-7)', '(12-3)' or '10-1-2'.

    Returns
    -------
    int or float
        The parsed value, or np.nan (via the nan_exception decorator) when
        anything raises, e.g. for '- (-)' or a record with no '-' at all.
    """
    post_fight = get_post_fight(record)
    dash_index = get_first_index(post_fight)
    if dash_index == -1:
        # Without this guard the slice below would be post_fight[:-1], which
        # silently drops the last character ('12' would parse as 1).
        raise ValueError("no '-' separator in record {!r}".format(record))
    return int(post_fight[:dash_index])

In [16]:
def test_post_wins():
    """Check get_post_wins on the samples, including the NaN fallback."""
    # TEST_RECORDS key -> expected get_post_wins result
    expected_wins = {0: 1, 1: 12, 2: 0, 3: 12, 4: 3, 6: 3, 7: 10}
    for key, wins in expected_wins.items():
        assert get_post_wins(TEST_RECORDS[key]) == wins
    # '- (-)' has no digits, so parsing fails and NaN is returned.
    assert np.isnan(get_post_wins(TEST_RECORDS[5]))
test_post_wins()

In [17]:
@nan_exception
def get_post_losses(record):
    """Return the second field (losses) of the record as an int.

    The record is first reduced with get_post_fight. When absence_detect
    reports more than one '-', the value is the text between the first and
    the last '-'; otherwise it is everything after the first '-'.

    Parameters
    ----------
    record : str
        Result string such as '1-0 (8-7)', '(12-3)' or '10-1-2'.

    Returns
    -------
    int or float
        The parsed value, or np.nan (via the nan_exception decorator) when
        anything raises, e.g. for '- (-)' or a record with no '-' at all.
    """
    post_fight = get_post_fight(record)
    first_dash = get_first_index(post_fight)
    if first_dash == -1:
        # Without this guard the start index would be 0 and the whole string
        # would be parsed as the losses ('12' would yield 12).
        raise ValueError("no '-' separator in record {!r}".format(record))
    # With a third field present, stop before the last '-'; a stop of None
    # slices through to the end of the string.
    end_index = get_last_index(post_fight) if absence_detect(post_fight) else None
    return int(post_fight[first_dash + 1:end_index])

In [18]:
def test_post_losses():
    """Check get_post_losses on the samples, including the NaN fallback."""
    # TEST_RECORDS key -> expected get_post_losses result
    expected_losses = {0: 0, 1: 3, 2: 9, 3: 2, 4: 12, 6: 2, 7: 1}
    for key, losses in expected_losses.items():
        assert get_post_losses(TEST_RECORDS[key]) == losses
    # '- (-)' has no digits, so parsing fails and NaN is returned.
    assert np.isnan(get_post_losses(TEST_RECORDS[5]))
test_post_losses()

In [19]:
@nan_exception
def get_post_absences(record):
    """Return the third field (absences) of the record as an int.

    The record is first reduced with get_post_fight. If absence_detect finds
    no more than one '-', the result is 0; otherwise the text after the last
    '-' is converted with int(). Any exception becomes np.nan through the
    nan_exception decorator.
    """
    post_fight = get_post_fight(record)
    if not absence_detect(post_fight):
        return 0
    after_last_dash = post_fight[get_last_index(post_fight) + 1:]
    return int(after_last_dash)

In [20]:
def test_post_absences():
    """Check get_post_absences on samples with and without a third field.

    Raw records are passed in directly, the same way the function is applied
    to the DataFrame columns; get_post_absences slices the record itself.
    """
    assert get_post_absences(TEST_RECORDS[6]) == 0
    assert get_post_absences(TEST_RECORDS[7]) == 2
    # '3-1 (3-1-12)': the text before the parenthesis is '3-1', which has no
    # third field, so the result is 0 (the 12 sits inside the parentheses).
    assert get_post_absences(TEST_RECORDS[8]) == 0
    # '2-10-1 (2-10-3)': the text before the parenthesis is '2-10-1'.
    assert get_post_absences(TEST_RECORDS[9]) == 1
test_post_absences()

In [21]:
# Parse the rikishi1_result strings into numeric wins/losses columns.
df.loc[:, 'post1_wins'] = df['rikishi1_result'].apply(get_post_wins)
df.loc[:, 'post1_losses'] = df['rikishi1_result'].apply(get_post_losses)

In [22]:
# Parse the rikishi2_result strings into numeric wins/losses columns.
df.loc[:, 'post2_wins'] = df['rikishi2_result'].apply(get_post_wins)
df.loc[:, 'post2_losses'] = df['rikishi2_result'].apply(get_post_losses)

In [23]:
# Rows whose post1_losses is null (get_post_losses returns NaN when parsing raises).
df.loc[df['post1_losses'].isnull(), :]

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win,post1_wins,post1_losses,post2_wins,post2_losses


In [24]:
# Keep only the rows and the columns that contain at least one null value.
df.loc[df.isnull().any(axis='columns'), df.isnull().any(axis='index')]

In [25]:
# Every row that has a null in any column, with all columns shown.
df[df.isnull().any(axis='columns')]

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win,post1_wins,post1_losses,post2_wins,post2_losses


In [26]:
# Parse the third (absences) field of both result columns.
df.loc[:, 'post2_absences'] = df['rikishi2_result'].apply(get_post_absences)
df.loc[:, 'post1_absences'] = df['rikishi1_result'].apply(get_post_absences)

In [27]:
# Re-check after adding the absences columns: rows and columns with any null.
df.loc[df.isnull().any(axis='columns'), df.isnull().any(axis='index')]

In [28]:
# Re-check after adding the absences columns: every row with a null anywhere.
df[df.isnull().any(axis='columns')]

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win,post1_wins,post1_losses,post2_wins,post2_losses,post2_absences,post1_absences


In [29]:
# Display the frame, now including the parsed post*_wins, post*_losses and
# post*_absences columns added above.
df

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win,post1_wins,post1_losses,post2_wins,post2_losses,post2_absences,post1_absences
0,1983.01,1,4140,J13w,Chikubayama,0-1 (7-8),False,yorikiri,4306,Ms1e,Ofuji,1-0 (6-1),True,0,1,1,0,0,0
1,1983.01,1,4306,Ms1e,Ofuji,1-0 (6-1),True,yorikiri,4140,J13w,Chikubayama,0-1 (7-8),False,1,0,0,1,0,0
2,1983.01,1,1337,J12w,Tochitsukasa,1-0 (9-6),True,oshidashi,4323,J13e,Shiraiwa,0-1 (3-12),False,1,0,0,1,0,0
3,1983.01,1,4323,J13e,Shiraiwa,0-1 (3-12),False,oshidashi,1337,J12w,Tochitsukasa,1-0 (9-6),True,0,1,1,0,0,0
4,1983.01,1,4097,J12e,Tamakiyama,0-1 (8-7),False,yorikiri,4319,J11w,Harunafuji,1-0 (5-10),True,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5483,1983.11,15,4104,Y1e,Takanosato,13-2,False,yorikiri,1354,Y1w,Chiyonofuji,14-1,True,13,2,14,1,0,0
5484,1983.11,16,1348,J4w,Jingaku,(11-4),False,sotogake,1378,J1w,Hoo,(11-4),True,11,4,11,4,0,0
5485,1983.11,16,1378,J1w,Hoo,(11-4),True,sotogake,1348,J4w,Jingaku,(11-4),False,11,4,11,4,0,0
5486,1983.11,16,1287,J12w,Konishiki,(11-4),False,yorikiri,1378,J1w,Hoo,(11-4),True,11,4,11,4,0,0
