# Take the clean data and process it for exploration and ML predictions

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

Matplotlib is building the font cache; this may take a moment.


In [3]:
# Load the cleaned rank table; df.info() prints each column's dtype and non-null count.
# NOTE(review): the file carries an 'Unnamed: 0' column, presumably a saved index; confirm before modelling.
df = pd.read_csv('processed_ranks.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  350 non-null    int64  
 1   rank        350 non-null    object 
 2   name        350 non-null    object 
 3   pob         350 non-null    object 
 4   birthdate   350 non-null    object 
 5   height      350 non-null    float64
 6   weight      350 non-null    float64
 7   year        350 non-null    int64  
 8   month       350 non-null    int64  
 9   win         344 non-null    float64
 10  loss        344 non-null    float64
 11  absent      34 non-null     float64
 12  next_rank   266 non-null    object 
dtypes: float64(5), int64(3), object(5)
memory usage: 35.7+ KB


In [4]:
# preview the first rows of the loaded table
df.head()

Unnamed: 0.1,Unnamed: 0,rank,name,pob,birthdate,height,weight,year,month,win,loss,absent,next_rank
0,6,K1e,Abi,Saitama,04.05.1994,185.0,121.0,2020,1,7.0,8.0,,M4w
1,84,M4w,Abi,Saitama,04.05.1994,185.0,121.0,2020,3,6.0,9.0,,M5e
2,156,M5e,Abi,Saitama,04.05.1994,185.0,121.0,2020,7,9.0,6.0,,M14w
3,246,M14w,Abi,Saitama,04.05.1994,185.0,121.0,2020,9,11.0,4.0,,J11w
4,343,J11w,Abi,Saitama,04.05.1994,185.0,121.0,2020,11,8.0,7.0,,


# process ranks

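Ranks are a mix of categorical and numerical parts (e.g. `M4w` is tier `M`, number `4`, side `w`), each with its own ordering:
- tier: Y > O > S > K > M > J
- number: 1 > 2 > 3 > 4 ...
- side: e (east) > w (west)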

In [9]:
def east_west(df_rank):
    """
    Flag which ranks sit on the east side.

    Every entry of `df_rank` is tested for the substring 'e' through the
    pandas string accessor, and the per-row result of that test is
    returned as-is.
    """
    east_marker = 'e'
    return df_rank.str.contains(east_marker)

# Derive the east/west flag for the current rank and for the next rank.
# (A bare east_west(df['rank']) call used to precede these lines; its result
# was neither stored nor displayed, so it has been dropped.)
df['is_east'] = east_west(df['rank'])
df['next_is_east'] = east_west(df['next_rank'])
df.head()

Unnamed: 0.1,Unnamed: 0,rank,name,pob,birthdate,height,weight,year,month,win,loss,absent,next_rank,is_east,next_is_east
0,6,K1e,Abi,Saitama,04.05.1994,185.0,121.0,2020,1,7.0,8.0,,M4w,True,False
1,84,M4w,Abi,Saitama,04.05.1994,185.0,121.0,2020,3,6.0,9.0,,M5e,False,True
2,156,M5e,Abi,Saitama,04.05.1994,185.0,121.0,2020,7,9.0,6.0,,M14w,True,False
3,246,M14w,Abi,Saitama,04.05.1994,185.0,121.0,2020,9,11.0,4.0,,J11w,False,False
4,343,J11w,Abi,Saitama,04.05.1994,185.0,121.0,2020,11,8.0,7.0,,,False,


In [22]:
# scratch check: the first character of each rank code is the tier letter
df['rank'].str[0]

0      K
1      M
2      M
3      M
4      J
      ..
345    M
346    M
347    M
348    M
349    M
Name: rank, Length: 350, dtype: object

In [29]:
def tier_rank(df_rank):
    """
    return ordinal ranks for Y, O, S, K, M, J

    Parameters
    ----------
    df_rank : pd.Series
        Rank codes whose first character is the tier letter
        (e.g. 'K1e', 'M14w'). Missing entries are allowed.

    Returns
    -------
    pd.Series
        Ordinal tier per row: 1 for 'Y' down to 6 for 'J'. Missing ranks and
        tier letters that are not in the mapping come back as NaN.
    """

    mapper = {
        'Y': 1,
        'O': 2,
        'S': 3,
        'K': 4,
        'M': 5,
        'J': 6
    }

    # Use map, not replace: replace only substitutes matching values, so the
    # column stays object-typed unless pandas silently downcasts it (a
    # deprecated behaviour), and unknown letters would survive as strings.
    # map performs a plain lookup and always yields a numeric result.
    return df_rank.str[0].map(mapper)

# encode the tier of the current rank and of the next rank as ordinals
df['tier_rank'] = tier_rank(df['rank'])
df['next_tier_rank'] = tier_rank(df['next_rank'])
df.head()

Unnamed: 0.1,Unnamed: 0,rank,name,pob,birthdate,height,weight,year,month,win,loss,absent,next_rank,is_east,next_is_east,tier_rank,next_tier_rank
0,6,K1e,Abi,Saitama,04.05.1994,185.0,121.0,2020,1,7.0,8.0,,M4w,True,False,4,5.0
1,84,M4w,Abi,Saitama,04.05.1994,185.0,121.0,2020,3,6.0,9.0,,M5e,False,True,5,5.0
2,156,M5e,Abi,Saitama,04.05.1994,185.0,121.0,2020,7,9.0,6.0,,M14w,True,False,5,5.0
3,246,M14w,Abi,Saitama,04.05.1994,185.0,121.0,2020,9,11.0,4.0,,J11w,False,False,5,6.0
4,343,J11w,Abi,Saitama,04.05.1994,185.0,121.0,2020,11,8.0,7.0,,,False,,6,


In [None]:
def tier_