# This file is used to visualize the wrestler and metric dataframes.

In [64]:
# plotting and data processing
import matplotlib as mpltlib
import matplotlib.pyplot as plt
import matplotlib.patches as  mpatches
import seaborn 
import numpy as np
import pandas as pd

# auxiliary 
import requests, re, json
from bs4 import BeautifulSoup
import datetime 
import math

# custom helpers
import rank_helper
import kimarite_helper

# ML tooling
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as smf


plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (15, 10)

In [65]:
df = pd.read_csv(f'tournaments_all_final.csv')

# remove empty tournaments 
df = df[df.Forfeit != 15.0]

In [73]:
stats_df = pd.read_csv(f'stats_by_id.csv')
upper = df[df.Rank.apply(lambda x: rank_helper.rank_is_top(x))]

stats_df = stats_df[stats_df.n > 40]
full = pd.merge(upper, stats_df, on="id")


<br>

# Analysis with pusher metric

In [76]:
def push_class_logistic(data):
    
    pusherID = [12191,11985,8,12094,12043,11855,12239,5944,11934,7240]
    nonID = [11927,1123,12451,11868,1111,12107,12412,6599,11728,878,4,1]
    
    for i, row in data.iterrows():
        if int(row['id']) in  pusherID:
            data.loc[i,"push_class_numeric"] = 0
        elif row['id'] in  nonID:
            data.loc[i,"push_class_numeric"] = 1

    df = data.loc[data['id'].isin(pusherID) | data['id'].isin(nonID)]
    print(df)

    #Training set
    train_feat = df["m"] 
    train_targ = df["push_class_numeric"].values
    train_feat = [[m] for m in train_feat]
    print(train_targ)
    
    #Test Set
    test_feat = data["m"]
    test_feat = [[m] for m in test_feat]
    test_targ = data["push_class_numeric"].values
    
    lr = LogisticRegression()
    lr.fit(train_feat,train_targ)
   # print(lr.predict(test_feat)

    y_km = lr.predict(test_feat)
    arr1 = ['Pusher' if i == 0 else 'Non-Pusher' for i in y_km]     
    return(arr1)
    
    
    
def push_class_kmeans(m_col):
    kmeans = KMeans(n_clusters = 2)
    
    arr = []
    for i in m_col:
        arr.append([0,i])
        
    y_km = kmeans.fit_predict(arr)
#     f = 
    print(kmeans.fit(arr).cluster_centers_)
    arr1 = ['Pusher' if i == 0 else 'Non-Pusher' for i in y_km]
  
    return(arr1)

"""

def push_class(score):
    if score < -0.2:
        return 'Full pusher'
    elif score < 0:
        return 'Moderate pusher'
    elif score < 0.2:
        return 'Moderate other'
    else:
        return 'Full other'
"""

"\n\ndef push_class(score):\n    if score < -0.2:\n        return 'Full pusher'\n    elif score < 0:\n        return 'Moderate pusher'\n    elif score < 0.2:\n        return 'Moderate other'\n    else:\n        return 'Full other'\n"

In [77]:

#stats_df['push_class'] = stats_df.apply(lambda row: pusher_class(row.m) )
stats_df['push_class_numeric'] = np.nan
stats_df['push_class_LogisticRegression'] = push_class_logistic(stats_df)
stats_df['push_class_KMeans'] = push_class_kmeans(stats_df['m'])


stats_df

           id         m         p       n  push_class_numeric  \
0         1.0  0.027626  0.490066   604.0                 1.0   
3         4.0  0.160073  0.444142   734.0                 1.0   
7         8.0 -0.711770  0.746442   773.0                 0.0   
157     878.0  1.010557  0.143068   678.0                 1.0   
174    1111.0  1.017117  0.151625   831.0                 1.0   
175    1123.0  1.004299  0.173028  1179.0                 1.0   
948    5944.0 -0.661831  0.734027   673.0                 0.0   
977    6599.0  1.039385  0.126645   608.0                 1.0   
1001   7240.0 -0.186435  0.569892   465.0                 0.0   
1012  11728.0  0.610894  0.274162   507.0                 1.0   
1020  11855.0 -0.356766  0.640580   345.0                 0.0   
1021  11868.0  0.584271  0.270115   348.0                 1.0   
1023  11927.0  1.045357  0.095855   386.0                 1.0   
1024  11934.0 -0.208165  0.580000   400.0                 0.0   
1031  11985.0 -0.565636  

Unnamed: 0,id,m,p,n,push_class_numeric,push_class_LogisticRegression,push_class_KMeans
0,1.0,0.027626,0.490066,604.0,1.0,Pusher,Non-Pusher
1,2.0,1.220098,0.075798,752.0,,Non-Pusher,Pusher
2,3.0,0.536245,0.302682,522.0,,Non-Pusher,Pusher
3,4.0,0.160073,0.444142,734.0,1.0,Non-Pusher,Non-Pusher
4,5.0,1.253818,0.058140,688.0,,Non-Pusher,Pusher
...,...,...,...,...,...,...,...
1075,12449.0,-0.099929,0.548673,113.0,,Pusher,Non-Pusher
1076,12451.0,0.707805,0.156522,115.0,1.0,Non-Pusher,Pusher
1077,12453.0,-0.389437,0.702381,84.0,,Pusher,Non-Pusher
1078,12470.0,-0.700176,0.865854,82.0,,Pusher,Non-Pusher


In [None]:
# note this is only a merge on the top two ranks
full = pd.merge(upper, stats_df, on="id")
full.dropna()
#seaborn.countplot(x="push_class", order=['Full pusher', 'Moderate pusher', 'Moderate other', 'Full other'], data=full)
ax = seaborn.countplot(x="push_class_LogisticRegression", order=['Pusher', 'Non-Pusher'], data=full)
ax.set_title("Logistic Regression")

In [None]:
seaborn.histplot(data=stats_df.loc[stats_df['push_class_LogisticRegression'] == 'Pusher'], x="m", color="red").set(
        title='m distribution',
        xlabel='score',
        ylabel='Count') 

seaborn.histplot(data=stats_df.loc[stats_df['push_class_LogisticRegression'] == 'Non-Pusher'], x="m", color="blue").set(
        title='m distribution (logistic regression)',
        xlabel='score',
        ylabel='Count') 

In [None]:
ax2 = seaborn.countplot(x="push_class_KMeans", order=['Pusher', 'Non-Pusher'], data=full)
ax2.set_title("KMeans")

In [None]:
seaborn.histplot(data=stats_df.loc[stats_df['push_class_KMeans'] == 'Pusher'], x="m", color="red").set(
        title='m distribution',
        xlabel='score',
        ylabel='Count') 

seaborn.histplot(data=stats_df.loc[stats_df['push_class_KMeans'] == 'Non-Pusher'], x="m", color="blue").set(
        title='m distribution (k means classifier)',
        xlabel='score',
        ylabel='Count') 