In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import os
import joblib

In [4]:
df =  pd.read_csv('df_worlds.csv')
df['name'] = df['name'].astype(str).str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()

# Agregar ranking WCA
ranking_wca = {
    'Yiheng Wang': 1,
    'Xuanyi Geng': 2,
    'Tymon Kolasiński': 3,
    'Ruihang Xu': 4,
    'Max Park': 5,
    'Teodor Zajder': 6,
    'Luke Garrett': 7,
    'Bofan Zhang': 8,
    'Matty Hiroto Inaba': 9
}
df['ranking_wca'] = df['name'].map(ranking_wca)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752 entries, 0 to 751
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pos              752 non-null    int64  
 1   name             752 non-null    object 
 2   event_id         752 non-null    int64  
 3   wca_id           752 non-null    object 
 4   average_seconds  752 non-null    float64
 5   ranking_wca      752 non-null    int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 35.4+ KB


Unnamed: 0,pos,name,event_id,wca_id,average_seconds,ranking_wca
0,2,Bofan Zhang,333,2021ZHAN01,5.62,8
1,4,Bofan Zhang,333,2021ZHAN01,5.78,8
2,3,Bofan Zhang,333,2021ZHAN01,5.56,8
3,2,Bofan Zhang,333,2021ZHAN01,5.72,8
4,2,Bofan Zhang,333,2021ZHAN01,6.06,8


In [5]:
df['en_podio'] = (df['pos'] <= 3).astype(int)

In [6]:
label_encoder = LabelEncoder()
df['name'] = label_encoder.fit_transform(df['name'])
#label_encoder = LabelEncoder()
#df['wca_id'] = label_encoder.fit_transform(df['wca_id'])

In [26]:
X = df[['average_seconds', 'name']]
y = df['en_podio']

In [27]:
y
#X

0      1
1      0
2      1
3      1
4      1
      ..
747    1
748    1
749    1
750    1
751    1
Name: en_podio, Length: 752, dtype: int32

In [28]:
df.groupby('name')['en_podio'].mean()


name
0    0.519231
1    0.939024
2    0.882353
3    0.986301
4    0.645161
5    0.807692
6    1.000000
7    0.983051
8    0.977099
Name: en_podio, dtype: float64

In [29]:
df['en_podio_prob'] = df.groupby('name')['en_podio'].transform('mean')
df['en_podio_prob']


0      0.519231
1      0.519231
2      0.519231
3      0.519231
4      0.519231
         ...   
747    0.977099
748    0.977099
749    0.977099
750    0.977099
751    0.977099
Name: en_podio_prob, Length: 752, dtype: float64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)#distribucion de rango 80-20

In [31]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
#y_pred = rf.predict(X_test)

In [32]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf.score(X_test, y_test)

0.8849557522123894

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.48      0.51        29
           1       0.93      0.94      0.93       197

    accuracy                           0.88       226
   macro avg       0.73      0.71      0.72       226
weighted avg       0.88      0.88      0.88       226



In [33]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 14  15]
 [ 11 186]]


In [34]:
features =  pd.DataFrame(rf.feature_importances_, index = X.columns)
features.head()

Unnamed: 0,0
average_seconds,0.755994
name,0.244006


In [35]:
#hyperparameters
rf2 = RandomForestClassifier(n_estimators= 1000, criterion= 'entropy', min_samples_split= 10, max_depth=14, random_state=17)

In [36]:
rf2.fit(X_train, y_train)

In [37]:
rf2.score(X_test, y_test)

0.9203539823008849

In [38]:
y_pred2 = rf2.predict(X_test)


In [39]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.74      0.59      0.65        29
           1       0.94      0.97      0.95       197

    accuracy                           0.92       226
   macro avg       0.84      0.78      0.80       226
weighted avg       0.91      0.92      0.92       226



In [40]:
joblib.dump(rf, "./random_forest.joblib")

['./random_forest.joblib']

In [100]:
df_ultimos = df.groupby('name', group_keys=False).apply(
    lambda g: g.tail(max(1, int(len(g) * 0.2)))['average_seconds'].mean()
).reset_index(name='promedio_ultimos')

  df_ultimos = df.groupby('name', group_keys=False).apply(


In [101]:
df_ultimos

Unnamed: 0,name,promedio_ultimos
0,0,5.68
1,1,6.2525
2,2,5.834
3,3,5.647143
4,4,5.388333
5,5,5.828
6,6,5.206
7,7,4.806522
8,8,4.517308


In [102]:
df_ultimos['namedeencoded'] = label_encoder.inverse_transform(df_ultimos['name'])
df_ultimos['namedeencoded']

0           Bofan Zhang
1          Luke Garrett
2    Matty Hiroto Inaba
3              Max Park
4            Ruihang Xu
5         Teodor Zajder
6      Tymon Kolasiński
7           Xuanyi Geng
8           Yiheng Wang
Name: namedeencoded, dtype: object

In [25]:
y_pred2

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1])