In [1]:
import requests
from lxml import html
import pandas as pd
import duckdb as dd
import plotly.express as px

In [2]:
def extract_user_info(raw_user):
    """Split one raw table row into its non-empty, whitespace-trimmed fields.

    Fields are separated by runs of three spaces; pieces that are empty
    after trimming are discarded.

    Parameters
    ----------
    raw_user : str
        Text of a single row.

    Returns
    -------
    list of str
        The trimmed, non-empty fields in their original order.
    """
    fields = []
    for chunk in raw_user.split('   '):
        cleaned = chunk.strip()
        if cleaned != '':
            fields.append(cleaned)
    return fields
    
def extract_from_string(string, delimiter, index):
    """Return the ``index``-th piece of ``string`` split on ``delimiter``.

    This is a best-effort helper: instead of raising, it returns ``None``
    when the piece cannot be extracted.

    Parameters
    ----------
    string : str
        Text to split. Non-string values (e.g. ``None`` or NaN) yield ``None``.
    delimiter : str
        Separator passed to ``str.split``. An empty or non-string delimiter
        yields ``None``. Note that ``None`` is accepted by ``str.split`` and
        means "split on runs of whitespace".
    index : int
        Position of the wanted piece.

    Returns
    -------
    str or None
        The requested piece, or ``None`` if it does not exist or the inputs
        cannot be split.
    """
    try:
        return string.split(delimiter)[index]
    # Only the failures split/indexing can produce are treated as "no value";
    # a bare ``except`` would also swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

In [3]:
# --- Download the event's check-in page and locate the athletes table -------
url = 'https://www.ilutas.com.br/checagem/list-all/?event=735d5f756b9ba8108028fc535e9c1bea'
# Without a timeout, requests can block forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on HTTP errors instead of trying to parse an error page.
response.raise_for_status()
xpath = '/html/body/div[4]/div/div[2]/div[2]/table'
tree = html.fromstring(response.text)
tables = tree.xpath(xpath)
if not tables:
    # The absolute XPath is tied to the current page layout.
    raise RuntimeError(f'Athletes table not found at {xpath}; the page layout may have changed')
table = tables[0]

# --- Turn the table text into one cleaned string per row --------------------
table_string = table.text_content()
# Rows are separated by this exact tab/newline run in the rendered text.
table_list = table_string.split('\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t')
table_list = [x.replace('\t', '') for x in table_list]
table_list = [x.replace('\n', '') for x in table_list]
table_list = [x.replace("'", "") for x in table_list]
# Keep only the chunks that contain a hyphen; everything else is dropped.
table_list = [x for x in table_list if "-" in x]

# --- Build the raw DataFrame: one row per table row -------------------------
dict_keys = ['name', 'modallity', 'category', 'team', 'number', 'status']
dict_data = {}

for i, row in enumerate(table_list):
    data = extract_user_info(row)
    # zip stops at the shorter sequence, so rows with fewer fields simply
    # lack the trailing keys.
    dict_data[i] = dict(zip(dict_keys, data))

df = pd.DataFrame.from_dict(dict_data, orient='index')

# --- Derive flags and attributes from the 'category' text -------------------
# NOTE(review): the sample output shows in_category and in_absolute as 0 for
# every displayed row; these phrases may actually live in the 'modallity'
# field rather than 'category' -- confirm against the page.
df['in_category'] = df['category'].apply(lambda x: 1 if 'Categoria de Peso' in x else 0)
df['in_absolute'] = df['category'].apply(lambda x: 1 if 'Absoluto Jiu-Jitsu' in x else 0)
df['gender'] = df['category'].apply(lambda x: 'F' if 'FEMININO' in  x else 'M')
df['age_category'] = df['category'].apply(lambda x: x.split('-')[0])
# 'adult' is 1 when the text before the first hyphen has no '(' and 0
# otherwise. Despite the name it selects between two category formats: the
# sample output shows JUVENIL and MASTER 2 rows with adult == 1.
df['adult'] = df['age_category'].apply(lambda x: 0 if '(' in x else 1)

# Hyphen-separated format: piece 1 is the belt, piece 2 the weight class.
df['weight_category'] = df['category'].apply(lambda x: extract_from_string(x, '-', 2))
df['belt'] = df['category'].apply(lambda x: extract_from_string(x, '-', 1))

# Parenthesised format: age is the text before the first '('.
df['age_category_non_adult'] = df['category'].apply(lambda x: extract_from_string(x, '(' , 0))
# Belt: take the text after the first ')', cut it at the next '(', then keep
# its second space-separated token (presumably the first token is empty
# because the text starts with a space -- TODO confirm).
df['belt_non_adult'] = df['category'].apply(lambda x: extract_from_string(x, ')' , 1))
df['belt_non_adult'] = df['belt_non_adult'].apply(lambda x: extract_from_string(x, '(' , 0))
df['belt_non_adult'] = df['belt_non_adult'].apply(lambda x: extract_from_string(x, ' ' , 1))
# Weight: the text following the belt token, cut at the next ' ('.
# When belt_non_adult is None, str.split(None) splits on whitespace, so this
# intermediate value is only meaningful for rows with adult == 0.
df['weight_category_non_adult'] = df.apply(lambda x: extract_from_string(x['category'], x['belt_non_adult'], 1), axis=1)
df['weight_category_non_adult'] = df.apply(lambda x: extract_from_string(x['weight_category_non_adult'], ' (', 0), axis=1)

# Pick the value that matches each row's format.
df['age_category'] = df.apply(lambda x: x['age_category'] if x['adult'] == 1 else x['age_category_non_adult'], axis=1)
df['belt'] = df.apply(lambda x: x['belt'] if x['adult'] == 1 else x['belt_non_adult'], axis=1)
df['weight_category'] = df.apply(lambda x: x['weight_category'] if x['adult'] == 1 else x['weight_category_non_adult'], axis=1)
df['confirmed'] = df['status'].apply(lambda x: 1 if 'Confirmado' in x else 0)

# Drop the parsing helpers and the raw text columns, then persist and preview.
df = df.drop(columns=['age_category_non_adult', 'belt_non_adult', 'weight_category_non_adult', 'category', 'modallity', 'status'])
df.to_csv('ilutas.csv', index=False)
df.head(5)

Unnamed: 0,name,team,number,in_category,in_absolute,gender,age_category,adult,weight_category,belt,confirmed
0,ABNER BASTOS CAVALCANTE,BLEC BRUNO LEANDRO,18,0,0,M,JUVENIL,1,SUPER PESADO,AZUL,1
1,ADRIANO FIDELES BARROSO,GRACIE BARRA FORTALEZA RICARDO PINHO,273,0,0,M,ADULTO,1,LEVE,AZUL,0
2,ALEXANDRE FERREIRA DA CONCEIÇÃO,GRACIE BARRA FORTALEZA DANIEL HOLANDA BAIMA,267,0,0,M,ADULTO,1,MéDIO,ROXA,0
3,ALEXANDRE PEREIRA DA SILVA,TERÊNCIO ABREU TERÊNCIO ABREU,370,0,0,M,ADULTO,1,SUPER PESADO,BRANCA,0
4,ALISON MATEUS BRANDAO DE LIMA,BD TEAM BJJ ALISSON BDL,360,0,0,M,MASTER 2,1,PESADíSSIMO,PRETA,0


In [4]:
# Confirmed athletes per team.
# Copy AFTER filtering: assigning a column on the frame returned by .query()
# can otherwise trigger pandas' SettingWithCopyWarning.
df_team = df.query('confirmed == 1').copy()
# Reduce each team label to its first two space-separated words (labels with
# a single word are kept whole), so entries such as
# 'GRACIE BARRA FORTALEZA ...' are counted together as 'GRACIE BARRA'.
df_team['team'] = df_team['team'].apply(lambda x: ' '.join(x.split(' ')[:2]))
# count() reports non-null values per column, so columns may differ slightly;
# 'name' is used as the athlete count. The final reset_index gives a 0..n-1
# index that follows the descending order.
df_team = (
    df_team
    .groupby('team')
    .count()
    .reset_index()
    .sort_values(by='name', ascending=False)
    .reset_index(drop=True)
)
df_team.head(10)

Unnamed: 0,team,name,number,in_category,in_absolute,gender,age_category,adult,weight_category,belt,confirmed
0,BD TEAM,23,23,23,23,23,23,23,22,22,23
1,GRACIE BARRA,15,15,15,15,15,15,15,15,15,15
2,KOA JIU,5,5,5,5,5,5,5,5,5,5
3,NOVA UNIÃO,5,5,5,5,5,5,5,5,5,5
4,ALLIANCE FORTALEZA,3,3,3,3,3,3,3,3,3,3
5,SAS TEAM,3,3,3,3,3,3,3,3,3,3
6,GRACIE HUMAITÁ,3,3,3,3,3,3,3,3,3,3
7,UBUNTU JIU-JITSU,2,2,2,2,2,2,2,2,2,2
8,SER JIU-JITSU,2,2,2,2,2,2,2,2,2,2
9,PITBULL BROTHERS,2,2,2,2,2,2,2,2,2,2


In [5]:
# Bar chart of athletes per team: the ten highest-ranked teams are shown
# individually and every other team is pooled under 'OUTROS'.
# The rank comes from df_team's row position (index + 1).
df_plot = df_team.assign(rank=df_team.index + 1)
df_plot['team'] = df_plot.apply(
    lambda row: row['team'] if row['rank'] <= 10 else 'OUTROS',
    axis=1,
)
# Summing per label also sums the ranks, so the pooled 'OUTROS' row carries
# the sum of all ranks above 10 when sorting by 'rank'.
df_plot = (
    df_plot
    .assign(atlhetes=df_plot['name'])
    .groupby('team')
    .sum()
    .reset_index()
    .sort_values(by='rank', ascending=True)
)


fig = px.bar(
    df_plot,
    x='team',
    y='atlhetes',
    title='Número de atletas por equipe (Top 10)',
    labels={'team': 'Equipe', 'atlhetes': 'Número de atletas'},
    height=500,
    width=800,
)

# Transparent plot and paper backgrounds (alpha channel is 0).
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)', paper_bgcolor='rgba(0, 0, 0, 0)')

# Same fill and outline for every bar.
fig.update_traces(
    marker=dict(color='lightblue', line=dict(color='gray', width=1.5)),
    opacity=1,
)


fig.show()

In [6]:
# Sunburst of confirmed athletes by belt, restricted to rows with adult == 1.
df_belts = (
    df
    .query('confirmed == 1')
    .query('adult == 1')
    .groupby(['belt', 'adult'])
    .count()
    .reset_index()
    .sort_values(by='name', ascending=False)
    .reset_index(drop=True)
    .assign(
        # After count(), 'name' holds the number of rows per group.
        athletes=lambda counts: counts['name'],
        # Trim surrounding whitespace so labels match the colour-map keys.
        belt=lambda counts: counts['belt'].apply(lambda label: label.strip()),
        # Only adult == 1 rows survive the filter above, so in practice every
        # row is labelled 'Adulto'.
        adult=lambda counts: counts['adult'].apply(
            lambda flag: 'Adulto' if flag == 1 else 'Não Adulto'
        ),
    )
)

# Belt label -> sector colour. NOTE(review): '(?)' appears to be the label
# plotly express assigns to parent sectors whose children have differing
# colour values -- confirm against the plotly documentation.
color_dict = {
    'BRANCA': 'white',
    'BRANCA/CINZA': 'gray',
    'CINZA': 'gray',
    'VERDE': 'green',
    'AZUL': 'blue',
    'ROXA': 'purple',
    'MARROM': 'brown',
    'PRETA': 'black',
    'COLORIDA': 'yellow',
    '(?)': 'lightblue',
}

fig = px.sunburst(
    df_belts,
    path=['adult', 'belt'],
    values='athletes',
    title='',
    color='belt',
    color_discrete_map=color_dict,
    height=500,
    width=500,
)
# Show each sector's label and its percentage of the entry, and outline all
# sectors with a gray border.
fig.update_traces(
    textinfo='label+percent entry',
    marker=dict(line=dict(color='gray', width=2)),
)
fig.show()
