# Pergunta:

## Peso, altura e idade influenciam se o atleta ganha medalha?

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error

In [2]:
df = pd.read_csv('C:\\01-FaculdadeSemestreAtual\\Projeto_Integrador_VI\\Datasets\\athlete_events.csv')

df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      271116 non-null  int64  
 1   Name    271116 non-null  object 
 2   Sex     271116 non-null  object 
 3   Age     261642 non-null  float64
 4   Height  210945 non-null  float64
 5   Weight  208241 non-null  float64
 6   Team    271116 non-null  object 
 7   NOC     271116 non-null  object 
 8   Games   271116 non-null  object 
 9   Year    271116 non-null  int64  
 10  Season  271116 non-null  object 
 11  City    271116 non-null  object 
 12  Sport   271116 non-null  object 
 13  Event   271116 non-null  object 
 14  Medal   39783 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 31.0+ MB


In [4]:
df.isnull().sum()

ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
dtype: int64

In [5]:
df.shape

(271116, 15)

# Tratamento de dados

## Preenchendo valores nulos

In [8]:
# Calcular a média de Height e Weight para cada atleta (Name)
mean_values_atleta = df.groupby('Name')[['Height', 'Weight']].mean()

# Calcular a média de Height e Weight para cada evento
mean_values_evento = df.groupby('Event')[['Height', 'Weight']].mean()

# Calcular a média geral do DataFrame
mean_height_geral = df['Height'].mean()
mean_weight_geral = df['Weight'].mean()

# Função para calcular a mediana de idade por esporte e ano
median_age_by_year_sport = df.groupby(['Year', 'Sport'])['Age'].median()

# Mediana geral de idade para o DataFrame
median_age_geral = df['Age'].median()

# Função para preencher os valores de Height e Weight
def preencher_valores(row):
    # Se o valor de Height for nulo
    if pd.isnull(row['Height']):
        # Tentar preencher com a média do atleta
        if not pd.isnull(mean_values_atleta.loc[row['Name'], 'Height']):
            row['Height'] = mean_values_atleta.loc[row['Name'], 'Height']
        # Se a média do atleta for nula, preencher com a média do evento
        elif not pd.isnull(mean_values_evento.loc[row['Event'], 'Height']):
            row['Height'] = mean_values_evento.loc[row['Event'], 'Height']
        # Se ainda for nulo, preencher com a média geral
        else:
            row['Height'] = mean_height_geral

    # Se o valor de Weight for nulo
    if pd.isnull(row['Weight']):
        # Tentar preencher com a média do atleta
        if not pd.isnull(mean_values_atleta.loc[row['Name'], 'Weight']):
            row['Weight'] = mean_values_atleta.loc[row['Name'], 'Weight']
        # Se a média do atleta for nula, preencher com a média do evento
        elif not pd.isnull(mean_values_evento.loc[row['Event'], 'Weight']):
            row['Weight'] = mean_values_evento.loc[row['Event'], 'Weight']
        # Se ainda for nulo, preencher com a média geral
        else:
            row['Weight'] = mean_weight_geral

    # Tentar calcular a idade usando o ano de nascimento
    if pd.isnull(row['Age']):
        birth_year = row['Year'] - row['Age'] if not pd.isnull(row['Age']) else None
        if birth_year is None:
            # Usar a mediana da idade por esporte e ano, se existir
            median_age_sport_year = median_age_by_year_sport.get((row['Year'], row['Sport']), None)
            if pd.notnull(median_age_sport_year):
                row['Age'] = median_age_sport_year
            else:
                # Usar a mediana geral de idade
                row['Age'] = median_age_geral
    
    return row

# Aplicar a função em cada linha do dataframe
df = df.apply(preencher_valores, axis=1)


In [9]:
df.isnull().sum()

ID             0
Name           0
Sex            0
Age            0
Height         0
Weight         0
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
dtype: int64

In [11]:
# Transformar medalhas em valores numéricos
medal_mapping = {'Gold': 3, 'Silver': 2, 'Bronze': 1}
df['Medal'] = df['Medal'].map(medal_mapping)
df['Medal'].fillna(0, inplace=True)  # Preencher NaN (sem medalha) com 0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Medal'].fillna(0, inplace=True)  # Preencher NaN (sem medalha) com 0


# Random Forest

In [13]:
# Selecionar features e target
X = df[['Age', 'Height', 'Weight']]
y = df['Medal']

# Separar em dados de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Modelo Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Avaliação do Random Forest
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.91     69218
         1.0       0.11      0.02      0.03      4051
         2.0       0.15      0.02      0.04      4020
         3.0       0.17      0.04      0.06      4046

    accuracy                           0.84     81335
   macro avg       0.32      0.26      0.26     81335
weighted avg       0.75      0.84      0.78     81335



# Regressão Linear

In [14]:
# Modelo de Regressão Linear
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Avaliação da Regressão Linear
mse = mean_squared_error(y_test, y_pred_lr)
print(f"Mean Squared Error - Regressão Linear: {mse}")

Mean Squared Error - Regressão Linear: 0.6022097363919081


# Comparar

In [21]:
# Exibir previsões e comparar com valores reais
resultados = pd.DataFrame({'Real': y_test, 'RandomForest_Pred': y_pred_rf, 'LinearRegression_Pred': np.round(y_pred_lr)})
print(resultados)

        Real  RandomForest_Pred  LinearRegression_Pred
132959   0.0                0.0                    0.0
72709    3.0                1.0                    0.0
157328   0.0                0.0                    0.0
98898    3.0                0.0                    0.0
236772   0.0                0.0                    0.0
...      ...                ...                    ...
102359   0.0                0.0                    0.0
218501   0.0                0.0                    0.0
167395   0.0                0.0                    0.0
212611   0.0                0.0                    0.0
150514   0.0                0.0                    0.0

[81335 rows x 3 columns]
