### Questão 7

In [1]:
# A. Vectorize the categorical variables using One-hot Encoding. Present the results obtained.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

file_path = '../db/powerlifting/openpowerlifting.csv'
categorical_cols = ['Sex', 'Equipment', 'Division']

# Read only the categorical columns of interest and discard every row that is
# missing a value in any of them.
df = pd.read_csv(file_path, usecols=categorical_cols).dropna(subset=categorical_cols)

# The first five complete rows are enough to show the encoding in full.
df_sample = df.iloc[:5].reset_index(drop=True)

print("original:")
print(df_sample)

# Dense integer matrix: one 0/1 column per category observed in the sample.
encoder = OneHotEncoder(sparse_output=False, dtype=int)
onehot_encoded = encoder.fit_transform(df_sample)

# Column labels follow the "<feature>_<category>" pattern produced by the encoder.
onehot_columns = encoder.get_feature_names_out(categorical_cols)
onehot_df = pd.DataFrame(data=onehot_encoded, columns=onehot_columns)

print("\nOne-Hot Encoding:")
onehot_df

original:
  Sex   Equipment     Division
0   F       Wraps    Mst 45-49
1   F  Single-ply    Mst 40-44
2   F  Single-ply  Open Senior
3   F         Raw  Open Senior
4   F         Raw   Teen 18-19

One-Hot Encoding:


Unnamed: 0,Sex_F,Equipment_Raw,Equipment_Single-ply,Equipment_Wraps,Division_Mst 40-44,Division_Mst 45-49,Division_Open Senior,Division_Teen 18-19
0,1,0,0,1,0,1,0,0
1,1,0,1,0,1,0,0,0
2,1,0,1,0,0,0,1,0
3,1,1,0,0,0,0,1,0
4,1,1,0,0,0,0,0,1


In [None]:
# B. Vectorize the categorical variables using Dummy Coding. Compare the results of this vectorization with those obtained in item (a).
dummy_cols = ['Sex', 'Equipment', 'Division']

# NOTE(review): this cell reads `df_sample` from an earlier cell; the Questão 8
# cell rebinds that name to a 10,000-row frame, so the result depends on run order.

# drop='first' omits the first category of every feature, which then acts as the
# implicit reference level (all zeros). A feature with a single category in the
# sample therefore contributes no column at all.
encoder_dummy = OneHotEncoder(sparse_output=False, drop='first', dtype=int)
dummy_encoded = encoder_dummy.fit_transform(df_sample)

dummy_feature_names = encoder_dummy.get_feature_names_out(dummy_cols)
dummy_df = pd.DataFrame(data=dummy_encoded, columns=dummy_feature_names)

print("\nDummy Coding (sklearn drop='first'):")
dummy_df


Dummy Coding (sklearn drop='first'):


Unnamed: 0,Equipment_Single-ply,Equipment_Wraps,Division_Mst 45-49,Division_Open Senior,Division_Teen 18-19
0,0,1,1,0,0
1,1,0,0,0,0
2,1,0,0,1,0
3,0,0,0,1,0
4,0,0,0,0,1


### Questão 8

In [39]:
# A. Still considering the powerlifting database.
from sklearn.feature_extraction import FeatureHasher

# Take the first 10,000 complete rows of the categorical columns.
# Note: this rebinds `df_sample`, which earlier cells used for a 5-row sample.
df_sample = df.iloc[:10000].reset_index(drop=True)

# i. Compact the categorical features of the dataset using Feature Hashing.
hash_input_cols = ['Sex', 'Equipment', 'Division']

# One {column: value} dict per row, with every value cast to str.
cat_features = df_sample.loc[:, hash_input_cols].astype(str)
data_dicts = cat_features.to_dict(orient='records')

# Hash every record into a fixed-length vector of 8 dimensions. With so few
# buckets, several categories can land in the same position.
hasher = FeatureHasher(n_features=8, input_type='dict')
hashed_features = hasher.fit_transform(data_dicts)

# Densify the sparse result so it can be shown as a table.
hashed_df = pd.DataFrame(data=hashed_features.toarray())
print("Feature Hashing:")
hashed_df

Feature Hashing:


Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...
9995,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,1.0
9996,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
9997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
9998,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [40]:
# ii. Compact the categorical features of the dataset using Bin Counting.
bin_count_cols = ['Sex', 'Equipment', 'Division']

# Pooled frequency table over all three columns, kept exactly as before for
# reference. Caveat: because the columns are stacked into a single Series, a
# label that occurs in more than one column gets one merged count here.
all_cats = pd.concat([df_sample[col] for col in bin_count_cols])
cat_counts = all_cats.value_counts()

# Count each category within its own column so labels never mix across features.
counts_by_column = {col: df_sample[col].value_counts() for col in bin_count_cols}

# Bin-counted representation: every row keeps one numeric value per original
# feature, namely how often that row's category occurs in the sample. This is the
# compact per-row encoding; the frequency table alone does not encode any row.
bin_counted_df = pd.DataFrame({
    f'{col}_count': df_sample[col].map(counts_by_column[col])
    for col in bin_count_cols
})

print("\nBin Counting (frequência das categorias):")
display(cat_counts)
bin_counted_df


Bin Counting (frequência das categorias):


M               7521
Raw             5324
Wraps           2962
F               2479
Open            1053
                ... 
M-E-JNR            1
Untested           1
Youth 12 yrs       1
Youth 8 yrs        1
Military Vet       1
Name: count, Length: 1128, dtype: int64