In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer


In [2]:
# Read the cleaned exercise dataset; the path is relative to the notebook's
# working directory.
dataset_path = '../dataset/megaGymDataset_clean.csv'
df_gym = pd.read_csv(dataset_path)

# Work on an independent copy so `df_gym` keeps the original values and can be
# compared against the preprocessed frame later on.
df_gym_clean = df_gym.copy(deep=True)

- Una vez cargado nuestro dataset, vamos a proceder a preprocesar nuestros datos para poder entrenar nuestro modelo

In [3]:
# Preview the first five rows of the dataset as loaded (5 is head()'s default).
df_gym.head()

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating
0,Partner plank band row,The partner plank band row is an abdominal exe...,Strength,Abdominals,Bands,Intermediate,0.0
1,Banded crunch isometric hold,The banded crunch isometric hold is an exercis...,Strength,Abdominals,Bands,Intermediate,5.91969
2,FYR Banded Plank Jack,The banded plank jack is a variation on the pl...,Strength,Abdominals,Bands,Intermediate,5.91969
3,Banded crunch,The banded crunch is an exercise targeting the...,Strength,Abdominals,Bands,Intermediate,5.91969
4,Crunch,The crunch is a popular core exercise targetin...,Strength,Abdominals,Bands,Intermediate,5.91969


- Le haremos las siguientes modificaciones a nuestro dataset :

    - Tokenizar las columnas de texto *'Title'* y *'Desc'*

    - Clasificar numéricamente columnas como *'Type'* , *'BodyPart'* , *'Equipment'* o *'Level'*

    - Redondear la columna *'Rating'* y convertirla a entero

**TOKENIZAR COLUMNA `Desc`**

In [4]:
# Load the pretrained 'bert-base-uncased' tokenizer; it is reused by the
# following cell to tokenize the 'Title' column.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every description into a list of token ids, including the special
# tokens the tokenizer adds around each sequence.
# The raw text is read from the untouched `df_gym` frame instead of
# `df_gym_clean`: this cell overwrites `df_gym_clean['Desc']`, so reading from
# it would feed already-tokenized id lists back into the tokenizer whenever
# the cell is executed a second time.
tokenized_descs = [
    tokenizer.encode(desc, add_special_tokens=True)
    for desc in df_gym['Desc']
]

# Same row order and length as `df_gym`, so positional assignment is aligned.
df_gym_clean['Desc'] = tokenized_descs




**TOKENIZAR COLUMNA `Title`**

In [5]:
# Encode every exercise title into a list of token ids, including the special
# tokens the tokenizer adds around each sequence.
# The raw titles come from the untouched `df_gym` frame: this cell overwrites
# `df_gym_clean['Title']`, so reading from it would re-encode token-id lists
# if the cell were executed again.
tokenized_titles = [
    tokenizer.encode(title, add_special_tokens=True)
    for title in df_gym['Title']
]

# Same row order and length as `df_gym`, so positional assignment is aligned.
df_gym_clean['Title'] = tokenized_titles

In [6]:
# Inspect the first ten rows after tokenizing 'Title' and 'Desc'.
df_gym_clean.head(n=10)

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating
0,"[101, 4256, 24000, 2316, 5216, 102]","[101, 1996, 4256, 24000, 2316, 5216, 2003, 201...",Strength,Abdominals,Bands,Intermediate,0.0
1,"[101, 25264, 24514, 11163, 12589, 2907, 102]","[101, 1996, 25264, 24514, 11163, 12589, 2907, ...",Strength,Abdominals,Bands,Intermediate,5.91969
2,"[101, 1042, 12541, 25264, 24000, 2990, 102]","[101, 1996, 25264, 24000, 2990, 2003, 1037, 83...",Strength,Abdominals,Bands,Intermediate,5.91969
3,"[101, 25264, 24514, 102]","[101, 1996, 25264, 24514, 2003, 2019, 6912, 14...",Strength,Abdominals,Bands,Intermediate,5.91969
4,"[101, 24514, 102]","[101, 1996, 24514, 2003, 1037, 2759, 4563, 691...",Strength,Abdominals,Bands,Intermediate,5.91969
5,"[101, 6689, 2316, 2811, 4133, 1011, 2039, 102]","[101, 1996, 6689, 2316, 2811, 4133, 1011, 2039...",Strength,Abdominals,Bands,Intermediate,5.91969
6,"[101, 1042, 12541, 2475, 25264, 10729, 10216, ...","[101, 2053, 4078, 2278, 102]",Strength,Abdominals,Bands,Intermediate,5.91969
7,"[101, 2316, 2659, 1011, 2000, 1011, 2152, 9792...","[101, 1996, 2316, 2659, 1011, 2000, 1011, 2152...",Strength,Abdominals,Bands,Intermediate,5.91969
8,"[101, 3347, 17327, 4897, 1011, 2041, 102]","[101, 1996, 3347, 17327, 4897, 1011, 2041, 200...",Strength,Abdominals,Barbell,Intermediate,8.9
9,"[101, 3347, 17327, 11113, 4897, 5833, 1011, 20...","[101, 1996, 3347, 17327, 4897, 1011, 2041, 200...",Strength,Abdominals,Barbell,Intermediate,8.9


- CLASIFICAR VARIABLES CATEGÓRICAS :

    - Columna *'Type'*

    - Columna *'BodyPart'*

    - Columna *'Equipment'*
    
    - Columna *'Level'*


In [7]:
# Show the distinct values of each categorical column before they are encoded.
for column_name in ('Type', 'BodyPart', 'Equipment', 'Level'):
    distinct_values = df_gym_clean[column_name].unique()
    print(f"Los valores de la columna {column_name}  son : {distinct_values}")

Los valores de la columna Type  son : ['Strength' 'Plyometrics' 'Cardio' 'Stretching' 'Powerlifting' 'Strongman'
 'Olympic Weightlifting']
Los valores de la columna BodyPart  son : ['Abdominals' 'Adductors' 'Abductors' 'Biceps' 'Calves' 'Chest' 'Forearms'
 'Glutes' 'Hamstrings' 'Lats' 'Lower Back' 'Middle Back' 'Traps' 'Neck'
 'Quadriceps' 'Shoulders' 'Triceps']
Los valores de la columna Equipment  son : ['Bands' 'Barbell' 'Kettlebells' 'Dumbbell' 'Other' 'Cable' 'Machine'
 'Body Only' 'Medicine Ball' 'Exercise Ball' 'Foam Roll' 'E-Z Curl Bar']
Los valores de la columna Level  son : ['Intermediate' 'Beginner' 'Expert']


In [8]:
# Fit one LabelEncoder per categorical column and keep each of them.
# A single shared encoder is overwritten by every `fit_transform` call, so it
# would only remember the classes of the last column and the integer codes of
# the other columns could not be mapped back to their labels with
# `inverse_transform`.
label_encoders = {}
for column in ('Type', 'BodyPart', 'Equipment', 'Level'):
    encoder = LabelEncoder()
    # Fit on the untouched `df_gym` column so that re-running this cell still
    # learns the original string labels instead of the integer codes that are
    # written into `df_gym_clean` below.
    df_gym_clean[column] = encoder.fit_transform(df_gym[column])
    label_encoders[column] = encoder

# Backward-compatible name: as before, `label_encoder` ends up fitted on the
# last encoded column ('Level').
label_encoder = label_encoders['Level']

In [9]:
# Show the integer codes now stored in each encoded categorical column.
for column_name in ('Type', 'BodyPart', 'Equipment', 'Level'):
    encoded_values = df_gym_clean[column_name].unique()
    print(f"Los valores de la columna {column_name} ahora son : {encoded_values}")

Los valores de la columna Type ahora son : [4 2 0 5 3 6 1]
Los valores de la columna BodyPart ahora son : [ 0  2  1  3  4  5  6  7  8  9 10 11 15 12 13 14 16]
Los valores de la columna Equipment ahora son : [ 0  1  8  4 11  3  9  2 10  6  7  5]
Los valores de la columna Level ahora son : [2 0 1]


- Los valores de la columna *'Type'* ahora son : 

    - Strength : 4

    - Plyometrics : 2

    - Cardio : 0

    - Stretching : 5

    - Powerlifting : 3

    - Strongman : 6
    
    - Olympic Weightlifting : 1

- Los valores de la columna *'BodyPart'* ahora son : 

    - Abdominals : 0

    - Adductors : 2

    - Abductors : 1

    - Biceps : 3

    - Calves : 4

    - Chest : 5
    
    - Forearms : 6

    - Glutes : 7

    - Hamstrings : 8

    - Lats : 9

    - Lower Back : 10

    - Middle Back : 11

    - Traps : 15

    - Neck : 12

    - Quadriceps : 13

    - Shoulders : 14

    - Triceps : 16

- Los valores de la columna *'Equipment'* ahora son : 

    - Bands : 0

    - Barbell : 1

    - Kettlebells : 8

    - Dumbbell : 4

    - Other : 11

    - Cable : 3
    
    - Machine : 9

    - Body Only : 2

    - Medicine Ball : 10

    - Exercise Ball : 6

    - Foam Roll : 7

    - E-Z Curl Bar : 5

- Los valores de la columna *'Level'* ahora son : 

    - Intermediate : 2

    - Beginner : 0

    - Expert : 1

- Redondeamos los valores de la columna *'Rating'* y los convertimos de Float a Int 

In [10]:
# Round each rating to a whole number, then store the column as integers.
rating_rounded = df_gym_clean['Rating'].round()
df_gym_clean['Rating'] = rating_rounded.astype(int)


- Comparamos el dataset inicial sin modificar y el dataset preprocesado para poder entrenar nuestro modelo

In [11]:
# First five rows of the unmodified frame, shown for comparison with the
# preprocessed one (5 is head()'s default).
df_gym.head()

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating
0,Partner plank band row,The partner plank band row is an abdominal exe...,Strength,Abdominals,Bands,Intermediate,0.0
1,Banded crunch isometric hold,The banded crunch isometric hold is an exercis...,Strength,Abdominals,Bands,Intermediate,5.91969
2,FYR Banded Plank Jack,The banded plank jack is a variation on the pl...,Strength,Abdominals,Bands,Intermediate,5.91969
3,Banded crunch,The banded crunch is an exercise targeting the...,Strength,Abdominals,Bands,Intermediate,5.91969
4,Crunch,The crunch is a popular core exercise targetin...,Strength,Abdominals,Bands,Intermediate,5.91969


In [12]:
# First twenty rows of the preprocessed frame: tokenized text columns,
# encoded categorical columns and integer ratings.
df_gym_clean.head(n=20)

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating
0,"[101, 4256, 24000, 2316, 5216, 102]","[101, 1996, 4256, 24000, 2316, 5216, 2003, 201...",4,0,0,2,0
1,"[101, 25264, 24514, 11163, 12589, 2907, 102]","[101, 1996, 25264, 24514, 11163, 12589, 2907, ...",4,0,0,2,6
2,"[101, 1042, 12541, 25264, 24000, 2990, 102]","[101, 1996, 25264, 24000, 2990, 2003, 1037, 83...",4,0,0,2,6
3,"[101, 25264, 24514, 102]","[101, 1996, 25264, 24514, 2003, 2019, 6912, 14...",4,0,0,2,6
4,"[101, 24514, 102]","[101, 1996, 24514, 2003, 1037, 2759, 4563, 691...",4,0,0,2,6
5,"[101, 6689, 2316, 2811, 4133, 1011, 2039, 102]","[101, 1996, 6689, 2316, 2811, 4133, 1011, 2039...",4,0,0,2,6
6,"[101, 1042, 12541, 2475, 25264, 10729, 10216, ...","[101, 2053, 4078, 2278, 102]",4,0,0,2,6
7,"[101, 2316, 2659, 1011, 2000, 1011, 2152, 9792...","[101, 1996, 2316, 2659, 1011, 2000, 1011, 2152...",4,0,0,2,6
8,"[101, 3347, 17327, 4897, 1011, 2041, 102]","[101, 1996, 3347, 17327, 4897, 1011, 2041, 200...",4,0,1,2,9
9,"[101, 3347, 17327, 11113, 4897, 5833, 1011, 20...","[101, 1996, 3347, 17327, 4897, 1011, 2041, 200...",4,0,1,2,9


In [13]:
# Export of the preprocessed frame; left disabled on purpose.
# NOTE(review): 'Title' and 'Desc' hold Python lists at this point, so a CSV
# export would presumably store them as text — confirm that whatever reads the
# file parses them back into lists.
# NOTE(review): the path was made relative, mirroring the '../dataset/' folder
# used by read_csv, instead of an absolute home-directory path — confirm this
# is the intended output location.
# df_gym_clean.to_csv('../dataset/megaGymDataset_final.csv', index=False)