# Processament de dades amb sklearn

Partirem de les dades de full_df que trobareu al directori /data. Carregau les dades d'allà a un nou dataframe.

Emprant Pipelines i ColumnTransformers transforma les dades seguint les seguents indicacions:

* No toquis user_id ni movie_id. Fica'ls directament al dataframe resultant al final de l'exercici.
* Els unics valors nuls els trobam a les columnes age, gender i occupation. Imputa per a cadascun d'ells el següent: 
    * age: Mitja aritmètica.
    * gender: Emplena amb el valor 'unknown'.
    * occupation: Valor 'none' (aquest ja el tenim a les dades originals.
* Tracta rating i age timestamp com a valors numèrics. Normalitza'ls emprant StandardScaler
* Transforma gender i occupation com a un característiques nominals. 

El resultat final serà un nou dataframe amb les característiques transformades que mantengui els noms de les columnes.

In [46]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = pd.read_csv('./data/full_df.csv',
                index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100003 entries, 0 to 100002
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   user_id     100003 non-null  int64  
 1   movie_id    100003 non-null  int64  
 2   rating      100003 non-null  int64  
 3   timestamp   100003 non-null  int64  
 4   age         100000 non-null  float64
 5   gender      100000 non-null  object 
 6   occupation  100000 non-null  object 
dtypes: float64(1), int64(4), object(2)
memory usage: 6.1+ MB


In [55]:
passthrough_cols = ['user_id', 'movie_id']

preprocessor = ColumnTransformer(transformers=[
    ('passthrough', 'passthrough', passthrough_cols), 
    ('mean_age', SimpleImputer(strategy="mean"), ['age']),
    ('unkown_gender', SimpleImputer(strategy="constant", fill_value='unknown'), ['gender']),
    ('none_occupation', SimpleImputer(strategy="constant", fill_value='none'), ['occupation']),
    ('scale_numerics', StandardScaler(), ["rating", "age", "timestamp"]),
    ('nominals', OneHotEncoder(), ["gender", "occupation"])
],
remainder='passthrough'  
)

In [52]:


pipe = Pipeline(steps=[('preprocessor', preprocessor)])
transformed_data = pipe.fit_transform(df)

In [53]:
feature_names = pipe.named_steps['preprocessor'].get_feature_names_out()
feature_names

array(['passthrough__user_id', 'passthrough__movie_id', 'mean_age__age',
       'unkown_gender__gender', 'none_occupation__occupation',
       'scale_numerics__rating', 'scale_numerics__age',
       'scale_numerics__timestamp', 'nominals__gender_F',
       'nominals__gender_M', 'nominals__gender_nan',
       'nominals__occupation_administrator',
       'nominals__occupation_artist', 'nominals__occupation_doctor',
       'nominals__occupation_educator', 'nominals__occupation_engineer',
       'nominals__occupation_entertainment',
       'nominals__occupation_executive',
       'nominals__occupation_healthcare',
       'nominals__occupation_homemaker', 'nominals__occupation_lawyer',
       'nominals__occupation_librarian', 'nominals__occupation_marketing',
       'nominals__occupation_none', 'nominals__occupation_other',
       'nominals__occupation_programmer', 'nominals__occupation_retired',
       'nominals__occupation_salesman', 'nominals__occupation_scientist',
       'nominals__occ

In [59]:
transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

# Delete preprocessor prefix
transformed_df.columns = [col.split('__')[-1] for col in transformed_df.columns] 

transformed_df

Unnamed: 0,user_id,movie_id,age,gender,occupation,rating,age.1,timestamp,gender_F,gender_M,...,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,occupation_nan
0,0,50,32.96985,unknown,none,1.305976,,-0.42626,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,172,32.96985,unknown,none,1.305976,,-0.42626,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,133,32.96985,unknown,none,-2.247372,,-0.42626,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,196,242,49.0,M,writer,-0.470698,1.386383,-0.42626,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,186,302,39.0,F,executive,-0.470698,0.521524,1.532433,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99998,880,476,13.0,M,student,-0.470698,-1.727112,-0.627524,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
99999,716,204,36.0,F,administrator,1.305976,0.262066,-0.698616,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100000,276,1090,21.0,M,student,-2.247372,-1.035224,-1.634239,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
100001,13,225,47.0,M,educator,-1.359035,1.213412,-0.211392,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# resultado final

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender_F,gender_M,gender_unknown,occupation_administrator,occupation_artist,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,0.0,50.0,1.305976,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,172.0,1.305976,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,133.0,-2.247372,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,196.0,242.0,-0.470698,-0.426260,1.386404,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,186.0,302.0,-0.470698,1.532433,0.521531,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99998,880.0,476.0,-0.470698,-0.627524,-1.727138,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99999,716.0,204.0,1.305976,-0.698616,0.262069,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100000,276.0,1090.0,-2.247372,-1.634239,-1.035240,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
100001,13.0,225.0,-1.359035,-0.211392,1.213430,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Function Transformer

Analitza els distints valors que trobam a la característica occupation. Crea un FunctionTransformer al qual li passem l'occupation i ens crei una nova característica anomenada 'active', el qual sigui 1 si la persona està treballant i 0 si no treballa.