# Processament de dades amb sklearn

Partirem de les dades de full_df que trobareu al directori /data. Carregau les dades d'allà a un nou dataframe.

Emprant Pipelines i ColumnTransformers transforma les dades seguint les seguents indicacions:

* No toquis user_id ni movie_id. Fica'ls directament al dataframe resultant al final de l'exercici.
* Els unics valors nuls els trobam a les columnes age, gender i occupation. Imputa per a cadascun d'ells el següent: 
    * age: Mitja aritmètica.
    * gender: Emplena amb el valor 'unknown'.
    * occupation: Valor 'none' (aquest ja el tenim a les dades originals.
* Tracta rating i age timestamp com a valors numèrics. Normalitza'ls emprant StandardScaler
* Transforma gender i occupation com a un característiques nominals. 

El resultat final serà un nou dataframe amb les característiques transformades que mantengui els noms de les columnes.

In [2]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = pd.read_csv('./data/full_df.csv',
                index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100003 entries, 0 to 100002
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   user_id     100003 non-null  int64  
 1   movie_id    100003 non-null  int64  
 2   rating      100003 non-null  int64  
 3   timestamp   100003 non-null  int64  
 4   age         100000 non-null  float64
 5   gender      100000 non-null  object 
 6   occupation  100000 non-null  object 
dtypes: float64(1), int64(4), object(2)
memory usage: 6.1+ MB


In [14]:
passthrough_cols = ['user_id', 'movie_id']

number_transformer = Pipeline(steps=[
    ("age_imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
gender_transformer = Pipeline(steps=[
    ("gender_imputer", SimpleImputer(strategy="constant", fill_value="none")),
    ("encoder", OneHotEncoder())
])
occupation_transformer = Pipeline(steps=[
    ("occupation_imputer", SimpleImputer(strategy="constant",  fill_value="unknown")),
    ("encoder", OneHotEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ('passthrough', 'passthrough', passthrough_cols), 
    ('number_transformer', number_transformer, ["age","timestamp","rating"]), 
    ('nominal_gender', gender_transformer, ["gender"]), 
    ('nominals_occupation', occupation_transformer, ["occupation"]), 
   
],
remainder='passthrough'  
)

In [23]:


pipe = Pipeline(steps=[('preprocessor', preprocessor)])
transformed_data = pipe.fit_transform(df).toarray()
df_transformed = pd.DataFrame(transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.0,50.0,0.000000,-0.426260,1.305976,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,172.0,0.000000,-0.426260,1.305976,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,133.0,0.000000,-0.426260,-2.247372,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,196.0,242.0,1.386404,-0.426260,-0.470698,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,186.0,302.0,0.521531,1.532433,-0.470698,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99998,880.0,476.0,-1.727138,-0.627524,-0.470698,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
99999,716.0,204.0,0.262069,-0.698616,1.305976,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100000,276.0,1090.0,-1.035240,-1.634239,-2.247372,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
100001,13.0,225.0,1.213430,-0.211392,-1.359035,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
feature_names = pipe.named_steps['preprocessor'].get_feature_names_out()
feature_names

array(['passthrough__user_id', 'passthrough__movie_id',
       'number_transformer__age', 'number_transformer__timestamp',
       'number_transformer__rating', 'nominal_gender__gender_F',
       'nominal_gender__gender_M', 'nominal_gender__gender_none',
       'nominals_occupation__occupation_administrator',
       'nominals_occupation__occupation_artist',
       'nominals_occupation__occupation_doctor',
       'nominals_occupation__occupation_educator',
       'nominals_occupation__occupation_engineer',
       'nominals_occupation__occupation_entertainment',
       'nominals_occupation__occupation_executive',
       'nominals_occupation__occupation_healthcare',
       'nominals_occupation__occupation_homemaker',
       'nominals_occupation__occupation_lawyer',
       'nominals_occupation__occupation_librarian',
       'nominals_occupation__occupation_marketing',
       'nominals_occupation__occupation_none',
       'nominals_occupation__occupation_other',
       'nominals_occupation__

In [25]:
transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

# Delete preprocessor prefix
transformed_df.columns = [col.split('__')[-1] for col in transformed_df.columns] 

transformed_df

Unnamed: 0,user_id,movie_id,age,timestamp,rating,gender_F,gender_M,gender_none,occupation_administrator,occupation_artist,...,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_unknown,occupation_writer
0,0.0,50.0,0.000000,-0.426260,1.305976,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,172.0,0.000000,-0.426260,1.305976,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,133.0,0.000000,-0.426260,-2.247372,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,196.0,242.0,1.386404,-0.426260,-0.470698,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,186.0,302.0,0.521531,1.532433,-0.470698,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99998,880.0,476.0,-1.727138,-0.627524,-0.470698,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
99999,716.0,204.0,0.262069,-0.698616,1.305976,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100000,276.0,1090.0,-1.035240,-1.634239,-2.247372,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
100001,13.0,225.0,1.213430,-0.211392,-1.359035,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# resultado final

0,1,2
,func,<function act...001712F4668E0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,


# Function Transformer

Analitza els distints valors que trobam a la característica occupation. Crea un FunctionTransformer al qual li passem l'occupation i ens crei una nova característica anomenada 'active', el qual sigui 1 si la persona està treballant i 0 si no treballa.

In [34]:
def active_ocupation(df):
    non_occupation_names = ["occupation_none","occupation_retired","occupation_student","occupation_writer"]
    df["active"] = (transformed_df[non_occupation_names].sum(axis=1) == 0).astype(int)
    
transformer_active_ocupation = FunctionTransformer(active_ocupation, validate=False)
transformer_active_ocupation

0,1,2
,func,<function act...001F0B9E80860>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,


In [35]:
transformer_active_ocupation.fit_transform(transformed_df)
transformed_df

Unnamed: 0,user_id,movie_id,age,timestamp,rating,gender_F,gender_M,gender_none,occupation_administrator,occupation_artist,...,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_unknown,occupation_writer,active
0,0.0,50.0,0.000000,-0.426260,1.305976,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0.0,172.0,0.000000,-0.426260,1.305976,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,0.0,133.0,0.000000,-0.426260,-2.247372,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,196.0,242.0,1.386404,-0.426260,-0.470698,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,186.0,302.0,0.521531,1.532433,-0.470698,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99998,880.0,476.0,-1.727138,-0.627524,-0.470698,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
99999,716.0,204.0,0.262069,-0.698616,1.305976,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
100000,276.0,1090.0,-1.035240,-1.634239,-2.247372,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
100001,13.0,225.0,1.213430,-0.211392,-1.359035,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
