# Processament de dades amb sklearn

Partirem de les dades de full_df que trobareu al directori /data. Carregau les dades d'allà a un nou dataframe.

Emprant Pipelines i ColumnTransformers, transforma les dades seguint les següents indicacions:

* No toquis user_id ni movie_id. Fica'ls directament al dataframe resultant al final de l'exercici.
* Els únics valors nuls els trobam a les columnes age, gender i occupation. Imputa per a cadascun d'ells el següent:
    * age: Mitjana aritmètica.
    * gender: Emplena amb el valor 'unknown'.
    * occupation: Valor 'none' (aquest ja el tenim a les dades originals).
* Tracta rating, age i timestamp com a valors numèrics. Normalitza'ls emprant StandardScaler.
* Transforma gender i occupation com a característiques nominals.

El resultat final serà un nou dataframe amb les característiques transformades que mantengui els noms de les columnes.

In [485]:
# Imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [486]:
# Read the raw ratings/users table; the first CSV column holds the row index.
original_df = pd.read_csv(filepath_or_buffer='./data/full_df.csv', index_col=0)

# Keep the loaded frame untouched and do all the work on an independent copy.
df = original_df.copy(deep=True)
df

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation
0,0,50,5,881250949,,,
1,0,172,5,881250949,,,
2,0,133,1,881250949,,,
3,196,242,3,881250949,49.0,M,writer
4,186,302,3,891717742,39.0,F,executive
...,...,...,...,...,...,...,...
99998,880,476,3,880175444,13.0,M,student
99999,716,204,5,879795543,36.0,F,administrator
100000,276,1090,1,874795795,21.0,M,student
100001,13,225,2,882399156,47.0,M,educator


In [487]:
def is_active(df, inactive=('none',)):
    """Build the binary ``active`` feature from the ``occupation`` column.

    Args:
        df: DataFrame containing an ``occupation`` column.
        inactive: Occupation label, or iterable of labels, that mean the
            person is not working. Matching is case-insensitive. The default
            keeps the original behaviour (only ``'none'`` is inactive); pass
            e.g. ``('none', 'retired', 'student')`` through
            ``FunctionTransformer(kw_args={'inactive': ...})`` to widen it.

    Returns:
        A single-column DataFrame named ``active``, indexed like ``df``,
        holding 1 for working people and 0 otherwise. Missing occupations
        are always 0.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(inactive, str):
        inactive = (inactive,)
    not_working = {str(label).lower() for label in inactive}

    occupation = df['occupation']
    normalised = occupation.astype(str).str.lower()
    # Missing values count as "not working" regardless of the label set.
    working = occupation.notna() & ~normalised.isin(not_working)
    return pd.DataFrame({'active': working.astype(int)})


In [488]:
# Null values only occur in ['age', 'occupation', 'gender'].

# Numeric features: fill gaps with the column mean, then standardise.
integer_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal features: fill gaps with a fixed label, then one-hot encode.
gender_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('encoder', OneHotEncoder()),
])

occupation_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='none')),
    ('encoder', OneHotEncoder()),
])

# The order of these entries is the column order of the transformed output;
# 'occupation' is consumed twice (one-hot encoding and the 'active' flag).
preprocessor = ColumnTransformer([
    ('int_scaler', integer_transformer, ['rating', 'timestamp', 'age']),
    ('gender', gender_transformer, ['gender']),
    ('occupation', occupation_transformer, ['occupation']),
    ('active', FunctionTransformer(func=is_active), ['occupation']),
])

pipeline = Pipeline([('preprocessor', preprocessor)])


In [489]:
# ColumnTransformer only returns a sparse matrix when the stacked output's
# density is below its sparse_threshold; otherwise the result is already a
# dense ndarray, which has no .toarray(). Densify only when needed.
transformed = pipeline.fit_transform(df)
if hasattr(transformed, 'toarray'):
    transformed = transformed.toarray()

In [490]:
# Wrap the dense feature matrix in a DataFrame (default integer column labels).
df_transformed = pd.DataFrame(data=transformed)
df_transformed


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1.305976,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.305976,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.247372,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.470698,-0.426260,1.386404,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,-0.470698,1.532433,0.521531,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99998,-0.470698,-0.627524,-1.727138,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
99999,1.305976,-0.698616,0.262069,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
100000,-2.247372,-1.634239,-1.035240,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
100001,-1.359035,-0.211392,1.213430,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [491]:
# Collect the output feature names in the same order the ColumnTransformer
# stacks its blocks: scaled numerics, gender one-hot, occupation one-hot.
fitted_steps = pipeline.named_steps['preprocessor'].named_transformers_
name_parts = [
    fitted_steps[step_name].get_feature_names_out()
    for step_name in ('int_scaler', 'gender', 'occupation')
]
# The FunctionTransformer step exposes no feature names, so add it by hand.
name_parts.append(['active'])
transformed_columns = np.concatenate(name_parts)

transformed_columns.size

28

In [492]:
df_transformed.columns = transformed_columns
# Insert the untouched id columns by position. Passing the Series itself
# would align on index labels, and df_transformed carries a fresh RangeIndex
# while original_df keeps the index read from the CSV; any mismatch would
# produce NaN ids or a reindexing error.
df_transformed.insert(0, 'user_id', original_df['user_id'].to_numpy())
df_transformed.insert(1, 'movie_id', original_df['movie_id'].to_numpy())
df_transformed

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender_F,gender_M,gender_unknown,occupation_administrator,occupation_artist,...,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,active
0,0,50,1.305976,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,172,1.305976,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,133,-2.247372,-0.426260,0.000000,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,196,242,-0.470698,-0.426260,1.386404,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,186,302,-0.470698,1.532433,0.521531,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99998,880,476,-0.470698,-0.627524,-1.727138,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
99999,716,204,1.305976,-0.698616,0.262069,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
100000,276,1090,-2.247372,-1.634239,-1.035240,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
100001,13,225,-1.359035,-0.211392,1.213430,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Function Transformer

Analitza els distints valors que trobam a la característica occupation. Crea un FunctionTransformer al qual li passem l'occupation i ens crei una nova característica anomenada 'active', el qual sigui 1 si la persona està treballant i 0 si no treballa.