# Generic training

## Define generic transformers

1. **Dataframe selector**: Select only columns with desired types in order to apply the correct transformations

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

In [2]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, types):
        self.types = types

    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        return x.select_dtypes(include=self.types).values

2. **Binarizer**: Transfor categorical attr to a numeric encoding so algorithm can deal with it

In [47]:
class Binarizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None,**fit_params):
        return self
        
    def transform(self, X):
        if X.shape[1] == 0:
            return X
        return LabelBinarizer().fit_transform(X)

## Define the pipelines

In [4]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [49]:
num_pipeline = Pipeline([ # Execute all fit_transform sequentially
    ('selector', DataFrameSelector(["int", "float"])), # Select numerical attributes
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([ # Execute all fit_transform sequentially
    ('selector', DataFrameSelector(["object"])), # Select categorical attributes
    ('label_binarizer', Binarizer())
])

# Fusion pipelines
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

## Test with diferent datasets 

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.metrics import accuracy_score, mean_squared_error

In [12]:
def get_train_test(csv_path, target, test_size=0.2):
        df = pd.read_csv(csv_path)

        X = df.drop(target, axis=1).copy()
        y = df[target].copy()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        return X_train, y_train, X_test, y_test

#### Train diferents models

In [28]:
def train_model(X, y, model_name):
    print("Training {}...".format(model_name))
    
    model = eval(model_name + "()", globals(), locals())
    model.fit(X, y)
    
    print("Done")

    return model

#### Evaluate the models

In [55]:
def evaluate_model(model, X_test, y_test, regression=True):
    """Evaluate model and prints its results
    
    Arguments:
        model {Sklearn model} -- Trained model to be evaluated
        X_test {Dataframe} -- Test data
        y_test {Dataframe} -- Test labels
    """
    y_pred = model.predict(X_test)
    
    if regression:

        final_mse = mean_squared_error(y_test, y_pred)
        final_rmse = np.sqrt(final_mse)
        print(model.__class__.__name__ + " -> "  + "MSE:", final_mse, "RMSE:", final_rmse)
    else:
        print(model.__class__.__name__ + " -> "  + "Accuracy:", accuracy_score(y_test, y_pred))



### Housing dataset (Regression)

In [13]:
X, y, X_test, y_test = get_train_test("datasets\\housing.csv", "median_house_value")

In [14]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND


In [20]:
X_prepared = full_pipeline.fit_transform(X)
X_test_prepared = full_pipeline.fit_transform(X_test)

In [38]:
algorithms = ["LinearRegression", "RandomForestRegressor", "ElasticNet"]
for a in algorithms:
    model = train_model(X_prepared, y, a)
    evaluate_model(model, X_test_prepared, y_test)

Training LinearRegression...
Done
LinearRegression -> MSE: 4917261640.702485 RMSE: 70123.1890368834
Training RandomForestRegressor...




Done
RandomForestRegressor -> MSE: 4108669032.8719044 RMSE: 64098.900402985884
Training ElasticNet...
Done
ElasticNet -> MSE: 6117481096.988762 RMSE: 78214.32795203678


### Iris Dataset

In [39]:
X, y, X_test, y_test = get_train_test("datasets\\iris.csv", "Species")

In [40]:
X.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
22,23,4.6,3.6,1.0,0.2
15,16,5.7,4.4,1.5,0.4
65,66,6.7,3.1,4.4,1.4
11,12,4.8,3.4,1.6,0.2
42,43,4.4,3.2,1.3,0.2


In [50]:
X_prepared = full_pipeline.fit_transform(X)
X_test_prepared = full_pipeline.fit_transform(X_test)

In [57]:
algorithms = ["DecisionTreeClassifier", "RandomForestClassifier"]
for a in algorithms:
    model = train_model(X_prepared, y, a)
    evaluate_model(model, X_test_prepared, y_test, False)

Training DecisionTreeClassifier...
Done
DecisionTreeClassifier -> Accuracy: 0.9666666666666667
Training RandomForestClassifier...
Done
RandomForestClassifier -> Accuracy: 0.9333333333333333


