# Zadanie rekrutacyjne - Data Scientist - 3Soft

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import random 
import multiprocessing as mp

## Zadanie 1
Napisz funkcję “dt_pred”, która w oparciu o drzewo decyzyjne na podstawie losowego podzbioru obserwacji oraz losowego zestawu atrybutów ze zbioru uczącego zwraca wytrenowane na podanym zbiorze treningowym drzewo decyzyjne. Odsetek obserwacji oraz odsetek atrybutów wykorzystywanych do uczenia drzewa powinny zostać sparametryzowane. Zadbaj o powtarzalność otrzymywanych wyników.

In [2]:
# Read the raw JSON dataset and forward-fill missing values
# (each gap takes the last valid value seen above it in the same column).
df_descriptive = (
    pd.read_json('data/descriptive_v2.json')
    .ffill()
)

In [30]:
# Shared configuration for the custom bagging ensemble.
#   DecisionTree: settings of every single tree (row/column fractions are
#                 consumed by `dt_pred`, the rest is forwarded to the tree).
#   RandomForest: size of the ensemble.
#   test_size:    absolute number of rows held out for evaluation.
params = dict(
    DecisionTree=dict(
        per_size_training_set=0.8,
        per_size_num_of_atribute=0.8,
        criterion="squared_error",
        max_depth=10,
        min_samples_split=4,
    ),
    RandomForest=dict(
        n_trees=100,
    ),
    test_size=150,
)

In [31]:
def dt_pred(X: pd.DataFrame, 
            y: pd.DataFrame,
            params: dict,
            seed: int=3):
    """Train a single decision tree on a random subset of rows and columns.

    The function is the weak learner of the bagging procedure: it draws a
    random subset of attributes, then a random subset of observations
    (without replacement), and fits a ``DecisionTreeRegressor`` on them.
    All randomness is derived from ``seed``, so the same arguments always
    produce the same tree. The global NumPy random state is left untouched.

    Parameters
    ----------
    X : pd.DataFrame
        Training attributes; columns are sampled by name.
    y : array-like
        Target values aligned row-by-row with ``X``.
    params : dict
        Configuration with a ``'DecisionTree'`` entry holding
        ``per_size_training_set`` (fraction of rows, in (0, 1]),
        ``per_size_num_of_atribute`` (fraction of columns, in (0, 1]),
        ``criterion``, ``max_depth`` and ``min_samples_split``.
    seed : int, default 3
        Seed used for column sampling, row sampling and the tree itself.

    Returns
    -------
    tuple
        ``(fitted_tree, feature_col)`` where ``feature_col`` is the array of
        column names the tree was trained on; the same columns must be
        selected from any frame passed to ``fitted_tree.predict``.

    Raises
    ------
    ValueError
        If either fraction lies outside the interval (0, 1].
    """
    tree_params = params['DecisionTree']
    row_fraction = tree_params['per_size_training_set']
    col_fraction = tree_params['per_size_num_of_atribute']

    # Security: a fraction of 0 would leave nothing to train on, so it is
    # rejected up front instead of failing later with an obscure error.
    if not 0 < row_fraction <= 1:
        raise ValueError("Feature `per_size_training_set` must have a value in the range (0, 1].")
    if not 0 < col_fraction <= 1:
        raise ValueError("Feature `per_size_num_of_atribute` must have a value in the range (0, 1].")

    # Data preparation
    # A private RandomState yields the same draw as seeding the global
    # generator, but does not disturb random state used by the caller.
    rng = np.random.RandomState(seed)
    # Always keep at least one attribute, even when the fraction rounds to 0.
    n_features = max(1, int(round(col_fraction * len(X.columns.values))))
    feature_col = rng.choice(X.columns.values, n_features, replace=False)
    X = X[feature_col]
    if row_fraction == 1:
        # train_test_split rejects train_size=1.0; use every observation.
        X_train, y_train = X, y
    else:
        # Only the training part is needed; the complement is discarded.
        X_train, _, y_train, _ = train_test_split(X,
                                                  y,
                                                  train_size=row_fraction,
                                                  random_state=seed)

    # Train model
    modelTree = DecisionTreeRegressor(criterion=tree_params['criterion'],
                                      max_depth=tree_params['max_depth'],
                                      min_samples_split=tree_params['min_samples_split'],
                                      random_state=seed)
    modelTree.fit(X_train, y_train)
    return modelTree, feature_col

## Zadanie 2
Następnie skonstruuj funkcję “dt_bagg”, która wykorzystywać będzie funkcję “dt_pred” (jako tzw. weak learner) w procedurze baggingu*. Pamiętaj o uwzględnieniu odpowiednich (hiper)parametrów tej funkcji. Jeżeli to możliwe, postaraj się zrównoleglić obliczenia.

In [32]:
class dt_bagg:
    """Bagging ensemble built from trees returned by ``dt_pred``.

    Typical use is ``prep_data()`` -> ``fit()`` -> ``predict()``; the results
    are stored on the instance rather than returned.

    Attributes set by the methods:
        X_train, X_test, y_train, y_test -- hold-out split (``prep_data``)
        trainedTree -- list of ``(fitted_tree, feature_columns)`` (``fit``)
        y_pred -- aggregated predictions for ``X_test`` (``predict``)
    """

    def __init__(self,
                data: pd.DataFrame, 
                params: dict, 
                seed: int=3):
        """Store the configuration and validate the ensemble size.

        Parameters
        ----------
        data : pd.DataFrame
            Full dataset; the target is read from the column ``'y'``.
        params : dict
            Configuration passed on to ``dt_pred``; additionally reads
            ``params['RandomForest']['n_trees']`` and ``params['test_size']``.
        seed : int, default 3
            Seed for the hold-out split and for deriving per-tree seeds.

        Raises
        ------
        ValueError
            If ``n_trees`` is smaller than 1.
        """
        # Security
        if params['RandomForest']['n_trees'] < 1:
            raise ValueError("Feature `n_trees` must have a integer greater or equal 1.")
        self.data = data
        self.params = params
        self.seed = seed

    def prep_data(self):
        """Split ``data`` into a training part and a hold-out test part."""
        X = self.data.drop(['y'], axis=1)
        y = self.data['y'].values
        # `test_size` is an absolute number of rows. Passing it as an integer
        # avoids the count -> fraction -> count round trip, where floating
        # point error could be rounded up to one extra test row.
        n_test = int(np.ceil(self.params['test_size']))
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X,
            y,
            test_size=n_test,
            random_state=self.seed
        )
        print('Data prepared!')

    def fit(self):
        """Train ``n_trees`` weak learners, each with its own derived seed."""
        # Prepare the split on demand so calling fit() first does not fail.
        if not hasattr(self, 'X_train'):
            self.prep_data()
        # A private RandomState yields the same seeds as seeding the global
        # generator would, without reseeding it here.
        seed_rng = np.random.RandomState(self.seed)
        list_of_seed = seed_rng.randint(0, 10000, int(self.params['RandomForest']['n_trees']))
        self.trainedTree = [
            dt_pred(self.X_train, self.y_train, self.params, dt_random_state)
            for dt_random_state in list_of_seed
        ]
        print('Models prepared!')

    def predict(self):
        """Predict ``X_test`` with every tree and aggregate by the median."""
        # Each tree only saw its own column subset, so select those columns
        # before calling predict.
        per_tree_pred = [
            model.predict(self.X_test[feature_col])
            for model, feature_col in self.trainedTree
        ]
        self.y_pred = np.median(per_tree_pred, axis=0)
        print('Predictions prepared!')


In [33]:
# Build the custom bagging ensemble on the loaded dataset with a fixed seed.
rf = dt_bagg(df_descriptive, params=params, seed=42)

In [34]:
# The three steps must run in this order: split -> train -> predict.
rf.prep_data()  # creates rf.X_train / rf.X_test / rf.y_train / rf.y_test
rf.fit()        # trains the trees, stored in rf.trainedTree
rf.predict()    # stores hold-out predictions in rf.y_pred

Data prepared!
Models prepared!
Predictions prepared!


In [35]:
def eval_metrics(actual, pred):
    """Compute regression quality metrics for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)`` -- root of the mean squared error, mean absolute
        error and the R^2 score, in that order.
    """
    return (
        np.sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

def print_values(rmse, mae, r2):
    """Print the three metrics, one per line, indented by two spaces."""
    report_lines = (
        f"  RMSE: {rmse}",
        f"  MAE: {mae}",
        f"  R2: {r2}",
    )
    print(*report_lines, sep="\n")

In [36]:
# Score the custom ensemble on the hold-out set prepared by rf.prep_data().
rmse, mae, r2 = eval_metrics(rf.y_test, rf.y_pred)
print_values(rmse, mae, r2)

  RMSE: 3.1911596623531184
  MAE: 2.1367559078945124
  R2: 0.3440814433860513


## Porównanie z BaggingRegressor

In [40]:
# Reference model: scikit-learn's BaggingRegressor configured from the same
# `params` as the custom ensemble, so both models are always compared under
# identical hyper-parameters instead of duplicated literals.
# Remaining differences to keep in mind when reading the scores:
#   * bootstrap=True samples rows with replacement, whereas `dt_pred`
#     subsamples rows without replacement via train_test_split;
#   * `dt_bagg.predict` aggregates the trees with the median.
seed = 42
bag_model = BaggingRegressor(
    # NOTE: `base_estimator` was renamed to `estimator` in scikit-learn 1.2
    # and removed in 1.4; rename the keyword when running on a newer version.
    base_estimator=DecisionTreeRegressor(criterion=params['DecisionTree']['criterion'], 
                                        max_depth=params['DecisionTree']['max_depth'],
                                        min_samples_split=params['DecisionTree']['min_samples_split'],
                                        random_state=seed), 
    n_estimators=params['RandomForest']['n_trees'], 
    max_samples=params['DecisionTree']['per_size_training_set'], 
    max_features=params['DecisionTree']['per_size_num_of_atribute'],
    bootstrap=True,
    oob_score=True,
    random_state=seed
)

In [41]:
# Train the reference model on exactly the same training split as `rf`.
bag_model.fit(rf.X_train, rf.y_train)

In [42]:
# Predict the same hold-out rows that the custom ensemble was scored on.
bag_pred = bag_model.predict(rf.X_test)

In [43]:
# Score the reference model.
# NOTE: this overwrites the rmse/mae/r2 values computed for `rf` earlier.
rmse, mae, r2 = eval_metrics(rf.y_test, bag_pred)
print_values(rmse, mae, r2)

  RMSE: 3.307862141765845
  MAE: 2.265944693091466
  R2: 0.2952296032323086
