# Random Forest Model using sklearn

In [24]:
# Import the pandas and NumPy libraries used to manipulate our data
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle

from sklearn.ensemble import RandomForestClassifier

# Metrics used to evaluate the end result
from sklearn.metrics import (accuracy_score, 
                            confusion_matrix,
                            mean_squared_error, 
                            r2_score, 
                            mean_absolute_error)
from sklearn.model_selection import cross_val_score

import dvc.api
import io

import sys
sys.path.append("../")
from Scripts.exploration import Analysis
from Scripts.cleaning import CleanDataFrame
from Scripts.visualization import Plotters

# Instantiate the project helpers imported from Scripts/ once, so that the
# cells below can share them.
analyzer, cleaner = Analysis(), CleanDataFrame()
plotter = Plotters(h=5, w=7)

## Steps to get the data

1. Make sure you are in sync with the latest main branch.
2. Run `dvc pull` to get the latest data versions.
3. In the next cell change the `version` and `path` to access the file you want.

You can find the file names by exploring the data folder.
To pick the version, use the following guide:

 - all files starting with `browser_` can be accessed starting from version=v1.1.1
 - all files starting with `brand_` can be accessed starting from version=v1.1.2
 - you can find the cleaned data with `device_make` converted to `brands` at version=v1.1
 - you can find the cleaned raw data at version=v1
 - you can find the raw data at version=v0
  

The files in the last three bullets (versions v1.1, v1 and v0) all share the same name, `AdSmartABdata.csv`.

In [21]:
# Which file to load, from which repository, and at which revision.
path = 'data/brand_generic.csv'
repo = "../"
version = 'v1.1.2'

# Read the file contents at the requested revision through DVC; the result is
# wrapped in a StringIO below so pandas can parse it as CSV text.
f = dvc.api.read(path=path,
                 repo=repo,
                 rev=version)

df = pd.read_csv(io.StringIO(f), sep=",")
# The stored CSV carries a leftover index column ('Unnamed: 0') that should be
# fixed in the next version of the files. errors='ignore' keeps this cell
# working once that column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   experiment   795 non-null    object
 1   hour         795 non-null    int64 
 2   platform_os  795 non-null    int64 
 3   browser      795 non-null    object
 4   day_of_week  795 non-null    object
 5   brand        795 non-null    object
 6   response     795 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 43.6+ KB


In [33]:
# Split the feature columns by kind using the project's cleaning helper.
cat_cols = cleaner.get_categorical_columns(df)

# Exclude the target column ('response') by name rather than by position, so
# the feature list stays correct even if the column order of the data changes.
num_cols = [col for col in cleaner.get_numerical_columns(df) if col != 'response']

display(cat_cols)
display(num_cols)

['experiment', 'browser', 'day_of_week', 'brand']

['hour', 'platform_os']

In [34]:
# Preprocessing step used by the model pipeline.
# TODO: move this into a script file as a helper, e.g.
#       get_col_transformer(num_cols=None, cat_cols=None) returning `preprocessor`.

# Numerical features are standardised.
numerical_transformer = Pipeline(steps=[('scale', StandardScaler())])

# Categorical features are one-hot encoded; categories unseen during fit are
# ignored at transform time, and each feature is capped at 15 output categories.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=15))]
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])



In [42]:
# Full model: preprocessing followed by a random forest classifier.
# random_state is fixed so that re-running the notebook reproduces the same
# forest (and therefore the same scores) instead of a different one each time.
my_pipeline = Pipeline(steps=[
                            ('preprocessor', preprocessor),
                            ('model', RandomForestClassifier(random_state=42))
                            ])

In [31]:
# Features are every column except the target; the target is 'response'.
X = df.drop(columns=['response'])
y = df['response'].values

# First hold out 30% of the rows, then split that holdout 75/25 into the
# validation and test sets. random_state is fixed on both calls so the same
# rows land in the same split on every run.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.25, random_state=42
)
print(f"Train shape: {X_train.shape}")
print(f"Valid shape: {X_valid.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (556, 6)
Valid shape: (179, 6)
Test shape: (60, 6)


In [43]:
# Fit the preprocessing steps and the classifier on the training split only.
# Pipeline.fit returns the fitted pipeline itself, so `clf` and `my_pipeline`
# refer to the same object.
clf = my_pipeline.fit(X_train, y_train)

In [47]:
# Predict on the validation split and score the predictions.
preds = clf.predict(X_valid)
# NOTE(review): r2_score is a regression metric applied here to classifier
# output, and its value is not displayed by this cell; confirm it is needed.
r2 = r2_score(y_true=y_valid, y_pred=preds)
acc = accuracy_score(y_true=y_valid, y_pred=preds)
acc

0.5251396648044693

In [49]:
# Mean absolute error between the validation labels and the predictions.
# NOTE(review): if 'response' only takes the values 0 and 1, this is just
# 1 - accuracy and adds no new information; confirm against the data.
mae = mean_absolute_error(y_true=y_valid, y_pred=preds)
mae

0.4748603351955307