# Design Pattern 8: Cascade

> Aborda situações em que um problema de ML pode ser dividido, de maneira vantajosa, em uma série de problemas de ML.

### Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')

### Base de dados

- distance_meters: distância percorrida, em metros
- month_rental, day_rented, hour_rented: mês, dia e hora em que a pessoa alugou
- duration_hour: tempo de duração do aluguel
- temp, wind e humid: condições climáticas (temperatura, vento e umidade) no dia do aluguel

In [2]:
# Load the bike-trip dataset from a path relative to the notebook.
df = pd.read_csv('data/seoul_bike_trip.csv')

# Report the frame's dimensions, then preview its first rows.
n_rows, n_cols = df.shape
print(f'Linhas: {n_rows} | Colunas: {n_cols}')
df.head()

Linhas: 10000 | Colunas: 10


Unnamed: 0,duration_hour,distance_meters,month_rental,day_rented,hour_rented,temp,wind,humid,distance_type,rental
0,8,2,12,11,5,-2.1,0.5,39.0,short,typical
1,16,2,12,14,23,-5.2,0.2,47.0,short,typical
2,22,4,12,7,13,-5.8,5.4,20.0,short,typical
3,10,5,12,13,16,0.5,4.2,35.0,short,typical
4,4,10,6,1,19,26.8,1.9,35.0,short,typical


### Passo 1: Modelo de classificação para definir a distância (curta ou longa)

In [3]:
# Both label columns are removed from the features; only 'distance_type'
# is used as the target of this first (upstream) classifier.
label_columns = ['distance_type', 'rental']
y_dist = df['distance_type']
X_dist = df.drop(columns=label_columns)

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train_dist, X_test_dist, y_train_dist, y_test_dist = train_test_split(
    X_dist,
    y_dist,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Upstream model of the cascade: scale the features, then classify the
# distance type with a seeded random forest.
# NOTE(review): the first step is keyed 'vectorizer' although it holds a
# StandardScaler; the key is kept unchanged here.
classify_steps = [
    ('vectorizer', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]

# fit() returns the pipeline itself, so the fitted estimator is bound directly
# and the assignment keeps the cell from echoing the estimator repr.
pipeline_classify = Pipeline(steps=classify_steps).fit(X_train_dist, y_train_dist)

### Passo 2: Utilizando as previsões do modelo de classificação para criar os datasets de treinamento dos modelos seguintes

In [5]:
# Predict the distance class for the held-out rows of the first split.
predictions = pipeline_classify.predict(X_test_dist)

# The prediction Series gets a fresh 0..n-1 index, so the features are
# re-indexed the same way to make the column-wise concat line up row by row.
features_reindexed = X_test_dist.reset_index(drop=True)
predicted_column = pd.Series(predictions, name='Predictions')
X_test_concatenated = pd.concat([features_reindexed, predicted_column], axis=1)

# sample() is unseeded here, so the displayed row differs between runs.
X_test_concatenated.sample()

Unnamed: 0,duration_hour,distance_meters,month_rental,day_rented,hour_rented,temp,wind,humid,Predictions
160,2,160,6,7,21,21.5,2.6,69.0,short


In [6]:
# Derive the target for the "typical" downstream model from the upstream
# prediction: predicted 'short' -> 'typical', predicted 'long' -> 'no_typical'.
# NOTE(review): this label is a pure function of the 'Predictions' column; the
# original df['rental'] column is not used here - confirm this is intended.
rental_map = {'long': 'no_typical', 'short': 'typical'}
X_typical = X_test_concatenated.assign(
    rental=X_test_concatenated['Predictions'].map(rental_map)
)
X_typical.sample()

Unnamed: 0,duration_hour,distance_meters,month_rental,day_rented,hour_rented,temp,wind,humid,Predictions,rental
267,10,2280,9,10,8,20.1,1.6,57.0,short,typical


In [7]:
# Derive the target for the "unusual" downstream model from the upstream
# prediction: predicted 'long' -> 'unusual', predicted 'short' -> 'no_unusual'.
# NOTE(review): this label is a pure function of the 'Predictions' column; the
# original df['rental'] column is not used here - confirm this is intended.
rental_map = {'long': 'unusual', 'short': 'no_unusual'}
X_unusual = X_test_concatenated.assign(
    rental=X_test_concatenated['Predictions'].map(rental_map)
)
X_unusual.sample()

Unnamed: 0,duration_hour,distance_meters,month_rental,day_rented,hour_rented,temp,wind,humid,Predictions,rental
216,20,3490,12,29,9,-9.8,2.4,26.0,short,no_unusual


### Passo 3: Treinamento do modelo para aluguel usual

In [8]:
# Features for the "typical" model: every column except the upstream
# prediction and the derived label. The label is taken as a NumPy array.
non_feature_columns = ['Predictions', 'rental']
y = X_typical['rental'].values
X = X_typical.drop(columns=non_feature_columns)

# 90/10 split with a fixed seed.
# NOTE(review): X_test / y_test are not evaluated in the cells shown here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [9]:
# Downstream model for the "typical" branch: same scaler + seeded random
# forest layout as the upstream classifier.
# Uses whatever X_train / y_train currently hold, so this cell must run right
# after the split built from X_typical.
typical_steps = [
    ('vectorizer', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]

# fit() returns the pipeline itself, so the fitted estimator is bound directly.
pipeline_typical_model = Pipeline(steps=typical_steps).fit(X_train, y_train)

### Passo 4: Treinamento do modelo para aluguel incomum

In [10]:
# Features for the "unusual" model: every column except the upstream
# prediction and the derived label. The label is taken as a NumPy array.
# This rebinds X, y and the train/test names used by the previous model.
non_feature_columns = ['Predictions', 'rental']
y = X_unusual['rental'].values
X = X_unusual.drop(columns=non_feature_columns)

# 90/10 split with a fixed seed.
# NOTE(review): X_test / y_test are not evaluated in the cells shown here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [11]:
# Downstream model for the "unusual" branch: same scaler + seeded random
# forest layout as the upstream classifier.
# Uses whatever X_train / y_train currently hold, so this cell must run right
# after the split built from X_unusual.
unusual_steps = [
    ('vectorizer', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]

# fit() returns the pipeline itself, so the fitted estimator is bound directly.
pipeline_unusual_model = Pipeline(steps=unusual_steps).fit(X_train, y_train)

### Função para previsão do tipo de aluguel

In [12]:
def predict_rental_duration(duration_hour: float, distance_meters: float, month_rental: int, day_rented: int, hour_rented: int, temp: float, wind: float, humid: float) -> str:
    """Run a single trip through the cascade and return its rental label.

    The upstream ``pipeline_classify`` first predicts the distance class of the
    trip. A predicted ``'short'`` routes the trip to ``pipeline_typical_model``;
    any other class routes it to ``pipeline_unusual_model``. Despite the
    function's name, the returned value is the label predicted by the selected
    downstream model, not a duration.

    Args:
        duration_hour: Value for the ``duration_hour`` feature.
        distance_meters: Value for the ``distance_meters`` feature.
        month_rental: Value for the ``month_rental`` feature.
        day_rented: Value for the ``day_rented`` feature.
        hour_rented: Value for the ``hour_rented`` feature.
        temp: Value for the ``temp`` feature.
        wind: Value for the ``wind`` feature.
        humid: Value for the ``humid`` feature.

    Returns:
        The label predicted by the downstream model chosen for this trip.
    """
    # Build the single-row feature matrix once; the argument order must match
    # the column order the pipelines were fitted with.
    features = [[duration_hour, distance_meters, month_rental, day_rented, hour_rented, temp, wind, humid]]

    # predict() returns a length-1 array; extract the scalar so the branch
    # below compares plain values instead of relying on array truthiness.
    distance_class = pipeline_classify.predict(features)[0]

    if distance_class == 'short':
        downstream_model = pipeline_typical_model
    else:
        downstream_model = pipeline_unusual_model

    return downstream_model.predict(features)[0]


# Demo 1: a short, brief trip, expected to be routed to the "typical" model.
typical_example = predict_rental_duration(5.5, 6.8, 3, 4, 8, 3.9, 0.2, 40.)
print('Exemplo de previsão típico:', typical_example)

# Demo 2: same weather and date, but a very long duration and distance.
unusual_example = predict_rental_duration(200, 50000, 3, 4, 8, 3.9, 0.2, 40.)
print('Exemplo de previsão incomum:', unusual_example)

Exemplo de previsão típico: typical
Exemplo de previsão incomum: unusual
