In [1]:
 %load_ext autoreload
%autoreload 2

In [2]:
from pipeline import eval_pipeline, HierarchicalImputer, CoordinateTransformer, WindDirectionTransformer, DropColumnsTransformer, RainTodayTransformer, ExpandDateTransformer, sample, report_results, ShapeDebugger, BinningTransformer, LabelBinarizerPipelineFriendly
from persistence import load_or_fit
# For data manipulation
import pandas as pd
import numpy as np
import pandas as pd
# For visualization
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import joblib

In [3]:
rains = pd.read_csv('../data/weatherAUS.csv')
rains = rains.dropna(subset=['RainTomorrow'])
# Convierto la variable 'Date' a formato tipo "datetime" de Pandas

rains_x = rains.drop(columns=['RainTomorrow'])  # Drop the target column from features
rains_y = np.where(rains['RainTomorrow'] == "Yes", 1, 0)                 # Target variable

# Split the dataset
rnd_seed = 42   # For reproducibility         
ftest = 0.20    # Fracción de datos para "train"

X_train, X_test, y_train, y_test = train_test_split(rains_x, rains_y,
                                                    test_size=ftest,
                                                    stratify=rains_y,
                                                    random_state=rnd_seed)

print("Dimension de X_train:",X_train.shape)
print("Valores de y_train:",y_train.size)
print("Dimension de X_test:",X_test.shape)
print("Valores de y_test:",len(y_test))


Dimension de X_train: (113754, 22)
Valores de y_train: 113754
Dimension de X_test: (28439, 22)
Valores de y_test: 28439


In [4]:
coordinates = pd.read_csv("../data/worldcities.csv")
coordinates["Location"] = coordinates["city"]
coordinates.drop(columns=["city"], inplace = True)
coordinates = coordinates[coordinates["country"] == "Australia"]

city_coords = {
    'Albury': (-36.0785, 146.9136),
    'BadgerysCreek': (-33.8813, 150.7282),
    'Cobar': (-31.8667, 145.7667),
    'CoffsHarbour': (-30.3026, 153.1137),
    'Moree': (-29.4706, 149.8392),
    'Newcastle': (-32.9283, 151.7817),
    'NorahHead': (-33.2202, 151.5433),
    'NorfolkIsland': (-29.0408, 167.9541),
    'Penrith': (-33.7675, 150.6931),
    'Richmond': (-33.5982, 150.7581),
    'Sydney': (-33.8688, 151.2093),
    'SydneyAirport': (-33.9399, 151.1753),
    'WaggaWagga': (-35.0433, 147.3587),
    'Williamtown': (-32.7951, 151.8118),
    'Wollongong': (-34.4278, 150.8931),
    'Canberra': (-35.2809, 149.1300),
    'Tuggeranong': (-35.4167, 149.1000),
    'MountGinini': (-35.4471, 148.9685),
    'Ballarat': (-37.5622, 143.8503),
    'Bendigo': (-36.7582, 144.2814),
    'Sale': (-38.1100, 147.0737),
    'MelbourneAirport': (-37.6692, 144.8411),
    'Melbourne': (-37.8136, 144.9631),
    'Mildura': (-34.1850, 142.1625),
    'Nhil': (-35.2060, 141.6450),
    'Portland': (-38.3516, 141.5878),
    'Watsonia': (-37.7139, 145.0875),
    'Dartmoor': (-37.7251, 141.2843),
    'Brisbane': (-27.4698, 153.0251),
    'Cairns': (-16.9203, 145.7710),
    'GoldCoast': (-28.0167, 153.4000),
    'Townsville': (-19.2589, 146.8183),
    'Adelaide': (-34.9285, 138.6007),
    'MountGambier': (-37.8321, 140.7807),
    'Nuriootpa': (-34.4973, 138.9966),
    'Woomera': (-31.1395, 136.7984),
    'Albany': (-35.0285, 117.8837),
    'Witchcliffe': (-33.7015, 115.0911),
    'PearceRAAF': (-31.9131, 115.9741),
    'PerthAirport': (-31.9402, 115.9676),
    'Perth': (-31.9505, 115.8605),
    'SalmonGums': (-33.3937, 121.2060),
    'Walpole': (-34.9639, 115.8106),
    'Hobart': (-42.8821, 147.3272),
    'Launceston': (-41.4391, 147.1349),
    'AliceSprings': (-23.6980, 133.8807),
    'Darwin': (-12.4634, 130.8456),
    'Katherine': (-14.4686, 132.2678),
    'Uluru': (-25.3444, 131.0369)
}

In [5]:

rf_pipeline =  Pipeline(steps = [
    ("rfdate_expander", ExpandDateTransformer()),
    ("imputer", HierarchicalImputer()),
    ("rain_today", RainTodayTransformer()),
    ("coordinates", CoordinateTransformer(coordinates.drop_duplicates(subset="Location"), city_coords)),
    ("wind_direction", WindDirectionTransformer()),
    ("drop_directions", DropColumnsTransformer(columns=["WindGustDir", "WindDir9am", "WindDir3pm"])),
    ("drop_date_location", DropColumnsTransformer(columns=["Date","Location"])),
    ("scaler", MinMaxScaler()),
    ("model_rf", RandomForestClassifier(random_state=rnd_seed))
])

rain_today_pipeline = Pipeline(steps = [
      ("rain_today_imputer", SimpleImputer(fill_value='No', strategy="constant")),
      ("rain_today_print()rizer", LabelBinarizerPipelineFriendly()),
])

param_grid = {
    'model_rf__n_estimators': [100, 200, 500],  # Number of trees in the forest
    'model_rf__max_depth': [None, 10, 20],     # Depth of the trees
    'model_rf__min_samples_split': [2, 5, 10], # Minimum samples required to split an internal node
    'model_rf__min_samples_leaf': [1, 2, 4],   # Minimum samples required at a leaf node
    'model_rf__bootstrap': [False]       # Use bootstrap samples when building trees
}

clf = GridSearchCV(rf_pipeline, param_grid = param_grid, cv = 5, scoring = 'f1', verbose = 1, n_jobs=12)

x,y = sample(X_train, y_train, sample=0.1)
gs_fitted = load_or_fit(clf, x, y, name="num-minmax-rf")
y_pred = gs_fitted.predict(X_train)
report_results(y_train, y_pred)
print(gs_fitted.best_params_)
print(gs_fitted.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


Saving: num-minmax-rf-4631.pkl


Accuracy: 0.8658772438771384

 Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.92     88252
           1       0.77      0.57      0.66     25502

    accuracy                           0.87    113754
   macro avg       0.83      0.76      0.79    113754
weighted avg       0.86      0.87      0.86    113754


 Roc auc Report:
 0.7607558949023545
{'model_rf__bootstrap': False, 'model_rf__max_depth': None, 'model_rf__min_samples_leaf': 1, 'model_rf__min_samples_split': 2, 'model_rf__n_estimators': 500}
0.5920110508153047


In [18]:
rf_pipeline =  Pipeline(steps = [
    ("rfdate_expander", ExpandDateTransformer()),
    ("imputer", HierarchicalImputer()),
    ("rain_today", RainTodayTransformer()),
    ("coordinates", CoordinateTransformer(coordinates.drop_duplicates(subset="Location"), city_coords)),
    ("wind_direction", WindDirectionTransformer()),
    ("drop_directions", DropColumnsTransformer(columns=["WindGustDir", "WindDir9am", "WindDir3pm"])),
    ("drop_date_location", DropColumnsTransformer(columns=["Date","Location"])),
    ("model_rf", RandomForestClassifier(random_state=rnd_seed))
])

param_grid = {
    'model_rf__n_estimators': [100],  # Number of trees in the forest
    'model_rf__max_depth': [20],     # Depth of the trees
    'model_rf__min_samples_split': [2], # Minimum samples required to split an internal node
    'model_rf__min_samples_leaf': [1],   # Minimum samples required at a leaf node
    'model_rf__bootstrap': [False]       # Use bootstrap samples when building trees
}

clf = GridSearchCV(rf_pipeline, param_grid = param_grid, cv = 5, scoring = 'f1', verbose = 1, n_jobs=12)

x,y = sample(X_train, y_train, sample)
gs_fitted = load_or_fit(clf, x, y, name="num-minmax-rf", force=True)
y_pred = gs_fitted.predict(X_train)
report_results(y_train, y_pred)
print(gs_fitted.best_params_)
print(gs_fitted.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Saving: num-minmax-rf-4331.pkl
Accuracy: 0.8647432178209118

 Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.92     88252
           1       0.77      0.56      0.65     25502

    accuracy                           0.86    113754
   macro avg       0.83      0.76      0.78    113754
weighted avg       0.86      0.86      0.86    113754


 Roc auc Report:
 0.7562471004875413
{'model_rf__bootstrap': False, 'model_rf__max_depth': 20, 'model_rf__min_samples_leaf': 1, 'model_rf__min_samples_split': 2, 'model_rf__n_estimators': 100}
0.6041769212542405


In [1]:
results = pd.DataFrame(gs_fitted.cv_results_).sort_values(by='rank_test_score', ascending=True)
print(results[results["rank_test_score"] <= 10]['params'].values)

NameError: name 'pd' is not defined

In [None]:
from sklearn import tree
plt.figure(figsize=(25,25))
tree.plot_tree(gs_fitted.best_estimator_.named_steps['model_rf'].estimators_[0], max_depth=3, feature_names=['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainToday', 'Year', 'Month', 'RainTodayNull',
       'lat', 'lng', 'WindGustDirDegSin', 'WindDir9amDegSin', 'WindDir3pmDegSin',
       'WindGustDirDegSin', 'WindDir9amDegSin', 'WindDir3pmDegSin'])

In [8]:
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore", drop="first"))
])

numerical_transformer = Pipeline(steps=[
    ("minmax", MinMaxScaler())
])


transform_columns = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, ["Location","WindGustDir", "WindDir9am", "WindDir3pm"]),
        ("num", numerical_transformer, ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustSpeed',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'Year', 'Month'])
    ]
)


ohe_pipeline = Pipeline(steps = [
    ("date_expander", ExpandDateTransformer()),
    ("imputer", HierarchicalImputer()),
    ("rain_today", RainTodayTransformer()),
    ("drop_date", DropColumnsTransformer(columns=["Date"])),
    ("preproc", transform_columns),
    ("model_rf", RandomForestClassifier(random_state=rnd_seed))
], memory="pipeline_cache")

param_grid = {
    'model_rf__n_estimators': [100, 200, 500],  # Number of trees in the forest
    'model_rf__max_depth': [None, 10, 20],     # Depth of the trees
    'model_rf__min_samples_split': [2, 5, 10], # Minimum samples required to split an internal node
    'model_rf__min_samples_leaf': [1, 2, 4],   # Minimum samples required at a leaf node
    'model_rf__bootstrap': [False]       # Use bootstrap samples when building trees
}


clf = GridSearchCV(ohe_pipeline, param_grid = param_grid, cv = 5, scoring = 'f1', verbose = 1, n_jobs=12)

x,y = sample(X_train, y_train, sample=0.1)
gs_fitted = load_or_fit(clf, x, y, name="num-minmax-rf")
y_pred = gs_fitted.predict(X_train)
report_results(y_train, y_pred)
print(gs_fitted.best_params_)
print(gs_fitted.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


Saving: num-minmax-rf-9373.pkl


Accuracy: 0.863855336955184

 Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.92     88252
           1       0.80      0.52      0.63     25502

    accuracy                           0.86    113754
   macro avg       0.84      0.74      0.77    113754
weighted avg       0.86      0.86      0.85    113754


 Roc auc Report:
 0.7429191245581233
{'model_rf__bootstrap': False, 'model_rf__max_depth': None, 'model_rf__min_samples_leaf': 1, 'model_rf__min_samples_split': 2, 'model_rf__n_estimators': 200}
0.5740949891606003


In [5]:
bag_pipeline =  Pipeline(steps = [
    ("date_expander", ExpandDateTransformer()),
    ("imputer", HierarchicalImputer()),
    ("rain_today", RainTodayTransformer()),
    ("coordinates", CoordinateTransformer(coordinates.drop_duplicates(subset="Location"), city_coords)),
    ("wind_direction", WindDirectionTransformer()),
    ("drop_directions", DropColumnsTransformer(columns=["WindGustDir", "WindDir9am", "WindDir3pm"])),
    ("drop_date_location", DropColumnsTransformer(columns=["Date","Location"])),
    ("scaler", MinMaxScaler()),
    ("model_bag_log", BaggingClassifier(
        estimator=LogisticRegression(max_iter=1000, random_state=42, class_weight={1: 0.75, 0: 0.25}, penalty='l2'),
        n_estimators=10,
        random_state=42
    )),
])

param_grid = {
    'model_bag_log__n_estimators': [10, 20, 50],       # Number of base estimators
    'model_bag_log__max_samples': [0.5, 0.75, 1.0],   # Proportion of samples for training each base estimator
    'model_bag_log__max_features': [0.5, 0.75, 1.0],  # Proportion of features for training each base estimator
    'model_bag_log__bootstrap': [True, False],        # Bootstrap sampling
    'model_bag_log__bootstrap_features': [True, False] # Bootstrap sampling for features
}

clf = GridSearchCV(bag_pipeline, param_grid = param_grid, cv = 5, scoring = 'f1', verbose = 1, n_jobs=12)

x,y = sample(X_train, y_train, sample=0.1)
gs_fitted = load_or_fit(clf, x, y, name="num-minmax-rf")
y_pred = gs_fitted.predict(X_train)
report_results(y_train, y_pred)
print(gs_fitted.best_params_)
print(gs_fitted.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Saving: num-minmax-rf-1705.pkl


Accuracy: 0.792482022610194

 Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.81      0.86     88252
           1       0.53      0.75      0.62     25502

    accuracy                           0.79    113754
   macro avg       0.72      0.78      0.74    113754
weighted avg       0.83      0.79      0.80    113754


 Roc auc Report:
 0.7769537912146569
{'model_bag_log__bootstrap': False, 'model_bag_log__bootstrap_features': False, 'model_bag_log__max_features': 1.0, 'model_bag_log__max_samples': 1.0, 'model_bag_log__n_estimators': 10}
0.6314680163158556
