In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
from pipeline import eval_pipeline, HierarchicalImputer, CoordinateTransformer, WindDirectionTransformer, DropColumnsTransformer, RainTodayTransformer, ExpandDateTransformer, sample, report_results
from persistence import load_or_fit
# For data manipulation
import pandas as pd
import numpy as np
import pandas as pd
# For visualization
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
import joblib

In [4]:
# Load the weather observations, discarding rows that have no target label.
rains = pd.read_csv('../data/weatherAUS.csv').dropna(subset=['RainTomorrow'])

# Features are every column except the target.
rains_x = rains.drop(columns=['RainTomorrow'])
# Binary target: 1 where RainTomorrow is "Yes", 0 for any other value.
rains_y = np.where(rains['RainTomorrow'].eq("Yes"), 1, 0)

# Hold-out split configuration
rnd_seed = 42   # fixed seed so the split is reproducible
ftest = 0.20    # fraction of the rows reserved for the *test* split

# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    rains_x,
    rains_y,
    test_size=ftest,
    random_state=rnd_seed,
    stratify=rains_y,
)

# Quick sanity check of the resulting split sizes.
print("Dimension de X_train:", X_train.shape)
print("Valores de y_train:", y_train.size)
print("Dimension de X_test:", X_test.shape)
print("Valores de y_test:", len(y_test))

Dimension de X_train: (113754, 22)
Valores de y_train: 113754
Dimension de X_test: (28439, 22)
Valores de y_test: 28439


In [5]:
# City lookup table: expose the "city" column under the name "Location"
# (appended as the last column, original "city" removed) and keep only the
# rows whose country is Australia.
coordinates = (
    pd.read_csv("../data/worldcities.csv")
    .assign(Location=lambda cities: cities["city"])
    .drop(columns=["city"])
    .loc[lambda cities: cities["country"] == "Australia"]
)

# Hand-entered lookup of location name -> (latitude, longitude). It is handed
# to CoordinateTransformer alongside the worldcities table when the numeric
# pipeline is built.
# NOTE(review): the keys presumably mirror the spellings used in the weather
# data's "Location" column (e.g. 'Nhil', 'PearceRAAF') — verify against the
# dataset before "correcting" any of them.
city_coords = {
    'Albury': (-36.0785, 146.9136),
    'BadgerysCreek': (-33.8813, 150.7282),
    'Cobar': (-31.8667, 145.7667),
    'CoffsHarbour': (-30.3026, 153.1137),
    'Moree': (-29.4706, 149.8392),
    'Newcastle': (-32.9283, 151.7817),
    'NorahHead': (-33.2202, 151.5433),
    'NorfolkIsland': (-29.0408, 167.9541),
    'Penrith': (-33.7675, 150.6931),
    'Richmond': (-33.5982, 150.7581),
    'Sydney': (-33.8688, 151.2093),
    'SydneyAirport': (-33.9399, 151.1753),
    'WaggaWagga': (-35.0433, 147.3587),
    'Williamtown': (-32.7951, 151.8118),
    'Wollongong': (-34.4278, 150.8931),
    'Canberra': (-35.2809, 149.1300),
    'Tuggeranong': (-35.4167, 149.1000),
    'MountGinini': (-35.4471, 148.9685),
    'Ballarat': (-37.5622, 143.8503),
    'Bendigo': (-36.7582, 144.2814),
    'Sale': (-38.1100, 147.0737),
    'MelbourneAirport': (-37.6692, 144.8411),
    'Melbourne': (-37.8136, 144.9631),
    'Mildura': (-34.1850, 142.1625),
    'Nhil': (-35.2060, 141.6450),
    'Portland': (-38.3516, 141.5878),
    'Watsonia': (-37.7139, 145.0875),
    'Dartmoor': (-37.7251, 141.2843),
    'Brisbane': (-27.4698, 153.0251),
    'Cairns': (-16.9203, 145.7710),
    'GoldCoast': (-28.0167, 153.4000),
    'Townsville': (-19.2589, 146.8183),
    'Adelaide': (-34.9285, 138.6007),
    'MountGambier': (-37.8321, 140.7807),
    'Nuriootpa': (-34.4973, 138.9966),
    'Woomera': (-31.1395, 136.7984),
    'Albany': (-35.0285, 117.8837),
    'Witchcliffe': (-33.7015, 115.0911),
    'PearceRAAF': (-31.9131, 115.9741),
    'PerthAirport': (-31.9402, 115.9676),
    'Perth': (-31.9505, 115.8605),
    'SalmonGums': (-33.3937, 121.2060),
    'Walpole': (-34.9639, 115.8106),
    'Hobart': (-42.8821, 147.3272),
    'Launceston': (-41.4391, 147.1349),
    'AliceSprings': (-23.6980, 133.8807),
    'Darwin': (-12.4634, 130.8456),
    'Katherine': (-14.4686, 132.2678),
    'Uluru': (-25.3444, 131.0369)
}

# Categorical => numerical

Se transforman las direcciones cardinales en grados, p. ej. NE -> 0.22.

Se transforma la ubicación (Location) en latitud y longitud, p. ej. Albury => (-36.0785, 146.9136).

In [37]:
# All-numeric variant of the model: categorical inputs are converted to
# numbers (wind directions, location -> coordinates) and the original
# categorical / date columns are dropped before scaling.
unique_city_rows = coordinates.drop_duplicates(subset="Location")
wind_direction_columns = ["WindGustDir", "WindDir9am", "WindDir3pm"]

numeric_steps = [
    ("date_expander", ExpandDateTransformer()),
    ("imputer", HierarchicalImputer()),
    ("rain_today", RainTodayTransformer()),
    ("coordinates", CoordinateTransformer(unique_city_rows, city_coords)),
    ("wind_direction", WindDirectionTransformer()),
    # The raw columns are removed only after the transformers above have run.
    ("drop_directions", DropColumnsTransformer(columns=wind_direction_columns)),
    ("drop_date_location", DropColumnsTransformer(columns=["Date", "Location"])),
    # Placeholder scaler; the grid search swaps it via the 'scaler' parameter.
    ("scaler", MinMaxScaler()),
    ("logistic_regression", LogisticRegression()),
]

num_minmax_logit = Pipeline(steps=numeric_steps)


# Search space for the numeric pipeline: which scaler to use and how to weight
# the two classes in the logistic regression.
parameters = {
    'scaler': [MinMaxScaler(), StandardScaler()],
    # Class weights must be numeric; the previous version passed them as
    # strings ('0.25' / '0.75').
    # NOTE(review): this dict gives the positive class (1) the *smaller*
    # weight even though it is the minority class in y_train — confirm that
    # this is intended and not an inverted mapping.
    'logistic_regression__class_weight': ['balanced', {1: 0.25, 0: 0.75}],
    'logistic_regression__penalty': ['l2'],
}

# 5-fold cross-validated search, selecting on F1 of the positive class.
clf = GridSearchCV(num_minmax_logit, param_grid=parameters, cv=5, scoring='f1', verbose=1, n_jobs=12)

# Fit on a sub-sample of the training split to keep the search affordable.
x,y = sample(X_train, y_train, sample=0.1)
# NOTE(review): force=True presumably bypasses any previously persisted
# search and refits on every run — confirm against persistence.load_or_fit.
gs_fitted = load_or_fit(clf, x, y, name="num-minmax-logit", force=True)
print(gs_fitted.best_params_)
print(gs_fitted.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


Saving: num-minmax-logit-5745.pkl
{'logistic_regression__class_weight': 'balanced', 'logistic_regression__penalty': 'l2', 'minmax': MinMaxScaler()}
0.46817618050609866


In [30]:
# Report metrics for the tuned numeric pipeline. Note that this scores the
# training split (X_train / y_train), not the held-out X_test.
train_predictions_numeric = gs_fitted.predict(X_train)
report_results(y_train, train_predictions_numeric)

Accuracy: 0.7043620444116251

 Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.74      0.79     88252
           1       0.39      0.60      0.47     25502

    accuracy                           0.70    113754
   macro avg       0.63      0.67      0.63    113754
weighted avg       0.76      0.70      0.72    113754


 Roc auc Report:
 0.6659185690659837


# One-hot encoding of categorical features


In [18]:


# One-hot encoder for the categorical columns: the first level of each feature
# is dropped, and categories unseen during fit are ignored instead of raising.
onehot_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
categorical_transformer = Pipeline(steps=[("onehot", onehot_encoder)])

# Scaling for the numeric columns.
# NOTE(review): the step is called "minmax" but wraps a StandardScaler; the
# name is kept as-is because other code may address the step by name.
standard_scaler = StandardScaler()
numerical_transformer = Pipeline(steps=[("minmax", standard_scaler)])


# Columns routed to the one-hot encoder.
categorical_columns = ["Location", "WindGustDir", "WindDir9am", "WindDir3pm"]

# Columns routed to the numeric scaler.
numeric_columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
    'Sunshine', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
    'Temp3pm', 'Year', 'Month',
]

# NOTE(review): no `remainder` is given, so ColumnTransformer's default
# ("drop") discards every column not listed above. 'RainToday' appears in
# neither list, so it presumably never reaches the model — confirm whether
# that is intended.
transform_columns = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_columns),
        ("num", numerical_transformer, numeric_columns),
    ]
)


# One-hot variant of the model: categorical columns are kept and encoded by
# the column transformer instead of being converted to numbers beforehand.
ohe_steps = [
    ("date_expander", ExpandDateTransformer()),
    ("imputer", HierarchicalImputer()),
    ("rain_today", RainTodayTransformer()),
    ("drop_date", DropColumnsTransformer(columns=["Date"])),
    ("preproc", transform_columns),
    ("logistic_regression", LogisticRegression()),
]

# `memory` makes the pipeline cache its fitted transformers in the
# "pipeline_cache" directory (relative to the working directory).
ohe_pipeline = Pipeline(steps=ohe_steps, memory="pipeline_cache")


In [19]:
# Candidate class weightings for the logistic regression.
# The explicit dict must be keyed by the actual labels, which are the integers
# 0/1 produced when the target was encoded; the previous string keys
# ('1'/'0') matched no class, so scikit-learn rejected those candidates.
# NOTE(review): this dict gives the positive class (1) the *smaller* weight
# even though it is the minority class in y_train — confirm this is intended.
logit_class_weights = ['balanced', {1: 0.25, 0: 0.75}]

# Two sub-grids, because the L1 penalty is not supported by the default
# 'lbfgs' solver: without an L1-capable solver every 'l1' candidate fails to
# fit and is scored as NaN. The 'l2' candidates keep the default solver so
# their results are unchanged.
parameters = [
    {
        'logistic_regression__class_weight': logit_class_weights,
        'logistic_regression__penalty': ['l2'],
    },
    {
        'logistic_regression__class_weight': logit_class_weights,
        'logistic_regression__penalty': ['l1'],
        'logistic_regression__solver': ['liblinear'],
    },
]

# 5-fold cross-validated search, selecting on F1 of the positive class.
clf = GridSearchCV(ohe_pipeline, param_grid=parameters, cv=5, scoring='f1', verbose=1, n_jobs=12)

x,y = sample(X_train, y_train)
# NOTE(review): without force=True, load_or_fit presumably returns a search
# previously persisted under "ohe-logit"; if so, that stored result has to be
# invalidated for the corrected grid to take effect — confirm.
gs_fitted = load_or_fit(clf, x, y, name="ohe-logit")


In [38]:
# Summarise the search: winning parameters, their mean CV score, and the scorer used.
for search_summary in (gs_fitted.best_params_, gs_fitted.best_score_, gs_fitted.scorer_):
    print(search_summary)

{'logistic_regression__class_weight': 'balanced', 'logistic_regression__penalty': 'l2'}
0.40952610315967997
make_scorer(f1_score, response_method='predict', average=binary)


In [40]:
# Report metrics for the tuned one-hot pipeline. Note that this scores the
# training split (X_train / y_train), not the held-out X_test.
train_predictions_ohe = gs_fitted.predict(X_train)
report_results(y_train, train_predictions_ohe)

Accuracy: 0.603600752501011

 Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.60      0.70     88252
           1       0.31      0.61      0.41     25502

    accuracy                           0.60    113754
   macro avg       0.57      0.61      0.55    113754
weighted avg       0.72      0.60      0.64    113754


 Roc auc Report:
 0.6055102620644734
