# Predicting rain from class labels

## Imports

In [None]:
import pandas as pd
import numpy as np
import altair as alt
import math

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

## Data setup

In [None]:
# Fold the large base seed into the range [0, 2**32).
# NOTE(review): presumably because the seeded consumers only accept
# 32-bit seeds -- confirm against their documentation.
SEED = 2660280232880537243 % (1 << 32)

In [None]:
# Load the training table from its path relative to this notebook.
all_data = pd.read_csv(filepath_or_buffer='../data/predictions-fine-train.csv')

In [None]:
# Column holding the raw precipitation amount, and the name of the binary
# target column that is derived from it.
precip = 'precipitation[mm]'
label = 'rain_binary'

# Model inputs: every column from the fourth up to (not including) the last.
# NOTE(review): assumes the first three and the final column are not
# predictors -- verify against the CSV schema.
features = all_data.columns[3:-1].tolist()

In [None]:
# Binary target: 1 where any precipitation was recorded, 0 otherwise.
all_data[label] = all_data[precip].gt(0).astype(int)

In [None]:
# Preview the first five rows, including the freshly added label column.
all_data.head(n=5)

In [None]:
# Convert the feature columns and the target column to NumPy arrays.
X = all_data.loc[:, features].to_numpy()
y = all_data.loc[:, label].to_numpy()

In [None]:
# Hold out 20% of the rows for validation; seeding keeps the split stable
# across notebook runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

## Metrics

In [None]:
def get_metrics(y_true, y_pred, show=True):
    """Compute the binary F1 score and the accuracy of a set of predictions.

    Args:
        y_true: Ground-truth labels the predictions are scored against.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        show: When true, print both metrics before returning them.

    Returns:
        A dict with the keys ``'F1'`` and ``'accuracy'``.
    """
    # Score against the ``y_true`` argument rather than the notebook-global
    # ``y_val`` so the function is correct for any split it is given.
    f1 = f1_score(y_true, y_pred, average='binary')
    acc = accuracy_score(y_true, y_pred)

    if show:
        print(f'F1 Score: {f1:0.3f}\nAccuracy: {acc:0.2%}')

    return {
        'F1': f1,
        'accuracy': acc,
    }

## Random forest

In [None]:
# Results:
# {'max_depth': 32, 'min_samples_split': 4, 'n_estimators': 400}
#
# F1 Score: 0.728
# Accuracy: 74.19%
#
# param_grid = {
#     'max_depth': [2, 4, 8, 16, 32, 64],
#     'n_estimators': [50, 100, 200, 300, 400],
#     'min_samples_split': [2, 4, 8, 16, 32],
# }

# rf = RandomForestClassifier()
# clf_rf = GridSearchCV(rf, param_grid, cv=5)
# clf_rf.fit(X_train, y_train)
# clf_rf.best_params_

In [None]:
# Fit a random forest with the best hyper-parameters recorded from the grid
# search. The forest is seeded so that the validation scores are
# reproducible from run to run instead of drifting with the bootstrap draws.
clf_rf = RandomForestClassifier(
    max_depth=32,
    min_samples_split=4,
    n_estimators=400,
    random_state=SEED,
)
clf_rf.fit(X_train, y_train)

In [None]:
# Predict the validation labels with the fitted random forest.
y_pred = clf_rf.predict(X=X_val)

In [None]:
# Previously recorded validation result for the random forest:
#   F1 Score: 0.727
#   Accuracy: 74.19%

_ = get_metrics(y_true=y_val, y_pred=y_pred)

## Logistic Regression

In [None]:
# Grid-search the logistic regression over tolerance and inverse
# regularisation strength with 5-fold cross-validation, then score the
# refitted best model on the validation split.
#
# Previously recorded result: {'C': 10.0, 'tol': 1e-07}
#   F1 Score: 0.679
#   Accuracy: 67.76%

param_grid = dict(
    tol=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    C=[0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0],
)

logistic = linear_model.LogisticRegression()
clf_logistic = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)
clf_logistic.fit(X_train, y_train)

y_pred = clf_logistic.predict(X_val)
_ = get_metrics(y_true=y_val, y_pred=y_pred)

In [None]:
# Pair every feature name with its coefficient in the best logistic model.
[
    (name, weight)
    for name, weight in zip(features, clf_logistic.best_estimator_.coef_[0])
]

## SVM

In [None]:
# Results: {'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}
# param_grid = {
#     'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0],
#     'gamma': ['scale', 'auto'],
#     'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
# }

# svm = SVC()
# clf_svm = GridSearchCV(svm, param_grid, cv=5)
# clf_svm.fit(X_train, y_train)
# print(clf_svm.best_params_)

In [None]:
# Fit an RBF-kernel SVM with the best hyper-parameters recorded from the
# grid search.
clf_svm = SVC(
    C=10,
    kernel='rbf',
    gamma='scale',
)
clf_svm.fit(X_train, y_train)

In [None]:
# Previously recorded validation result for the SVM:
#   F1 Score: 0.683
#   Accuracy: 70.51%

y_pred = clf_svm.predict(X=X_val)
_ = get_metrics(y_true=y_val, y_pred=y_pred)

## KNN Classifier

In [None]:
# Grid-search the k-nearest-neighbours classifier with 5-fold
# cross-validation and print the winning parameter combination.
#
# Previously recorded result:
#   {'algorithm': 'auto', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
param_grid = dict(
    n_neighbors=[1, 3, 5, 10, 15, 20, 25, 50],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

knn = KNeighborsClassifier()
clf_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
clf_knn.fit(X_train, y_train)

print(clf_knn.best_params_)

In [None]:
# Shortcut that skips the grid search by using its recorded best parameters:
#   clf_knn = KNeighborsClassifier(n_neighbors=50, p=1, weights='distance')
#   clf_knn.fit(X_train, y_train)
#
# Previously recorded validation result:
#   F1 Score: 0.674
#   Accuracy: 68.72%
y_pred = clf_knn.predict(X=X_val)
_ = get_metrics(y_true=y_val, y_pred=y_pred)

## AdaBoost

In [None]:
# Result: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 500}
# param_grid = {
#     'n_estimators': [10, 25, 50, 75, 100, 250, 500],
#     'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50],
#     'algorithm': ['SAMME', 'SAMME.R'],
# }
# ada = AdaBoostClassifier()
# clf_ada = GridSearchCV(ada, param_grid, cv=5)
# clf_ada.fit(X_train, y_train)
# print(clf_ada.best_params_)

In [None]:
# Fit AdaBoost with the best hyper-parameters recorded from the grid search
# and score it on the validation split. The ensemble is seeded so that the
# reported metrics are reproducible across runs.
clf_ada = AdaBoostClassifier(
    n_estimators=500,
    learning_rate=0.1,
    random_state=SEED,
)
clf_ada.fit(X_train, y_train)
y_pred = clf_ada.predict(X_val)
_ = get_metrics(y_val, y_pred)

## Neural Network

In [None]:
# Result: {
#     'activation': 'relu',
#     'alpha': 0.001,
#     'hidden_layer_sizes': (32, 64),
#     'learning_rate_init': 0.001, 'solver': 'adam'
# }
# param_grid = [
#     {
#         'hidden_layer_sizes': [(16,), (64,), (16, 16), (16, 32), (32, 64)],
#         'activation': ['logistic', 'tanh', 'relu'],
#         'solver': ['adam'],
#         'alpha': [0.00001, 0.0001, 0.001],
#         'learning_rate_init': [0.0001, 0.001, 0.01]
#     },
#     {
#         'hidden_layer_sizes': [(16,), (64,), (16, 16), (16, 32), (32, 64)],
#         'activation': ['logistic', 'tanh', 'relu'],
#         'solver': ['sgd'],
#         'learning_rate': ['constant', 'invscaling', 'adaptive'],
#         'alpha': [0.00001, 0.0001, 0.001],
#         'learning_rate_init': [0.0001, 0.001, 0.01],
#     },
#     {
#         'hidden_layer_sizes': [(16,), (64,), (16, 16), (16, 32), (32, 64)],
#         'activation': ['logistic', 'tanh', 'relu'],
#         'solver': ['lbfgs'],
#         'alpha': [0.00001, 0.0001, 0.001],
#     },
# ]
# mpl = MLPClassifier()
# clf_mlp = GridSearchCV(mpl, param_grid, cv=5)
# clf_mlp.fit(X_train, y_train)
# print(clf_mlp.best_params_)

In [None]:
# Fit the multi-layer perceptron with the best hyper-parameters recorded
# from the grid search. The network is seeded so that training, and hence
# the reported validation metrics, are reproducible across runs.
clf_mlp = MLPClassifier(activation='relu',
                        alpha=0.001,
                        hidden_layer_sizes=(32, 64),
                        learning_rate_init=0.001,
                        solver='adam',
                        random_state=SEED)

clf_mlp.fit(X_train, y_train)

In [None]:
# Predict the validation labels with the fitted neural network.
y_pred = clf_mlp.predict(X=X_val)

In [None]:
# Report F1 and accuracy of the neural network on the validation split.
_ = get_metrics(y_true=y_val, y_pred=y_pred)