# Predicting rain from SPL

## Imports

In [None]:
import pandas as pd
import numpy as np
import altair as alt
import math

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

## Data setup

In [None]:
# Base seed reduced to its low 32 bits (equivalent to taking it modulo 2**32),
# presumably so it is a valid integer seed for NumPy / scikit-learn
# `random_state` arguments, which must lie in [0, 2**32 - 1].
SEED = 2660280232880537243 & 0xFFFFFFFF

In [None]:
# Load the pre-split training and test frames from their pickle files.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
all_train_data, all_test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('../data/spl-train.pkl', '../data/spl-test.pkl')
)

In [None]:
# Preview the first two rows of the training frame to sanity-check the columns.
all_train_data.head(2)

In [None]:
# Name of the derived 0/1 target column and of the raw column it is built from.
label = 'rain_binary'
precip = 'precipitation[mm]'

# Input columns fed to every classifier.
features = [
    'spl_mean',
    'spl_std',
    'spl_l2diff',
    'spl_entropy',
]

In [None]:
# Derive the binary target: 1 where any precipitation was recorded, else 0.
all_train_data[label] = all_train_data[precip].gt(0).astype(int)
all_test_data[label] = all_test_data[precip].gt(0).astype(int)

In [None]:
# Training design matrix (one column per entry in `features`) and target vector.
X_train = all_train_data.loc[:, features].to_numpy()
y_train = all_train_data.loc[:, label].to_numpy()

In [None]:
# Held-out design matrix and target vector, built the same way as the training arrays.
X_test = all_test_data.loc[:, features].to_numpy()
y_test = all_test_data.loc[:, label].to_numpy()

## Metrics

In [None]:
def get_metrics(y_true, y_pred, show=True):
    """Compute F1, accuracy and confusion-matrix counts for binary predictions.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        show: When true, also print the metrics.

    Returns:
        A dict with keys ``'F1'``, ``'Accuracy'`` and ``'Confusion Matrix'``;
        the latter is the list ``[tn, fp, fn, tp]``.
    """
    f1 = f1_score(y_true, y_pred, average='binary')
    acc = accuracy_score(y_true, y_pred)

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        # Only one class occurs in y_true/y_pred (e.g. a constant predictor on
        # a single-class sample), so the inferred matrix is 1x1 and cannot be
        # unpacked into four counts. Pin the 0/1 labels used by this notebook
        # to force the full 2x2 layout.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    if show:
        print(f'F1 Score: {f1:0.3f}')
        print(f'Accuracy: {acc:0.2%}')
        print(f'TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}')

    return {
        'F1': f1,
        'Accuracy': acc,
        'Confusion Matrix': [tn, fp, fn, tp],
    }

## Random forest

### Best parameters:

```
{'max_depth': 8, 'min_samples_split': 4, 'n_estimators': 200}
```

### Test results:

F1 Score: 0.543

Accuracy: 59.91%

TN: 4111, FP: 1621, FN: 2949, TP: 2719

In [None]:
# Hyper-parameter grid for the random forest; every combination is evaluated
# with 5-fold cross-validation.
param_grid = {
    'max_depth': [2, 4, 8, 16, 32, 64],
    'n_estimators': [50, 100, 200, 300, 400],
    'min_samples_split': [2, 4, 8, 16, 32],
}

# Seed the forest with the notebook-wide SEED so its random sampling -- and
# therefore the selected hyper-parameters and scores -- are reproducible
# between runs.
rf = RandomForestClassifier(random_state=SEED)
clf_rf = GridSearchCV(rf, param_grid, cv=5, verbose=2)
clf_rf.fit(X_train, y_train)

print(clf_rf.best_params_)
print(clf_rf.best_score_)

In [None]:
# Predict on the held-out test set with the fitted random-forest grid search.
y_pred_rf = clf_rf.predict(X_test)

In [None]:
# Print the random-forest test metrics; the returned dict is discarded.
_ = get_metrics(y_test, y_pred_rf)

## SVM

### Best parameters:
```
{'C': 0.5, 'gamma': 'auto', 'kernel': 'rbf'}
```

### Test results:
F1 Score: 0.556

Accuracy: 59.78%

TN: 3948, FP: 1784, FN: 2801, TP: 2867

In [None]:
# Hyper-parameter grid for the support-vector classifier: regularisation
# strength, kernel-coefficient heuristic and kernel type.
param_grid = {
    'C': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'sigmoid'],
}

svm = SVC()
# 5-fold cross-validated exhaustive search; `fit` returns the search object.
clf_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    verbose=2,
).fit(X_train, y_train)
print(clf_svm.best_params_)

In [None]:
# Display the cross-validation score of the best SVM parameter combination.
clf_svm.best_score_

In [None]:
# Predict on the held-out test set with the fitted SVM grid search and print
# the resulting metrics; the returned dict is discarded.
y_pred_svm = clf_svm.predict(X_test)
_ = get_metrics(y_test, y_pred_svm)

## KNN Classifier

### Best parameters:
```
{'algorithm': 'auto', 'n_neighbors': 500, 'p': 1, 'weights': 'distance'}
```

### Test results:

F1 Score: 0.545

Accuracy: 59.92%

TN: 4097, FP: 1635, FN: 2934, TP: 2734

In [None]:
# Hyper-parameter grid for k-nearest-neighbours: neighbourhood size, vote
# weighting, neighbour-search algorithm and Minkowski power `p`.
# NOTE(review): `algorithm` presumably only changes how neighbours are looked
# up, not the resulting predictions -- confirm whether this axis is needed.
param_grid = {
    'n_neighbors': [1, 5, 10, 50, 100, 500, 1000],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

knn = KNeighborsClassifier()
# 5-fold cross-validated exhaustive search; `fit` returns the search object.
clf_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    verbose=2,
).fit(X_train, y_train)
print(clf_knn.best_params_, clf_knn.best_score_, sep='\n')

In [None]:
# Predict on the held-out test set with the fitted KNN grid search and print
# the resulting metrics; the returned dict is discarded.
y_pred_knn = clf_knn.predict(X_test)
_ = get_metrics(y_test, y_pred_knn)

## Neural Network

### Best parameters:
```
{
    'activation': 'tanh',
    'alpha': 0.001,
    'hidden_layer_sizes': (16, 16, 16),
    'learning_rate_init': 0.001,
    'solver': 'adam'
}
```

### Test results:
F1 Score: 0.549

Accuracy: 59.32%

TN: 3941, FP: 1791, FN: 2847, TP: 2821

In [None]:
# Hyper-parameter grid for the multi-layer perceptron (a list holding a single
# sub-grid): layer layout, activation, L2 penalty `alpha` and initial learning
# rate, all trained with the Adam solver.
param_grid = [
    {
        'hidden_layer_sizes': [(32,), (16, 16), (32, 64), (16, 16, 16)],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate_init': [0.0001, 0.001, 0.01]
    },
]
# Seed the network with the notebook-wide SEED so weight initialisation and
# batch sampling -- and therefore the selected hyper-parameters and scores --
# are reproducible between runs.
mpl = MLPClassifier(random_state=SEED)
clf_mlp = GridSearchCV(mpl, param_grid, cv=5, verbose=2)
clf_mlp.fit(X_train, y_train)
print(clf_mlp.best_params_)

In [None]:
# Print the cross-validation score of the best MLP parameter combination.
print(clf_mlp.best_score_)

In [None]:
# Predict on the held-out test set with the fitted MLP grid search.
y_pred_mlp = clf_mlp.predict(X_test)

In [None]:
# Print the MLP test metrics; the returned dict is discarded.
_ = get_metrics(y_test, y_pred_mlp)