In [23]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans

from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score

In [2]:
# Load the raw CSV into `data`. The path is relative, so it is resolved
# against the notebook's current working directory.
DATA_PATH = 'HURdat_ExtremeWeatherEvents.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an
# earlier to_csv call -- TODO confirm against the CSV).
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to run more than once: the original raised KeyError on re-run.
data = data.drop(columns='Unnamed: 0', errors='ignore')

# The preview is the last expression so that the notebook actually renders it.
data.head()

In [4]:
# List the columns available in `data` for feature selection.
data.columns

Index(['ID', 'Name', 'Status', 'Latitude', 'Longitude', 'Maximum.Wind',
       'date_time', 'diff', 'rapid_int', 'i', 'n', 'persistence', 'product',
       'Initial.Max', 'speed', 'speed_z', 'speed_m', 'Jday', 'Maximum.Wind_p',
       'Latitude_p', 'Longitude_p'],
      dtype='object')

In [5]:
# Columns carried forward for clustering and modelling.
feature_cols = [
    'rapid_int', 'product', 'persistence', 'Maximum.Wind_p', 'Maximum.Wind',
    'i', 'speed_m', 'speed_z', 'diff', 'speed',
]

# Keep every row that has at least one of these values. `.copy()` detaches X
# from `data`, so the column assignments below write to an independent frame
# and do not trigger pandas' SettingWithCopyWarning.
X = data[feature_cols].dropna(how='all').copy()

# Median-impute the columns listed here. 'rapid_int', 'Maximum.Wind' and 'i'
# are intentionally left as they are; later cells derive a replacement label
# for 'rapid_int' from clustering.
impute_cols = ['product', 'persistence', 'Maximum.Wind_p',
               'speed_m', 'speed_z', 'diff', 'speed']
for col in impute_cols:
    X[col] = X[col].fillna(X[col].median())


In [6]:
# Size of the feature frame after column selection and median imputation.
X.shape

(8793, 10)

In [7]:
# Two-cluster K-Means on three of the imputed features.
#
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2-1.3), so leaving it implicit
# makes the clustering, and every score computed from it, depend on the
# installed version.
#
# NOTE(review): the features are clustered unscaled, so whichever column has
# the largest numeric range dominates the Euclidean distance. MinMaxScaler is
# imported but unused -- confirm whether scaling was intended.
cluster_cols = ['product', 'persistence', 'Maximum.Wind_p']
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X[cluster_cols])

# One cluster id per input row is expected.
len(kmeans.labels_)

8793

In [8]:
# Shape check: labels_ is a 1-D array with one cluster id per row of X.
kmeans.labels_.shape

(8793,)

In [9]:
# Attach the K-Means cluster id to every row as the working label.
#
# K-Means numbers its clusters arbitrarily, so cluster 1 is not guaranteed to
# correspond to rapid_int == 1. Where 'rapid_int' is observed, measure the
# agreement and swap the two ids if they match less than half of the time.
# This assumes 'rapid_int' is coded 0/1 -- TODO confirm.
cluster_ids = kmeans.labels_
if 'rapid_int' in X.columns:  # the column is dropped later; keeps this cell re-runnable
    known = X['rapid_int'].notna().values
    if known.any():
        agreement = (cluster_ids[known] == X['rapid_int'].values[known]).mean()
        if agreement < 0.5:
            cluster_ids = 1 - cluster_ids
X['label'] = cluster_ids

In [13]:
## Sanity check: restricted to the rows where 'rapid_int' was actually
## observed in the original dataset, what fraction of the cluster-derived
## labels agree with it?
observed_rows = X[X['rapid_int'].notna()]
accuracy_score(observed_rows['rapid_int'].tolist(), observed_rows['label'].tolist())

0.9279547062986554

In [14]:
# X now also carries the 'label' column, so it has one more column than before.
X.shape

(8793, 11)

In [15]:
# 'rapid_int' has been replaced by the cluster-derived 'label', so remove it
# from the feature frame. Reassigning (rather than inplace=True) with
# errors='ignore' makes the cell idempotent: the original raised KeyError
# when run a second time.
X = X.drop(columns='rapid_int', errors='ignore')

In [17]:
# Build (features at step t, label at step t+1) pairs.
#
# A row is a "next step" when its 'i' is exactly one greater than the 'i' of
# the row directly above it. The row directly above is then the matching
# "current step".
#
# The pairing is done with positional boolean masks rather than
# `index_label - 1`: label arithmetic is only correct when the index is a
# gap-free integer range, which the earlier dropna() does not guarantee.
is_next_step = X['i'].diff().eq(1)
is_current_step = is_next_step.shift(-1, fill_value=False)

# X_ keeps every column of X, including the step-t 'label', as a feature.
X_ = X[is_current_step]
y = X.loc[is_next_step, 'label']

In [18]:
# The feature matrix and the target vector must have the same number of rows.
X_.shape, y.shape

((7708, 10), (7708,))

## Logistic Regression

In [19]:
# Logistic-regression baseline. The iteration cap is set very high so that
# lbfgs has room to converge.
logreg_params = dict(solver='lbfgs', max_iter=10000, random_state=0)
clf = LogisticRegression(**logreg_params)

In [20]:
# 10-fold cross-validated F1 for the estimator currently bound to `clf`.
# Reported as the mean with a two-standard-deviation spread.
scores = cross_val_score(clf, X_, y, scoring='f1', cv=10)
mean_f1 = scores.mean()
spread_f1 = scores.std() * 2
print("F1 Score: %0.2f (+/- %0.2f)" % (mean_f1, spread_f1))

F1 Score: 0.85 (+/- 0.09)


## Random Forest

In [32]:
# Random-forest model. This rebinds `clf`, so the scoring cell that follows
# evaluates this estimator rather than the logistic regression.
forest_params = dict(n_estimators=600, max_depth=14, random_state=0)
clf = RandomForestClassifier(**forest_params)

In [33]:
# Same evaluation protocol as for the logistic regression: 10-fold
# cross-validated F1, shown as the mean with a two-standard-deviation spread.
scores = cross_val_score(clf, X_, y, scoring='f1', cv=10)
mean_f1 = scores.mean()
spread_f1 = scores.std() * 2
print("F1 Score: %0.2f (+/- %0.2f)" % (mean_f1, spread_f1))

F1 Score: 0.73 (+/- 0.11)
