## Models to Categorize Storm Types (Simplified)
Code written by Drew Dyson and edited by Julia Taussig

The purpose of this notebook is to create models to categorize storms (we were curious about whether we could use storm data to predict categories of storms if NOAA had not categorized the storm yet).

Importing libraries:

In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
import re

from sklearn.model_selection import train_test_split,cross_val_predict,cross_val_score,GridSearchCV
from sklearn.linear_model import LassoCV, LogisticRegressionCV, RidgeCV, BayesianRidge, bayes
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD

In [2]:
hurdat = pd.read_csv('./data/df_hurricanes_FirstIncidenceLorHU_withFatalitites.csv')

In [3]:
hurdat.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,...,High Wind SW,High Wind NW,Year,Month,Day,Hurricane Name,Hurricane Year,Hurricane Month,Total Deaths,US Impact? 1=yes
0,0,AL071969,BLANCHE,19690811,1800.0,,HU,35.5N,69.9W,75.0,...,,,1969,8,11,BLANCHE,1969,8,0,0
1,1,AL081969,DEBBIE,19690816,1800.0,,HU,15.5N,48.0W,65.0,...,,,1969,8,16,DEBBIE,1969,8,0,0
2,2,AL091969,CAMILLE,19690815,2200.0,L,HU,21.9N,84.3W,95.0,...,,,1969,8,15,CAMILLE,1969,8,296,1
3,3,AL131969,FRANCELIA,19690901,1800.0,,HU,17.3N,83.2W,65.0,...,,,1969,9,1,FRANCELIA,1969,9,0,0
4,4,AL161969,GERDA,19690908,1800.0,,HU,32.0N,78.0W,65.0,...,,,1969,9,8,GERDA,1969,9,0,1


In [6]:
hurdat.drop('Unnamed: 0', axis = 1, inplace = True)

In [7]:
hurdat.head()

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,High Wind SW,High Wind NW,Year,Month,Day,Hurricane Name,Hurricane Year,Hurricane Month,Total Deaths,US Impact? 1=yes
0,AL071969,BLANCHE,19690811,1800.0,,HU,35.5N,69.9W,75.0,997.0,...,,,1969,8,11,BLANCHE,1969,8,0,0
1,AL081969,DEBBIE,19690816,1800.0,,HU,15.5N,48.0W,65.0,,...,,,1969,8,16,DEBBIE,1969,8,0,0
2,AL091969,CAMILLE,19690815,2200.0,L,HU,21.9N,84.3W,95.0,,...,,,1969,8,15,CAMILLE,1969,8,296,1
3,AL131969,FRANCELIA,19690901,1800.0,,HU,17.3N,83.2W,65.0,995.0,...,,,1969,9,1,FRANCELIA,1969,9,0,0
4,AL161969,GERDA,19690908,1800.0,,HU,32.0N,78.0W,65.0,,...,,,1969,9,8,GERDA,1969,9,0,1


In [10]:
hurdat['lat'] = np.nan
hurdat['lng'] = np.nan
hurdat['Longitude'] = hurdat['Longitude'].apply(lambda x : re.sub("(W)",'', x))
hurdat['Longitude'] = hurdat['Longitude'].apply(lambda x : re.sub("(E)",'', x))
hurdat['Latitude'] = hurdat['Latitude'].apply(lambda x : re.sub("(N)",'', x))
# Equation to convert lat/lng
# DD = d + (min/60) + (sec/3600)
# reverse indexing to allow for out of range strings (Sam Stack's Idea)

def lngdecimaldegrconv(lng):
    x = round(-(float(lng[:-2]) + ((float(lng[-1]))/60)), 1) 
    return x
#     print(df['lng'])
def latdecimaldegrconv(lat):
    x = round(float(lat[:-2]) + (float(lat[-1]))/60, 1)
    return x
#     print(df['lat'])
hurdat['lat'] = hurdat['Latitude'].apply(latdecimaldegrconv)
hurdat['lng'] = hurdat['Longitude'].apply(lngdecimaldegrconv)

In [11]:
hurdat.head()

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Year,Month,Day,Hurricane Name,Hurricane Year,Hurricane Month,Total Deaths,US Impact? 1=yes,lat,lng
0,AL071969,BLANCHE,19690811,1800.0,,HU,35.5,69.9,75.0,997.0,...,1969,8,11,BLANCHE,1969,8,0,0,35.1,-69.2
1,AL081969,DEBBIE,19690816,1800.0,,HU,15.5,48.0,65.0,,...,1969,8,16,DEBBIE,1969,8,0,0,15.1,-48.0
2,AL091969,CAMILLE,19690815,2200.0,L,HU,21.9,84.3,95.0,,...,1969,8,15,CAMILLE,1969,8,296,1,21.1,-84.0
3,AL131969,FRANCELIA,19690901,1800.0,,HU,17.3,83.2,65.0,995.0,...,1969,9,1,FRANCELIA,1969,9,0,0,17.1,-83.0
4,AL161969,GERDA,19690908,1800.0,,HU,32.0,78.0,65.0,,...,1969,9,8,GERDA,1969,9,0,1,32.0,-78.0


In [14]:
hurdat['Status'].value_counts()

HU    303
Name: Status, dtype: int64