In [287]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from vincenty import vincenty

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 10000)

In [288]:
#Import natural disaster data
raw = pd.read_csv('natural_disaster_human_mobility.csv')

In [289]:
#Switch latitude and longtitude columns, rename and reorder, dataframe
df = raw
df = df.rename(columns = {'disaster.event': 'event', 'user.anon': 'user', 'latitude': 'long', 'longitude.anon': 'lat'})
df = df[['event', 'user', 'lat', 'long', 'time']]
df['time'] = pd.DatetimeIndex(df['time'])


In [290]:
#Clean event names
event_name={
'01_Wipha':'Wipha',
'02_Halong': 'Halong',
'06_Kalmaegi': 'Kalmaegi',
'08_Rammasun_Manila': 'Manila',
'12_Bohol': 'Bohol',
'12_Iquique': 'Iquidue',
'13_Napa': 'Napa',
'21_Norfolk': 'Norfolk',
'22_Hamburg': 'Hamburg',
'23_Atlanta': 'Atlanta',
'31_Phoenix': 'Phoenix',
'32_Detroit': 'Detroit',
'33_Baltimore': 'Baltimore',
'41_AuFire1': 'Australia1',
'42_AuFire2': 'Australia2'}

df['event_name'] = df['event'].map(event_name)

In [291]:
#Add event type
event_type={
'Wipha': 'Typhoon',
'Halong': 'Typhoon',
'Kalmaegi': 'Typhoon',
'Manila': 'Typhoon',
'Bohol': 'Earthquake',
'Iquidue': 'Earthquake',
'Napa': 'Earthquake',
'Norfolk': 'Winter_Storm',
'Hamburg': 'Winter_Storm',
'Atlanta': 'Winter_Storm',
'Phoenix': 'Thunderstorm',
'Detroit': 'Thunderstorm',
'Baltimore': 'Thunderstorm',
'Australia1': 'Wildfire',
'Australia2': 'Wildfire'}    

df['event_type'] = df['event_name'].map(event_type)
df = df.drop(columns = ['event'])

In [292]:
#Add date column, drop time column
df['date'] = df['time'].dt.date
df['date'] = pd.to_datetime(df['date'])
df = df.drop(columns = ['time'])
df.head(2)

Unnamed: 0,user,lat,long,event_name,event_type,date
0,0,24.515364,139.742561,Wipha,Typhoon,2013-10-20
1,0,24.569996,139.70288,Wipha,Typhoon,2013-10-20


In [None]:
#Add natural disaster tag
'Wipha' and '2013-10-11', '2013-10-12', '2013-10-13', '2013-10-14', '2013-10-15', '2013-10-16'
#'Halong'
#'Kalmaegi'
#'Manila' and '2014-7-11', '2014-7-12', '2014-7-13', '2014-7-14', '2014-7-15', '2014-7-16', '2014-7-17', '2014-7-18'
'Bohol' and '2013-10-15'
'Iquidue'and '2014-04-01'
'Napa' and '2014-08-24
'Norfolk'
'Hamburg'
'Atlanta'
'Phoenix': 
'Detroit': 
'Baltimore': 
'Australia1': 2013-10-17, 2013-10-28
'Australia2': 

    

In [298]:
def disaster_day(event_name, date):
    if event_name == 'Wipha' and str(date) == '2013-10-14 00:00:00':
        return True
    elif event_name == 'Wipha' and str(date) == '2013-10-04 00:00:00':
        return True
    elif event_name == 'Bohol' and str(date) == '2013-10-15 00:00:00':
        return True
    else:
        return False

In [294]:
sam.iloc[4,3], str(sam.iloc[4,5])

('Wipha', '2013-10-04 00:00:00')

In [295]:
disaster_day(sam.iloc[4,3], str(sam.iloc[4,5]))

True

In [177]:
sam = df.iloc[0:50,]
#sam

In [299]:
df['disaster_day'] = df.apply(lambda row: disaster_day((row['event_name']), (row['date'])), axis = 1)

In [300]:
df.head(100)

Unnamed: 0,user,lat,long,event_name,event_type,date,disaster_day
0,0,24.515364,139.742561,Wipha,Typhoon,2013-10-20,False
1,0,24.569996,139.70288,Wipha,Typhoon,2013-10-20,False
2,0,24.550507,139.701482,Wipha,Typhoon,2013-10-20,False
3,0,24.508685,139.728029,Wipha,Typhoon,2013-10-20,False
4,0,24.576521,139.702315,Wipha,Typhoon,2013-10-04,True
5,0,24.585992,139.670229,Wipha,Typhoon,2013-10-04,True
6,0,24.586646,139.773957,Wipha,Typhoon,2013-10-06,False
7,0,24.587096,139.773687,Wipha,Typhoon,2013-10-06,False
8,0,24.588075,139.766096,Wipha,Typhoon,2013-10-06,False
9,0,24.588359,139.76488,Wipha,Typhoon,2013-10-06,False


In [316]:
bohol2 = df[(df['event_name'] == 'Australia2')]
bohol2.describe()

Unnamed: 0,user,lat,long
count,64370.0,64370.0,64370.0
mean,3991.742691,-31.948435,151.008412
std,2253.930563,0.099613,0.231007
min,1.0,-32.115152,149.529539
25%,2011.0,-32.014988,150.920726
50%,4112.0,-31.974603,151.065357
75%,5923.0,-31.888433,151.166553
max,7845.0,-31.136003,151.199996


In [315]:
bohol1 = df[(df['event_name'] == 'Australia1')]
bohol1.describe()

Unnamed: 0,user,lat,long
count,43275.0,43275.0,43275.0
mean,2572.768018,-22.569533,151.059366
std,1381.956309,0.203378,0.247151
min,1.0,-22.879171,149.54253
25%,1492.0,-22.66144,150.89256
50%,2679.0,-22.627509,151.095968
75%,3774.0,-22.571184,151.269263
max,4829.0,-21.407894,151.499738


In [311]:
df.shape

(4686154, 7)

In [312]:
df.groupby(col = 'event_name').count()

TypeError: You have to supply one of 'by' and 'level'

In [8]:
all_data = df.groupby(['event', 'user'])
all_data = all_data.apply(lambda _all_data: _all_data.sort_values(by=['time']))

In [113]:
#Generate distance column
#all_data['lat_1'] = all_data['lat'].shift()
#all_data['long_1'] = all_data['long'].shift()
#all_data['dist_mi'] = all_data.apply(lambda row: vincenty((row['lat'], row['long']), (row['lat_1'], row['long_1'])), axis=1)

In [None]:
df = df.groupby('id', group_keys=False).apply(lambda x:x.iloc[1:])

In [None]:
all_data['date'] = all_data['time'].dt.date
all_data['time_diff'] = all_data['time'].diff()
all_data['time_diff_hour'] = all_data['time_diff']/np.timedelta64(1, 'h')
all_data['time_diff_day'] = all_data['time_diff']/np.timedelta64(1, 'D')
all_data['velocity_mph'] = all_data['dist_mi']/all_data['time_diff_hour']
all_data['dist_cent'] = napa.apply(lambda row: vincenty((row['lat'], row['long']), \
                                                    (all_data['lat'].mean(), all_data['long'].mean())), axis=1)
                                                    

In [None]:
#Calculate event location by averaging the first lat long entry for each user
all_data.groupby('event').max()

In [None]:
#Export average coordinates to csv for tableau
group.to_csv('ave_coord.csv')

In [None]:
napa2 = df[(df['event'] == '14_Napa')]
napa2.shape
napa2.to_csv('napa2.csv')
napa2.head()

In [None]:
napa = df[(df['event'] == '14_Napa')]

In [None]:
#Generate distance column
napa['lat_1'] = napa['lat'].shift()
napa['long_1'] = napa['long'].shift()
napa['dist_mi'] = napa.apply(lambda row: vincenty((row['lat'], row['long']), (row['lat_1'], row['long_1'])), axis=1)
napa['dist_ft'] = napa['dist_mi']*5260

In [None]:
napa['time_diff'] = napa['time'].diff()
napa['time_diff_hour'] = napa['time_diff']/np.timedelta64(1, 'h')
napa['time_diff_day'] = napa['time_diff']/np.timedelta64(1, 'D')

In [None]:
napa['velocity_mph'] = napa['dist_mi']/napa['time_diff_hour'] 

In [None]:
napa['dist_cent'] = napa.apply(lambda row: vincenty((row['lat'], row['long']), \
                                                    (napa['lat'].mean(), napa['long'].mean())), axis=1)


In [None]:
napa.to_csv('napa.csv')

In [None]:
napa['date'] = napa['time'].dt.date
napa['date'] = pd.to_datetime(napa['date'])

In [None]:
#napa['disaster_day'] = napa.where(napa['date'] == '2014-08-24')
napa['disaster_day'] = napa['date'].map(lambda x: 1 if x== '2014-08-24' else 0)

In [None]:
napa['disaster_day'] = (napa['date'] == '2014-08-24')
#sol['Completed'] = (sol['Installed Status'] == 'Installed').astype(int) 

In [None]:
napa.head(300)

In [None]:
napa.info()

In [None]:
napa_group_user = napa.groupby(['user', 'date', 'disaster_day']).mean()
napa_group_user

In [None]:

#pd.to_datetime(napa['time'],format = '%m/%d/%Y')

In [None]:
napa_group = napa.groupby(by = 'date').mean().reset_index()
napa_group

In [None]:
napa_group.info()

In [None]:
napa_group3 = napa_group.reset_index()
napa_group3

In [None]:
napa_group2 = napa.groupby(by = 'date').size()
napa_group2

In [None]:
sns.scatterplot(x=napa_group2, y=napa_group['velocity_mph'])

In [None]:
sns.scatterplot(x=napa_group3['lat'], y=napa_group3['long'], hue=napa_group3['date'])

In [None]:
sns.scatterplot(x=napa_group3['lat'], y=napa_group3['long'])

In [None]:
X = np.append(napa_group3['lat'],napa_group3['long']).transpose()

In [None]:
X = napa_group3[['lat', 'long']]

In [None]:
X = X.as_matrix()

In [None]:
X.shape

In [None]:
#helper function that allows us to display data in 2 dimensions an highlights the clusters
def display_cluster(X,km=[],num_clusters=0):
    color = 'brgcmyk'
    alpha = 0.5
    s = 20
    if num_clusters == 0:
        plt.scatter(X[:,0],X[:,1],c = color[0],alpha = alpha,s = s)
    else:
        for i in range(num_clusters):
            plt.scatter(X[km.labels_==i,0],X[km.labels_==i,1],c = color[i],alpha = alpha,s=s)
            plt.scatter(km.cluster_centers_[i][0],km.cluster_centers_[i][1],c = color[i], marker = 'x', s = 100)

In [None]:
num_clusters = 2
km = KMeans(n_clusters=num_clusters,random_state=10,n_init=1) # n_init, number of times the K-mean algorithm will run
km.fit(X)
display_cluster(X,km,num_clusters)

In [None]:
km.cluster_centers_

In [None]:
km.inertia_

In [None]:
plt.plot([KMeans(n_clusters=i).fit(X).inertia_ for i in range(1,10)])