# UFO DATA EXPLORATION

In [None]:
from datetime import timedelta
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
%%time
# Load the scrubbed sightings CSV. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-type column
# inference at the cost of higher peak memory.
#raw_data = pd.read_csv('./data/complete.csv')
scrubbed_data = pd.read_csv('../raw_data/scrubbed.csv', low_memory=False)

In [None]:
# Preview the first rows to check the columns that were read.
scrubbed_data.head()

In [None]:
import seaborn as sns
# Shared look for every chart in this notebook: dark theme and larger text.
plt.style.use('dark_background')
plt.rcParams['font.size'] = 14
# No plt.figure() call here: the charts below use sns.catplot, which is
# figure-level and builds its own figure (sized by height/aspect), so a figure
# created up front would only be rendered as an empty extra plot.
# Ten colours from the 'Paired' palette, reused by all count plots.
palette = sns.color_palette('Paired', 10)

In [None]:
scrubbed_data['country'].describe()

In [None]:
sns.catplot(x="country", kind="count", palette=palette, data=scrubbed_data, height=5, aspect=3);
sns.despine()
plt.title('Sightings per country');
plt.show()

In [None]:
def extract_time_features(df):
    """Parse the ``datetime`` column and derive calendar features from it.

    The raw column is expected to hold strings in ``%m/%d/%Y %H:%M`` form in
    which midnight may be written as ``24:00``. Those values are parsed as
    ``0:00`` and then moved forward by one day so they still denote the end
    of the stated date.

    Note:
        ``df`` itself is modified: its ``datetime`` column is overwritten with
        the parsed timestamps, the helper columns ``datetime_zero`` and
        ``datetime_er`` plus the feature columns are added, and its index is
        replaced by the parsed timestamps. Only the returned frame has a
        fresh ``0..n-1`` index.

    Args:
        df: DataFrame with a string ``datetime`` column.

    Returns:
        A DataFrame with a reset integer index and the added columns ``dow``
        (Monday=0), ``hour``, ``month`` and ``year``.

    Raises:
        ValueError: If a non-null value does not match ``%m/%d/%Y %H:%M``
            after the ``24:00`` substitution.
    """
    time_column = "datetime"
    raw = df[time_column]

    # "24:00" is outside the range %H accepts, so remember which rows used it
    # before rewriting them. Literal matching (regex=False) keeps the result
    # independent of the pandas version's regex default; na=False keeps the
    # mask strictly boolean when the raw value is missing.
    is_midnight_24 = raw.str.contains('24:00', regex=False, na=False)
    df['datetime_zero'] = raw.str.replace('24:00', '0:00', regex=False)
    df['datetime_er'] = pd.to_datetime(df['datetime_zero'], format='%m/%d/%Y %H:%M')

    # Build the one-day rollover as an explicit timedelta series rather than
    # relying on bool * timedelta arithmetic.
    rollover = pd.to_timedelta(is_midnight_24.astype(int), unit='D')
    df[time_column] = df['datetime_er'] + rollover

    # A DatetimeIndex gives vectorised access to the calendar fields.
    df.index = pd.DatetimeIndex(df[time_column])
    df["dow"] = df.index.weekday
    df["hour"] = df.index.hour
    df["month"] = df.index.month
    df["year"] = df.index.year
    return df.reset_index(drop=True)

In [None]:
time_df = extract_time_features(scrubbed_data)

In [None]:
sns.catplot(x="hour", kind="count", palette=palette, data=time_df, height=5, aspect=3);
sns.despine()
plt.title('Hour of Day');
plt.show()

In [None]:
sns.catplot(x="year", kind="count", palette=palette, data=time_df, height=10, aspect=2);
sns.despine()
plt.title('Year');
plt.show()

In [None]:
sns.catplot(x="dow", kind="count", palette=palette, data=time_df, height=5, aspect=3);
sns.despine()
plt.title('Day of Week');
plt.show()

In [None]:
!pip install folium
import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime

In [None]:
center_location = 29.8830556, -97.9411111
m = folium.Map(location=center_location, control_scale=True, zoom_start=3)

In [None]:
location_df = scrubbed_data[['latitude',
                             'longitude ']]

In [None]:
location_df['count'] = 1
heatmap_data = location_df.head(10000).groupby(['latitude', 'longitude ']).sum().reset_index().values.tolist()
gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
HeatMap(data=heatmap_data, radius=5, gradient=gradient, max_zoom=13).add_to(m)
m

In [None]:
time_df['count'] = 1
heatmap_data_by_hour = []
__df__ = time_df.head(10000)
for hour in time_df.hour.sort_values().unique():
    _df = __df__[__df__.hour == hour][['longitude ', 'latitude', 'count']].groupby(['latitude', 'longitude ']).sum().reset_index().values.tolist()
    heatmap_data_by_hour.append(_df)

In [None]:
m2 = folium.Map(location=center_location, control_scale=True, zoom_start=11)
HeatMapWithTime(heatmap_data_by_hour, radius=5, 
                gradient=gradient, 
                min_opacity=0.5, max_opacity=0.8, 
                use_local_extrema=False).add_to(m2)
m2

In [None]:
new_data = pd.read_csv('../raw_data/ufo.csv')

In [None]:
new_data.head()

In [None]:
print(len(new_data))
new_data = new_data.drop_duplicates()
len(new_data)

In [None]:
print(len(new_data))
new_data = new_data.dropna()
len(new_data)

In [None]:
new_data['Datetime'] = pd.to_datetime(new_data['Datetime'], errors='coerce')
new_data.dtypes

In [None]:
latest_date = scrubbed_data['datetime'].max()
latest_date

In [None]:
new_data = new_data[new_data['Datetime'] > latest_date]
len(new_data)

In [None]:
new_data.min()

In [None]:
new_data['full_address'] = new_data.City + ',' + new_data.State

In [None]:
from geopy.geocoders import Nominatim

In [None]:
# Shared Nominatim geocoding client used by geocode_my_address below.
# timeout is the per-request limit in seconds; user_agent is the identifier
# sent to the service with each request.
geolocator = Nominatim(timeout=10, user_agent = "myGeolocator")

def geocode_my_address(addr, geocoder=None):
    """Look up the coordinates of ``addr`` without raising on a bad address.

    Args:
        addr: Address string to geocode.
        geocoder: Optional object exposing ``geocode(addr)``. Defaults to the
            notebook-level ``geolocator``.

    Returns:
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when the
        address has no match or the lookup raises.
    """
    if geocoder is None:
        geocoder = geolocator
    print('geocoding:', addr)
    try:
        location = geocoder.geocode(addr)
    except Exception:
        # Best effort: one failing lookup must not abort a whole batch.
        # Catching Exception rather than using a bare except lets Ctrl-C
        # (KeyboardInterrupt) still stop a long-running batch.
        print("problem with address:", addr)
        return None, None
    if location is None:
        # The geocoder found no match for this address.
        print("problem with address:", addr)
        return None, None
    return location.latitude, location.longitude


In [None]:
# Geocode each distinct address once and reuse the result for every row that
# shares it, so repeated addresses do not trigger repeated network lookups.
coords_by_address = {}
row_coords = []
for address in new_data['full_address']:
    if address not in coords_by_address:
        coords_by_address[address] = geocode_my_address(address)
    row_coords.append(coords_by_address[address])
# Built from per-row lists so an empty frame yields two empty columns instead
# of failing while unpacking.
new_data['latitude'] = [lat for lat, _ in row_coords]
new_data['longitude'] = [lng for _, lng in row_coords]


In [None]:
# Alternative path: read coordinates from a 'geocode' column of geocoder
# results. No visible cell creates that column, so the step is skipped when
# it is absent instead of raising KeyError; entries that are None (no match)
# yield None coordinates.
if 'geocode' in new_data.columns:
    new_data['latitude'] = [g.latitude if g is not None else None for g in new_data['geocode']]
    new_data['longitude'] = [g.longitude if g is not None else None for g in new_data['geocode']]