In [1]:
# Render matplotlib figures inline, directly in the notebook output
%matplotlib inline

In [12]:
from html import escape
import os

import folium
from folium import plugins
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

In [3]:
# Relative directories used by the rest of the notebook: input files are read
# from the first one, generated HTML maps are written to the second one.
data_directory, output_directory = "data/", "output/"

## Data Acquisition

First we'll create a `Pandas.DataFrame` out of a `json` file hosted by NASA.

In [5]:
# Download NASA's meteorite landings data set (JSON) straight into a DataFrame
df = pd.read_json(path_or_buf="https://data.nasa.gov/resource/y77d-th95.json")

Now we'll simply do some high level overview of the data.

## Initial Data High Level View

I like to always start out by looking at the thirty thousand foot view of any data set.

In [None]:
# Column names, dtypes, and non-null counts
df.info()

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# Peek at the first few rows
df.head()

We see twelve columns:
* five floats
* six strings or mixed data
* one int64

Additionally, the `geolocation` column is JSON, which is something I've never worked with inside of a Pandas DataFrame. Also, we may be able to leverage Pandas' DateTime `dtype` for the `year` column.

## Removing Redundant Data

As `geolocation`'s data is already represented in `reclat` and `reclong`, we'll simply remove it. We're specifically picking this column because it's a more complex JSON data type, whereas `reclat` and `reclong` are already separate columns.

In [6]:
# Drop the redundant JSON column. Rebinding `df` (rather than dropping in
# place) together with errors="ignore" keeps this cell safe to re-run: a second
# run no longer raises a KeyError for the already-removed column.
df = df.drop(columns="geolocation", errors="ignore")

## `NaN` Inspection

Let's look at all columns that have at least one `NaN` value.

In [None]:
# Flag every column that contains at least one NaN, then keep the flagged names
columns_with_nan = df.isna().any()
nan_columns = columns_with_nan[columns_with_nan].index.tolist()
nan_columns

We see that seven of the twelve columns have at least one `NaN` value. Let's look into how many `NaN` values are in each column so we can get an idea of how to proceed with cleaning.

In [None]:
# Map each NaN-containing column to its number of NaN values. The count uses
# pandas' vectorised isna().sum() instead of Python's builtin sum() walking the
# Series element by element; int() keeps the displayed values as plain ints.
nan_column_counts = {
    nan_column: int(df[nan_column].isna().sum()) for nan_column in nan_columns
}

nan_column_counts

We see here that the number of `NaN` values ranges from as high as 867 to as low as 1. We recall that there are 1000 rows in this data set, so that means most of the rows have a `NaN` value for `:@computed_region_cbhk_fwbd` and `:@computed_region_nnqa_25f4`.

We'll have to handle these after performing some more data inspection.

## Unique Values Inspection

We'll now look at the unique values.

_The following cell has been made a raw cell to avoid its large output from printing._

## `NaN` Handling

Since we're not building any specific model, we're going to leave the `NaN` values as they are. I just want to note that usually you'll have to handle the `NaN` values in a data set, or at the very least, be aware that they exist. There are many techniques for handling `NaN` values, but they won't be discussed here.

## Geospatial Visualizations

Now we're going to work on creating geospatial visualizations for our data set. These can be incredibly helpful for exploring your data, as well as when it comes time to present or share your work.

These visualizations can be handy as they can help you quickly answer questions. For example, currently we don't know how many meteorites land in the oceans. We'd expect that many do, in fact probably more often than on land, but we don't have an easy way to determine this. Once we have our visualizations created, we can quickly answer this question.

### Data Preparation

First, we'll need to prepare a dataframe of our latitude and longitude values.

In [7]:
# Keep only the rows where both coordinates are present, and index the result
# by the data set's own meteorite id.
geo_df = (
    df
    .dropna(subset=["reclat", "reclong"])
    .set_index("id")
)

In [8]:
# Sanity-check the first few rows of the coordinate-filtered frame
geo_df.head()

Unnamed: 0_level_0,:@computed_region_cbhk_fwbd,:@computed_region_nnqa_25f4,fall,mass,name,nametype,recclass,reclat,reclong,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,Fell,21.0,Aachen,Valid,L5,50.775,6.08333,1880-01-01T00:00:00.000
2,,,Fell,720.0,Aarhus,Valid,H6,56.18333,10.23333,1951-01-01T00:00:00.000
6,,,Fell,107000.0,Abee,Valid,EH4,54.21667,-113.0,1952-01-01T00:00:00.000
10,,,Fell,1914.0,Acapulco,Valid,Acapulcoite,16.88333,-99.9,1976-01-01T00:00:00.000
370,,,Fell,780.0,Achiras,Valid,L6,-33.16667,-64.95,1902-01-01T00:00:00.000


### Creation of the Visualizations

Everything looks good.

Now we'll create our visualizations. First let's make one with every row as a single marker. This may be overkill.

In [None]:
# Map with one marker per meteorite; each marker's popup shows that
# meteorite's details in a small HTML table.
markers_map = folium.Map(zoom_start=6, tiles="CartoDB dark_matter")

# Read the popup fields by column name rather than by position, and take the id
# straight from the row's index. Looking the id up by coordinates re-scans the
# whole frame for every row and returns the wrong id whenever two meteorites
# share the same location.
popup_columns = ["reclat", "reclong", "mass", "name", "recclass"]
popup_rows = geo_df[popup_columns].itertuples(name=None)

for index, latitude, longitude, mass, name, rec_class in popup_rows:
    # These two values are free text from the downloaded data set, so escape
    # them before embedding them in HTML.
    name = escape(str(name))
    rec_class = escape(str(rec_class))

    popup_html = f"""
    <table border="1">
        <tr>
            <th> Index </th>
            <th> Latitude </th>
            <th> Longitude </th>
            <th> Mass </th>
            <th> Name </th>
            <th> Recclass </th>
        </tr>
        <tr> 
            <td> {index} </td> 
            <td> {latitude} </td> 
            <td> {longitude} </td> 
            <td> {mass} </td>
            <td> {name} </td>
            <td> {rec_class} </td>
        </tr>
    </table>"""
    iframe = folium.IFrame(html=popup_html, width=375, height=125)
    popup = folium.Popup(iframe, max_width=375)

    folium.Marker(location=[latitude, longitude], popup=popup).add_to(markers_map)

# Make sure the output directory exists before writing the standalone HTML file
os.makedirs(output_directory, exist_ok=True)
markers_map.save(os.path.join(output_directory, "markers_map.html"))
markers_map

After seeing the visualization, I don't believe showing a single marker for every row is a good idea, as we have so much data that zooming out pretty far makes it difficult to understand what we're looking at. 

Let's cluster nearby rows to improve readability.

In [None]:
# Same markers as the one-marker-per-row map, but added to a MarkerCluster so
# nearby meteorites collapse into a single cluster when zoomed out.
clusters_map = folium.Map(zoom_start=6, tiles="CartoDB dark_matter")

clusters_map_cluster = MarkerCluster().add_to(clusters_map)

# Read the popup fields by column name rather than by position, and take the id
# straight from the row's index. Looking the id up by coordinates re-scans the
# whole frame for every row and returns the wrong id whenever two meteorites
# share the same location.
popup_columns = ["reclat", "reclong", "mass", "name", "recclass"]
popup_rows = geo_df[popup_columns].itertuples(name=None)

for index, latitude, longitude, mass, name, rec_class in popup_rows:
    # These two values are free text from the downloaded data set, so escape
    # them before embedding them in HTML.
    name = escape(str(name))
    rec_class = escape(str(rec_class))

    popup_html = f"""
    <table border="1">
        <tr>
            <th> Index </th>
            <th> Latitude </th>
            <th> Longitude </th>
            <th> Mass </th>
            <th> Name </th>
            <th> Recclass </th>
        </tr>
        <tr> 
            <td> {index} </td> 
            <td> {latitude} </td> 
            <td> {longitude} </td> 
            <td> {mass} </td>
            <td> {name} </td>
            <td> {rec_class} </td>
        </tr>
    </table>"""
    iframe = folium.IFrame(html=popup_html, width=375, height=125)
    popup = folium.Popup(iframe, max_width=375)

    folium.Marker(location=[latitude, longitude], popup=popup).add_to(clusters_map_cluster)

# Make sure the output directory exists before writing the standalone HTML file
os.makedirs(output_directory, exist_ok=True)
clusters_map.save(os.path.join(output_directory, "clusters_map.html"))
clusters_map

This looks much better.

Just for kicks, let's make a heat map as well!

In [None]:
heat_map = folium.Map(location=[51.5074, 0.1278], zoom_start=3, tiles="CartoDB dark_matter")

# HeatMap needs plain [latitude, longitude] float pairs with no NaNs. Build
# them in a separate frame so geo_df, which other cells also read, is left
# untouched by this cell.
heat_df = (
    geo_df[["reclat", "reclong"]]
    .astype(float)
    .rename(columns={"reclat": "latitude", "reclong": "longitude"})
    .dropna(axis=0, subset=["latitude", "longitude"])
)

# Convert to our list of [latitude, longitude] lists in one vectorised step
heat_data = heat_df[["latitude", "longitude"]].values.tolist()

# Plot it on the map
HeatMap(heat_data).add_to(heat_map)

# Save the map (creating the output directory if needed), then display it
os.makedirs(output_directory, exist_ok=True)
heat_map.save(os.path.join(output_directory, "heat_map.html"))
heat_map

We see here that most of the meteorites land on land. My prediction is that meteorites do in fact land in water, probably more often than on land due to water's higher proportion of the Earth's surface, but all the meteorites must be reported by humans, which explains all of the data points existing on land.

I'd go as far as to say that more highly populated areas are more likely to report meteorites, as well as non-first-world countries.

## Census Data

Now we'll explore, visually, the notion that more highly populated areas are more likely to report meteorites. We'll do that by combining our data with population data gathered from a census.

Let's start by redoing our first graphic, where every meteorite got its own marker, and we'll overlay the population of the world by country.

In [22]:
# Map centred on the mean meteorite coordinates, with two toggleable layers:
# one marker per meteorite, and countries coloured by their 2005 population.
# NOTE(review): "Mapbox bright" may no longer be offered as a built-in tileset
# by newer folium releases — confirm against the installed version.
markers_census_layered_map = folium.Map(
    location=[df['reclat'].mean(), df['reclong'].mean()],
    zoom_start=6,
    tiles='Mapbox bright',
)

# All meteorite markers live in one feature group so the layer control can
# switch them on and off together.
fg = folium.FeatureGroup(name="Meteorites")

# Read the popup fields by column name rather than by position, and take the id
# straight from the row's index. Looking the id up by coordinates re-scans the
# whole frame for every row and returns the wrong id whenever two meteorites
# share the same location.
popup_columns = ["reclat", "reclong", "mass", "name", "recclass"]
popup_rows = geo_df[popup_columns].itertuples(name=None)

for index, latitude, longitude, mass, name, rec_class in popup_rows:
    # These two values are free text from the downloaded data set, so escape
    # them before embedding them in HTML.
    name = escape(str(name))
    rec_class = escape(str(rec_class))

    popup_html = f"""
    <table border="1">
        <tr>
            <th> Index </th>
            <th> Latitude </th>
            <th> Longitude </th>
            <th> Mass </th>
            <th> Name </th>
            <th> Recclass </th>
        </tr>
        <tr> 
            <td> {index} </td> 
            <td> {latitude} </td> 
            <td> {longitude} </td> 
            <td> {mass} </td>
            <td> {name} </td>
            <td> {rec_class} </td>
        </tr>
    </table>"""
    iframe = folium.IFrame(html=popup_html, width=375, height=125)
    popup = folium.Popup(iframe, max_width=375)

    fg.add_child(folium.Marker(location=[latitude, longitude], popup=popup))

# add our markers to the map
markers_census_layered_map.add_child(fg)


def population_fill_style(feature):
    """Return the style dict for one GeoJSON country feature.

    The fill colour is chosen from the feature's ``POP2005`` property:
    green up to and including 10,000,000, orange above that and below
    20,000,000, and red otherwise.
    """
    population = feature["properties"]["POP2005"]
    if population <= 10000000:
        fill_color = "green"
    elif population < 20000000:
        fill_color = "orange"
    else:
        fill_color = "red"
    return {"fillColor": fill_color}


# add the census population outlined and colored countries to our map;
# the file is read inside a `with` block so the handle is always closed
world_geojson = os.path.join(data_directory, "world_geojson_from_ogr.json")
with open(world_geojson, "r", encoding="utf-8") as world_geojson_file:
    world_geojson_data = world_geojson_file.read()
markers_census_layered_map.add_child(
    folium.GeoJson(world_geojson_data, name="Population", style_function=population_fill_style)
)

# add a toggleable menu for all the layers
markers_census_layered_map.add_child(folium.LayerControl())

# save our map as a separate HTML file (creating the output directory if needed)
os.makedirs(output_directory, exist_ok=True)
markers_census_layered_map.save(outfile=os.path.join(output_directory, "markers_census_layered_map.html"))

# display our map inline
markers_census_layered_map