In [1]:
# Render matplotlib figures inline, directly in the notebook output
%matplotlib inline

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import folium
from folium import plugins
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster

## Data Acquisition

First we'll create a `Pandas.DataFrame` out of a `json` file hosted by NASA.

In [3]:
# NASA's public meteorite-landings data set, served as JSON
df = pd.read_json("https://data.nasa.gov/resource/y77d-th95.json")

# Show the frame keyed by `id`. The result is not assigned, so `df` itself
# keeps its default integer index and `id` remains a regular column.
df.set_index("id")

Unnamed: 0_level_0,:@computed_region_cbhk_fwbd,:@computed_region_nnqa_25f4,fall,geolocation,mass,name,nametype,recclass,reclat,reclong,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,,,Fell,"{'type': 'Point', 'coordinates': [6.08333, 50....",21.0,Aachen,Valid,L5,50.77500,6.08333,1880-01-01T00:00:00.000
2,,,Fell,"{'type': 'Point', 'coordinates': [10.23333, 56...",720.0,Aarhus,Valid,H6,56.18333,10.23333,1951-01-01T00:00:00.000
6,,,Fell,"{'type': 'Point', 'coordinates': [-113, 54.216...",107000.0,Abee,Valid,EH4,54.21667,-113.00000,1952-01-01T00:00:00.000
10,,,Fell,"{'type': 'Point', 'coordinates': [-99.9, 16.88...",1914.0,Acapulco,Valid,Acapulcoite,16.88333,-99.90000,1976-01-01T00:00:00.000
370,,,Fell,"{'type': 'Point', 'coordinates': [-64.95, -33....",780.0,Achiras,Valid,L6,-33.16667,-64.95000,1902-01-01T00:00:00.000
379,,,Fell,"{'type': 'Point', 'coordinates': [71.8, 32.1]}",4239.0,Adhi Kot,Valid,EH4,32.10000,71.80000,1919-01-01T00:00:00.000
390,,,Fell,"{'type': 'Point', 'coordinates': [95.16667, 44...",910.0,Adzhi-Bogdo (stone),Valid,LL3-6,44.83333,95.16667,1949-01-01T00:00:00.000
392,,,Fell,"{'type': 'Point', 'coordinates': [0.61667, 44....",30000.0,Agen,Valid,H5,44.21667,0.61667,1814-01-01T00:00:00.000
398,,,Fell,"{'type': 'Point', 'coordinates': [-65.23333, -...",1620.0,Aguada,Valid,L6,-31.60000,-65.23333,1930-01-01T00:00:00.000
417,,,Fell,"{'type': 'Point', 'coordinates': [-64.55, -30....",1440.0,Aguila Blanca,Valid,L,-30.86667,-64.55000,1920-01-01T00:00:00.000


Now we'll simply do some high level overview of the data.

## Initial Data High Level View

I like to always start out by looking at the thirty thousand foot view of any data set.

In [4]:
# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 12 columns):
:@computed_region_cbhk_fwbd    133 non-null float64
:@computed_region_nnqa_25f4    134 non-null float64
fall                           1000 non-null object
geolocation                    988 non-null object
id                             1000 non-null int64
mass                           972 non-null float64
name                           1000 non-null object
nametype                       1000 non-null object
recclass                       1000 non-null object
reclat                         988 non-null float64
reclong                        988 non-null float64
year                           999 non-null object
dtypes: float64(5), int64(1), object(6)
memory usage: 101.6+ KB


In [5]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,:@computed_region_cbhk_fwbd,:@computed_region_nnqa_25f4,id,mass,reclat,reclong
count,133.0,134.0,1000.0,972.0,988.0,988.0
mean,26.93985,1537.88806,15398.728,50190.2,29.691592,19.151208
std,12.706929,899.826915,10368.70402,753985.7,23.204399,68.644015
min,1.0,10.0,1.0,0.15,-44.11667,-157.86667
25%,17.0,650.25,7770.5,679.5,21.3,-5.195832
50%,24.0,1647.0,12757.5,2870.0,35.916665,17.325
75%,37.0,2234.25,18831.25,10050.0,45.817835,76.004167
max,50.0,3190.0,57168.0,23000000.0,66.34833,174.4


In [6]:
# First five rows of the raw frame
df.head()

Unnamed: 0,:@computed_region_cbhk_fwbd,:@computed_region_nnqa_25f4,fall,geolocation,id,mass,name,nametype,recclass,reclat,reclong,year
0,,,Fell,"{'type': 'Point', 'coordinates': [6.08333, 50....",1,21.0,Aachen,Valid,L5,50.775,6.08333,1880-01-01T00:00:00.000
1,,,Fell,"{'type': 'Point', 'coordinates': [10.23333, 56...",2,720.0,Aarhus,Valid,H6,56.18333,10.23333,1951-01-01T00:00:00.000
2,,,Fell,"{'type': 'Point', 'coordinates': [-113, 54.216...",6,107000.0,Abee,Valid,EH4,54.21667,-113.0,1952-01-01T00:00:00.000
3,,,Fell,"{'type': 'Point', 'coordinates': [-99.9, 16.88...",10,1914.0,Acapulco,Valid,Acapulcoite,16.88333,-99.9,1976-01-01T00:00:00.000
4,,,Fell,"{'type': 'Point', 'coordinates': [-64.95, -33....",370,780.0,Achiras,Valid,L6,-33.16667,-64.95,1902-01-01T00:00:00.000


We see twelve columns:
* five floats
* six strings or mixed data
* one int64

Additionally, the `geolocation` column is JSON, which is something I've never worked with inside of a Pandas DataFrame. Also, we may be able to leverage Pandas' DateTime `dtype` for the `year` column.

## Removing Redundant Data

As `geolocation`'s data is already represented in `reclat` and `reclong`, we'll simply remove it. We're specifically picking this column because it's a more complex JSON data type, whereas `reclat` and `reclong` are already separate columns.

In [7]:
# Drop the redundant JSON column. Rebinding `df` (instead of dropping in place)
# together with errors="ignore" keeps this cell safe to re-run: a second
# execution is a no-op rather than a KeyError for the already-missing column.
df = df.drop(columns="geolocation", errors="ignore")

## Setting Correct Types

Let's go through the columns and set the correct types so we can perform better analysis.

_I did not get around to this..._

## `NaN` Inspection

Let's look at all columns that have at least one `NaN` value.

In [8]:
# Select the columns flagged as containing at least one NaN and keep their
# names, in the frame's original column order.
nan_columns = df.loc[:, df.isna().any()].columns.tolist()
nan_columns

[':@computed_region_cbhk_fwbd',
 ':@computed_region_nnqa_25f4',
 'mass',
 'reclat',
 'reclong',
 'year']

We see that six of the eleven remaining columns have at least one `NaN` value (seven of the original twelve, if we count the `geolocation` column we dropped). Let's look into how many `NaN` values are in each column so we can get an idea of how to proceed with cleaning.

In [9]:
# Number of NaN values in each affected column, keyed by column name.
# A single vectorised isna().sum() replaces a Python-level sum over every
# element of every column; int() turns the numpy integers into plain ints.
nan_column_counts = {
    column: int(count)
    for column, count in df[nan_columns].isna().sum().items()
}

nan_column_counts

{':@computed_region_cbhk_fwbd': 867,
 ':@computed_region_nnqa_25f4': 866,
 'mass': 28,
 'reclat': 12,
 'reclong': 12,
 'year': 1}

We see here that the number of `NaN` values ranges from as high as 867 to as low as 1. Recall that there are 1000 rows in this data set, which means most of the rows have a `NaN` value for `:@computed_region_cbhk_fwbd` and `:@computed_region_nnqa_25f4`.

We'll have to handle these after performing some more data inspection.

## Unique Values Inspection

We'll now look at the unique values in each column.

In [10]:
# For every column, print how many distinct values it holds (nunique does not
# count NaN) followed by the distinct values themselves (unique does list NaN).
for column in df.columns:
    print(f"{column} has {df[column].nunique()} unique values:")
    print(df[column].unique())

:@computed_region_cbhk_fwbd has 38 unique values:
[nan 50. 49. 18. 23. 29. 20. 11. 37. 32. 21. 36. 10. 17. 34.  9. 47. 33.
 48. 15. 39. 31. 41. 19. 16.  1. 40. 13. 35.  7.  4. 45. 38. 22. 30.  8.
  3. 12. 24.]
:@computed_region_nnqa_25f4 has 131 unique values:
[  nan  429. 1723. 2697.  774. 3134.  602. 1989. 2373.  495.  662. 1921.
 2397. 2216. 1285. 1978. 1869. 1072. 2030.  657. 2495. 2164. 3063. 1795.
 2455.  256. 1029. 1448. 2695.  648.  414.  637. 2684. 2007. 2582. 2459.
   67.  877.  356. 2201. 2332. 1426.  244.  103. 2491. 1444. 2115.  462.
   88.  277. 1300. 2439.   70. 1631. 2331.  385.  807. 1785. 2839. 1470.
   99. 2985. 1205. 2025. 1855.  990.  284. 1657. 1293.  525. 3190. 2971.
 2018.  608. 1585. 2171. 2770. 2711. 1327. 1567.  611.  471.  287. 2740.
 2996.   11. 1290. 2957.  636. 2431.  237.  419. 2615. 1667. 1683. 2238.
   10. 1252. 1078. 1947.  503. 2122. 1023. 1863. 1994. 2185. 3062. 2017.
 1255.  207. 1987. 2566. 1391. 2388.  569. 2885. 2764.  150.  361. 2409.
 1174. 21

## `NaN` Handling

_I did not get around to this_

## Geolocation Visualization

First, we'll need to prepare a dataframe of our latitude and longitude values

In [11]:
# Keep only rows whose latitude AND longitude are both finite (no NaN/inf).
# Both conditions go into a single mask: filtering `df` twice in a row and
# assigning to the same name would keep only the second condition.
non_nan_df = df[np.isfinite(df['reclat']) & np.isfinite(df['reclong'])]

# Create a new dataframe of just the lat and long columns
geo_df = non_nan_df[['reclat', 'reclong']].copy()

Now we'll create our visualizations. First, let's make one with every row as a single marker. This may be overkill.

In [12]:
# Peek at the first rows of the latitude/longitude frame
geo_df.head()

Unnamed: 0,reclat,reclong
0,50.775,6.08333
1,56.18333,10.23333
2,54.21667,-113.0
3,16.88333,-99.9
4,-33.16667,-64.95


In [17]:
markers_map = folium.Map(zoom_start=6, tiles="CartoDB dark_matter")

# Walk the rows once, taking each row's label straight from the frame's index.
# Reading `reclat`/`reclong` (the columns geo_df is created with) means this
# cell does not depend on `latitude`/`longitude` columns that only exist after
# a later cell has run. It also avoids re-scanning the whole frame for every
# marker, and rows sharing identical coordinates each show their own index.
for index, latitude, longitude in geo_df[["reclat", "reclong"]].itertuples(name=None):
    # Small HTML table shown in the marker's popup
    html = f"""
    <table border="1">
        <tr>
            <th> Index </th>
            <th> Latitude </th>
            <th> Longitude </th>
        </tr>
        <tr> 
            <td> {index} </td> 
            <td> {latitude} </td> 
            <td> {longitude} </td> 
        </tr>
    </table>"""
    iframe = folium.IFrame(html=html, width=450, height=150)
    popup = folium.Popup(iframe, max_width=450)

    folium.Marker(location=[latitude, longitude], popup=popup).add_to(markers_map)

# Write a standalone HTML copy, then display the map as the cell output
markers_map.save("markers_map.html")
markers_map

After seeing the visualization, I don't believe showing a single marker for every row is a good idea. Let's cluster nearby rows.

In [14]:
clusters_map = folium.Map(zoom_start=6, tiles="CartoDB dark_matter")

# Every marker is added to this one MarkerCluster layer rather than to the
# map directly.
clusters_map_cluster = MarkerCluster().add_to(clusters_map)

# geo_df's first two columns are reclat and reclong, in that order
for latitude, longitude in geo_df[["reclat", "reclong"]].values:
    folium.Marker(location=[latitude, longitude]).add_to(clusters_map_cluster)

# Write a standalone HTML copy, then display the map as the cell output
clusters_map.save("clusters_map.html")
clusters_map

This looks much better.

Just for kicks, let's make a heat map as well!

In [15]:
heat_map = folium.Map(location=[51.5074, 0.1278], zoom_start=3, tiles="CartoDB dark_matter")

# Ensure you're handing it floats (these two columns are added to geo_df itself)
geo_df['latitude'] = geo_df["reclat"].astype(float)
geo_df['longitude'] = geo_df["reclong"].astype(float)

# Select the two coordinate columns and drop any row with a missing value
heat_df = geo_df[['latitude', 'longitude']].dropna(axis=0, subset=['latitude', 'longitude'])

# Build the list of [latitude, longitude] pairs in one vectorised step
# instead of iterating row by row with iterrows()
heat_data = heat_df.values.tolist()

# Plot it on the map
HeatMap(heat_data).add_to(heat_map)

# Write a standalone HTML copy, then display the map as the cell output
heat_map.save("heat_map.html")
heat_map

We see here that most of the meteorites land on land. My prediction is that meteorites do in fact land in water, probably more often than on land due to water's higher proportion of Earth's surface, but all the meteorites must be reported by humans, which explains why all of the data points are on land.

I'd go as far as to say that more highly populated areas are more likely to report meteorites, as well as non-first-world countries.