# Importing the libraries

This section includes the libraries required for data handling, visualization, and interaction with external APIs.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests

# Extracting the data using API

Here, we use an API endpoint to fetch data iteratively. The process handles pagination by using an offset and limit for each request.


In [2]:
# API endpoint
endpoint = "https://data.nasa.gov/resource/gh4g-9sfh.json"
chunk_size = 1000  # Rows per request
all_data = []      # List to store all rows

offset = 0

# Fetch page after page until the API hands back an empty page.
while True:
    # Let requests build and URL-encode the query string.
    # "$order" pins a stable row order so consecutive pages neither overlap
    # nor skip rows while we advance the offset.
    params = {
        "$limit": chunk_size,
        "$offset": offset,
        "$order": ":id",
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(endpoint, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error body as data.
    response.raise_for_status()
    data = response.json()

    # A successful page is a JSON array of row objects. Anything else (for
    # example an error object) is truthy and would otherwise loop forever.
    if not isinstance(data, list):
        raise ValueError(f"Unexpected API response at offset {offset}: {data!r}")

    if not data:  # Empty page: every row has been fetched
        break

    all_data.extend(data)  # Accumulate this page's rows
    offset += chunk_size   # Move on to the next page

## Saving the Data to a CSV file

The raw data fetched from the API is converted to a DataFrame and saved locally, so later analysis does not need to call the API again.

In [3]:
# Build a DataFrame from the list of row dictionaries returned by the API
df = pd.DataFrame(all_data)
# DataFrame.info() prints its summary directly and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

# Persist the raw download; index=False keeps the row index out of the file
df.to_csv("meteorite_landings.csv", index=False)
print("Data saved to meteorite_landings.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   name                         45716 non-null  object
 1   id                           45716 non-null  object
 2   nametype                     45716 non-null  object
 3   recclass                     45716 non-null  object
 4   mass                         45585 non-null  object
 5   fall                         45716 non-null  object
 6   year                         45425 non-null  object
 7   reclat                       38401 non-null  object
 8   reclong                      38401 non-null  object
 9   geolocation                  38401 non-null  object
 10  :@computed_region_cbhk_fwbd  1659 non-null   object
 11  :@computed_region_nnqa_25f4  1659 non-null   object
dtypes: object(12)
memory usage: 4.2+ MB
None
Data saved to meteorite_landings.csv


In [4]:
# Preview the first rows of the freshly built DataFrame
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation,:@computed_region_cbhk_fwbd,:@computed_region_nnqa_25f4
0,Aachen,1,Valid,L5,21,Fell,1880-01-01T00:00:00.000,50.775,6.08333,"{'latitude': '50.775', 'longitude': '6.08333'}",,
1,Aarhus,2,Valid,H6,720,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"{'latitude': '56.18333', 'longitude': '10.23333'}",,
2,Abee,6,Valid,EH4,107000,Fell,1952-01-01T00:00:00.000,54.21667,-113.0,"{'latitude': '54.21667', 'longitude': '-113.0'}",,
3,Acapulco,10,Valid,Acapulcoite,1914,Fell,1976-01-01T00:00:00.000,16.88333,-99.9,"{'latitude': '16.88333', 'longitude': '-99.9'}",,
4,Achiras,370,Valid,L6,780,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95,"{'latitude': '-33.16667', 'longitude': '-64.95'}",,


## Dropping the irrelevant columns and the NA rows

In [5]:
# Drop the ":@computed_region_*" columns by name rather than by position.
# A positional drop of the last two columns is not idempotent: re-running the
# cell would silently remove two more (real) columns, and it breaks if the API
# ever returns the columns in a different order.
computed_region_cols = [
    col for col in df.columns if col.startswith(":@computed_region")
]
df = df.drop(columns=computed_region_cols)

In [6]:
# Preview the frame after the column drop
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation
0,Aachen,1,Valid,L5,21,Fell,1880-01-01T00:00:00.000,50.775,6.08333,"{'latitude': '50.775', 'longitude': '6.08333'}"
1,Aarhus,2,Valid,H6,720,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"{'latitude': '56.18333', 'longitude': '10.23333'}"
2,Abee,6,Valid,EH4,107000,Fell,1952-01-01T00:00:00.000,54.21667,-113.0,"{'latitude': '54.21667', 'longitude': '-113.0'}"
3,Acapulco,10,Valid,Acapulcoite,1914,Fell,1976-01-01T00:00:00.000,16.88333,-99.9,"{'latitude': '16.88333', 'longitude': '-99.9'}"
4,Achiras,370,Valid,L6,780,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95,"{'latitude': '-33.16667', 'longitude': '-64.95'}"


In [7]:
# Discard every row containing at least one missing value, then renumber the
# remaining rows from 0 so the index is contiguous again.
df = df.dropna().reset_index(drop=True)


In [8]:
# Check the resulting DataFrame.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; print(df.info()) would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38115 entries, 0 to 38114
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         38115 non-null  object
 1   id           38115 non-null  object
 2   nametype     38115 non-null  object
 3   recclass     38115 non-null  object
 4   mass         38115 non-null  object
 5   fall         38115 non-null  object
 6   year         38115 non-null  object
 7   reclat       38115 non-null  object
 8   reclong      38115 non-null  object
 9   geolocation  38115 non-null  object
dtypes: object(10)
memory usage: 2.9+ MB
None


In [9]:
# Preview the frame after removing incomplete rows
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation
0,Aachen,1,Valid,L5,21,Fell,1880-01-01T00:00:00.000,50.775,6.08333,"{'latitude': '50.775', 'longitude': '6.08333'}"
1,Aarhus,2,Valid,H6,720,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"{'latitude': '56.18333', 'longitude': '10.23333'}"
2,Abee,6,Valid,EH4,107000,Fell,1952-01-01T00:00:00.000,54.21667,-113.0,"{'latitude': '54.21667', 'longitude': '-113.0'}"
3,Acapulco,10,Valid,Acapulcoite,1914,Fell,1976-01-01T00:00:00.000,16.88333,-99.9,"{'latitude': '16.88333', 'longitude': '-99.9'}"
4,Achiras,370,Valid,L6,780,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95,"{'latitude': '-33.16667', 'longitude': '-64.95'}"


## year column

In [10]:
# Parse the 'year' strings; anything pandas cannot convert becomes NaT
parsed_years = pd.to_datetime(df['year'], errors='coerce')

# Keep the rows whose year failed to parse and show their original values
invalid_years = df[parsed_years.isna()]
print(invalid_years['year'])

167      1583-01-01T00:00:00.000
268      1400-01-01T00:00:00.000
273      1492-01-01T00:00:00.000
354      1628-01-01T00:00:00.000
393      1621-01-01T00:00:00.000
605      1632-01-01T00:00:00.000
634      1637-01-01T00:00:00.000
675      0861-01-01T00:00:00.000
706      1671-01-01T00:00:00.000
818      1491-01-01T00:00:00.000
920      1623-01-01T00:00:00.000
997      1668-01-01T00:00:00.000
5264     1576-01-01T00:00:00.000
22408    1600-01-01T00:00:00.000
Name: year, dtype: object


These dates fail to parse because pandas stores timestamps, by default, as 64-bit nanosecond counts, which can only represent dates between roughly 1677-09-21 and 2262-04-11. Any year outside that window (here, everything before 1678) cannot be represented and is turned into `NaT` by `errors='coerce'`. Fortunately, this is not a problem for us, since our analysis only uses data from recent history.

In [11]:
# Convert the timestamp strings to datetimes (unparseable values become NaT)...
year_as_datetime = pd.to_datetime(df['year'], errors='coerce')
# ...and replace the column with just the calendar year component
df['year'] = year_as_datetime.dt.year

In [12]:
# Inspect dtypes and non-null counts after converting the 'year' column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38115 entries, 0 to 38114
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         38115 non-null  object 
 1   id           38115 non-null  object 
 2   nametype     38115 non-null  object 
 3   recclass     38115 non-null  object 
 4   mass         38115 non-null  object 
 5   fall         38115 non-null  object 
 6   year         38101 non-null  float64
 7   reclat       38115 non-null  object 
 8   reclong      38115 non-null  object 
 9   geolocation  38115 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.9+ MB


In [13]:
# Remove the rows left with a missing value (the years that failed to parse)
# and rebuild a contiguous 0-based index in the same step.
df = df.dropna().reset_index(drop=True)

In [14]:
# Select rows whose year lies outside the 1678-2016 window
year_col = df['year']
invalid_dates = df[year_col.lt(1678) | year_col.gt(2016)]
invalid_dates

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation
24382,Northwest Africa 7701,57150,Valid,CK6,55,Found,2101.0,0.0,0.0,"{'latitude': '0.0', 'longitude': '0.0'}"


In [15]:
# Keep only rows whose year is within 1678..2016 (both ends inclusive)
df = df[df['year'].between(1678, 2016)]


In [16]:
# Re-check the row count and dtypes after the year-range filter
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38100 entries, 0 to 38100
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         38100 non-null  object 
 1   id           38100 non-null  object 
 2   nametype     38100 non-null  object 
 3   recclass     38100 non-null  object 
 4   mass         38100 non-null  object 
 5   fall         38100 non-null  object 
 6   year         38100 non-null  float64
 7   reclat       38100 non-null  object 
 8   reclong      38100 non-null  object 
 9   geolocation  38100 non-null  object 
dtypes: float64(1), object(9)
memory usage: 3.2+ MB


## geolocation, reclat, reclong

In [17]:
# Cast both coordinate columns from text to numbers; values that cannot be
# parsed are turned into NaN instead of raising.
for coord_col in ('reclat', 'reclong'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

In [18]:
# Build one mask per coordinate, then keep rows that satisfy both:
# latitude within [-90, 90] and longitude within [-180, 180].
lat_in_range = df['reclat'].between(-90, 90)
lon_in_range = df['reclong'].between(-180, 180)
df = df[lat_in_range & lon_in_range]

In [19]:
# Remove the 'geolocation' column from the frame
df = df.drop(columns=['geolocation'])

In [20]:
# Map the dataset's original column names to more descriptive ones
column_renames = {
    'reclat': 'latitude',
    'reclong': 'longitude',
    'nametype': 'type',
    'recclass': 'classification',
}
df = df.rename(columns=column_renames)

In [21]:
# Verify the renamed columns and the numeric latitude/longitude dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38100 entries, 0 to 38100
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            38100 non-null  object 
 1   id              38100 non-null  object 
 2   type            38100 non-null  object 
 3   classification  38100 non-null  object 
 4   mass            38100 non-null  object 
 5   fall            38100 non-null  object 
 6   year            38100 non-null  float64
 7   latitude        38100 non-null  float64
 8   longitude       38100 non-null  float64
dtypes: float64(3), object(6)
memory usage: 2.9+ MB


## mass

In [22]:
# Cast the 'mass' column to numbers; unparseable entries become NaN
mass_numeric = pd.to_numeric(df['mass'], errors='coerce')
df['mass'] = mass_numeric

In [23]:
# Summary statistics for mass, used to spot potential outliers (optional)
mass_summary = df['mass'].describe()
print(mass_summary)

count    3.810000e+04
mean     1.402119e+04
std      5.719672e+05
min      0.000000e+00
25%      6.630000e+00
50%      2.900000e+01
75%      1.868500e+02
max      6.000000e+07
Name: mass, dtype: float64


In [24]:
# Keep only rows with a strictly positive mass (drops zero and negative values)
df = df[df['mass'].gt(0)]

In [25]:
# Confirm the row count and dtypes after removing non-positive masses
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38081 entries, 0 to 38100
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            38081 non-null  object 
 1   id              38081 non-null  object 
 2   type            38081 non-null  object 
 3   classification  38081 non-null  object 
 4   mass            38081 non-null  float64
 5   fall            38081 non-null  object 
 6   year            38081 non-null  float64
 7   latitude        38081 non-null  float64
 8   longitude       38081 non-null  float64
dtypes: float64(4), object(5)
memory usage: 2.9+ MB


## Fall column

In [26]:
# Tally how many records carry each distinct value of the 'fall' column;
# value_counts() returns the tallies sorted from most to least frequent.
fall_counts = df['fall'].value_counts()

In [27]:
# Display the per-category tallies of the 'fall' column
fall_counts

fall
Found    37028
Fell      1053
Name: count, dtype: int64

## id column

In [28]:
# True when no value in the 'id' column is repeated, False otherwise
ids_are_unique = df['id'].is_unique
print(ids_are_unique)

True


## classification (originally `recclass`)

In [29]:
# List the distinct values of the 'classification' column
unique_classifications = df['classification'].unique()
print(unique_classifications)

['L5' 'H6' 'EH4' 'Acapulcoite' 'L6' 'LL3-6' 'H5' 'L' 'Diogenite-pm' 'H4'
 'H' 'Iron, IVA' 'CR2-an' 'LL5' 'CI1' 'L/LL4' 'Eucrite-mmict' 'CV3'
 'Ureilite-an' 'Stone-uncl' 'L3' 'Angrite' 'LL6' 'L4' 'Aubrite'
 'Iron, IIAB' 'Iron, IAB-sLL' 'Iron, ungrouped' 'CM2' 'Mesosiderite-A1'
 'LL4' 'C2-ung' 'LL3.8' 'Howardite' 'Eucrite-pmict' 'Diogenite' 'LL3.15'
 'LL3.9' 'Iron, IAB-MG' 'H/L3.9' 'Eucrite' 'H4-an' 'L/LL6' 'Iron, IIIAB'
 'OC' 'H/L4' 'H4-5' 'L3.7' 'LL3.4' 'Martian (chassignite)' 'EL6' 'H3.8'
 'H3-5' 'H5-6' 'Mesosiderite' 'H5-7' 'L3-6' 'H4-6' 'Ureilite'
 'Mesosiderite-A3/4' 'CO3.3' 'H3' 'EH3/4-an' 'Iron, IIE' 'L/LL5' 'H3.7'
 'CBa' 'H4/5' 'H3/4' 'H?' 'H3-6' 'L3.4' 'Iron, IAB-sHL' 'L3.7-6'
 'Iron, IID' 'EH7-an' 'CR2' 'CO3.2' 'K3' 'L5/6' 'CK4' 'Iron, IIE-an'
 'L3.6' 'Iron' 'LL3.2' 'CO3.5' 'Lodranite' 'Mesosiderite-A3' 'L3-4' 'H5/6'
 'Pallasite, PMG' 'Eucrite-cm' 'Pallasite' 'L5-6' 'CO3.6'
 'Martian (nakhlite)' 'LL3.6' 'C3-ung' 'H3-4' 'CO3.4' 'EH3'
 'Iron, IAB-ung' 'Winonaite' 'LL' 'Eucrite-b

In [30]:
# Count how many records belong to each classification;
# value_counts() orders the result from most to least frequent.
classification_counts = df['classification'].value_counts()
print(classification_counts)

classification
L6                7518
H5                6243
H6                3897
H4                3880
L5                3264
                  ... 
H4(?)                1
EL4/5                1
L5-7                 1
H/L3.9               1
Mesosiderite-C       1
Name: count, Length: 420, dtype: int64


## type

In [31]:
# List the distinct values of the 'type' column
unique_types = df['type'].unique()
print(unique_types)


['Valid' 'Relict']


In [32]:
# Count how many records carry each value of the 'type' column;
# value_counts() orders the result from most to least frequent.
type_counts = df['type'].value_counts()
print(type_counts)

type
Valid     38078
Relict        3
Name: count, dtype: int64


In [33]:
# Order the rows chronologically, earliest year first (ascending is the default)
df = df.sort_values('year')

In [34]:
# Preview the earliest records after sorting by year
df.head()

Unnamed: 0,name,id,type,classification,mass,fall,year,latitude,longitude
838,Sasagase,23187,Valid,H,695.0,Fell,1688.0,34.71667,137.78333
843,Schellin,23457,Valid,L,7000.0,Fell,1715.0,53.35,15.05
30539,Siratik,23615,Valid,"Iron, IIAB",1700.0,Found,1716.0,14.0,-11.0
756,Ploschkovitz,18849,Valid,L5,39.0,Fell,1723.0,50.53333,14.11667
30619,Steinbach,23722,Valid,"Iron, IVA-an",98000.0,Found,1724.0,50.5,12.5


In [35]:
# Write the cleaned, year-sorted DataFrame to disk;
# index=False leaves the row index out of the CSV file.
df.to_csv('cleaned_meteorite_landings.csv', index=False)

In [36]:
# Restrict the data to records from 1950 onwards (1950 itself included)
df_filtered = df.loc[df['year'].ge(1950)]

In [37]:
# Inspect the subset restricted to years >= 1950
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36459 entries, 15658 to 24428
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            36459 non-null  object 
 1   id              36459 non-null  object 
 2   type            36459 non-null  object 
 3   classification  36459 non-null  object 
 4   mass            36459 non-null  float64
 5   fall            36459 non-null  object 
 6   year            36459 non-null  float64
 7   latitude        36459 non-null  float64
 8   longitude       36459 non-null  float64
dtypes: float64(4), object(5)
memory usage: 2.8+ MB


In [38]:
# Write the 1950-onward subset to disk; index=False leaves the row index out of the CSV
df_filtered.to_csv('cleaned_meteorite_landing_from_1950.csv', index=False)