### Introduction
This notebook explores a dataset of recorded meteorite landings: where they were recorded, how massive they were, when they were recorded, and how they are classified. 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from shapely.geometry import Point
import geopandas as gpd

# Absolute path to the meteorite landings CSV (machine-specific; adjust when
# running the notebook elsewhere)
meteorite_csv_path = '/Users/heatheradler/Documents/GitHub/Springboard/Springboard_Projects/Data Storytelling/Meteorite_Landings.csv'

# Read the CSV into a DataFrame and preview its first rows
df = pd.read_csv(meteorite_csv_path)
df.head()

### Data Exploration
We start by exploring the dataset to understand its structure and clean it if necessary. 

In [None]:
# Give the mass and coordinate columns descriptive snake_case names
column_renames = {
    "mass (g)": "mass_in_grams",
    "reclat": "latitude",
    "reclong": "longitude",
}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Check for missing values and data types:
# df.info() prints each column's dtype and non-null count
df.info()

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

### Data Cleaning
Next, we drop rows with missing values in the `mass_in_grams`, `latitude`, and `longitude` columns and replace the remaining null values with 0 so that there are no longer any missing values.

In [None]:
# Keep only rows that have a mass and both coordinates, then replace every
# remaining null with 0. Written as one non-mutating chain so `df_1` is a
# fresh frame rather than a derived frame that is then modified in place.
# NOTE(review): fillna(0) writes 0 into every column that still has nulls,
# including non-numeric ones and `year` -- confirm that placeholder is intended.
df_1 = (
    df.dropna(subset=['mass_in_grams', 'latitude', 'longitude'])
    .fillna(0)
)

In [None]:
# Per-column null counts, used to confirm the cleaning step left no missing values
df_1.isna().sum()

### Data Analysis
We'll perform various analyses to explore the dataset more deeply.

In [None]:
# Country boundaries used to look up the country of each landing.
# Only this low-resolution Natural Earth layer is loaded: a separate read of a
# local shapefile into `world` would be overwritten by this assignment anyway.
# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases --
# confirm the installed version still provides it.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Build point geometries (x = longitude, y = latitude) from the cleaned frame,
# so no points are created from rows with missing coordinates.
geometry = gpd.points_from_xy(df_1['longitude'], df_1['latitude'])

# GeoDataFrame of the landings in WGS84 (EPSG:4326) coordinates
gdf_points = gpd.GeoDataFrame(df_1, geometry=geometry, crs='EPSG:4326')

# Spatial join: attach the ISO alpha-3 code of the country polygon each point
# lies within. `predicate=` is the current name of the deprecated `op=` keyword.
joined = gpd.sjoin(gdf_points, world[['geometry', 'iso_a3']], predicate='within')

# One country code per landing. If a point matched more than one polygon the
# join repeats its index label; keep the first match so the index-aligned
# assignment below cannot fail on duplicate labels.
country_names = joined['iso_a3']
country_names = country_names[~country_names.index.duplicated(keep='first')]

# Add the country codes back to the cleaned DataFrame (aligned on index);
# landings that fall within no country polygon get a missing value.
df_1['country'] = country_names

In [None]:
# Lookup table from ISO alpha-3 code to country name. A code that occurs more
# than once in `world` is ambiguous: a left merge on it would emit one output
# row per matching country, duplicating landings under the wrong names. Such
# codes are excluded, so the affected landings keep a missing country name and
# the merged frame has exactly one row per landing.
# NOTE(review): the Natural Earth low-res layer is known to share a placeholder
# code between a few countries -- verify; carrying the country name through the
# spatial join instead would retain those countries.
country_lookup = world[['iso_a3', 'name']].drop_duplicates(subset='iso_a3', keep=False)

merged_df = df_1.merge(country_lookup, how='left', left_on='country', right_on='iso_a3')

# Drop the 'iso_a3' column (it duplicates 'country')
merged_df = merged_df.drop(columns=['iso_a3'])

# Rename the country name column to 'country_name' for clarity.
# Assumes df_1 already has its own 'name' column, so the merge suffixes the
# country name as 'name_y' -- TODO confirm.
merged_df = merged_df.rename(columns={'name_y': 'country_name'})

In [None]:
# Preview the first rows of the merged frame
merged_df.head()

In [None]:
# List the column names of the merged frame
merged_df.columns

In [None]:
import folium

# Columns needed for the map; rows missing any of them are discarded
map_columns = ['country_name', 'latitude', 'longitude', 'mass_in_grams']
df1_filtered = merged_df[map_columns].dropna(subset=map_columns).copy()
coordinates = df1_filtered[['latitude', 'longitude']]

# Fixed-size figure holding a map whose initial location is the mean coordinate
f = folium.Figure(width=1000, height=500)
m = folium.Map(location=coordinates.mean().values.tolist()).add_to(f)

# One small blue circle per landing; the popup shows the country name and mass
for _, landing in df1_filtered.iterrows():
    marker = folium.CircleMarker(
        location=[landing['latitude'], landing['longitude']],
        radius=1,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        popup=f'Name: {landing["country_name"]}\nMass: {landing["mass_in_grams"]}g',
    )
    marker.add_to(m)

# Fit the view to the box spanned by the minimum and maximum coordinates
min_corner = coordinates.min().values.tolist()
max_corner = coordinates.max().values.tolist()
m.fit_bounds([min_corner, max_corner])

f

# Questions to Answer with Analytics:

### 1) What are the top 10 countries with the highest number of recorded meteorite landings?

In [None]:
# Count landings per country name and rank the countries from most to fewest
landings_per_country = merged_df.groupby(['country_name'])[['country']].count()
landings_per_country = landings_per_country.rename(columns={"country": "meteorite_landings"})
df1_country_count = landings_per_country.sort_values(
    by=['meteorite_landings'], ascending=False
).reset_index()

# Show the ten countries with the most recorded landings
df1_country_count.head(10)

In [None]:
import plotly.express as px

# Ten countries with the most landings, re-sorted in ascending order for plotting
top_countries = df1_country_count.head(10).sort_values(by=['meteorite_landings'], ascending=True)

# Human-readable axis labels for the two plotted columns
axis_labels = {
    'meteorite_landings': 'Meteorite Landings Count',
    'country_name': 'Country',
}

# Horizontal bar chart of landing counts per country
fig = px.bar(
    top_countries,
    x='meteorite_landings',
    y='country_name',
    orientation='h',
    title="Meteorite Landings by Country",
    labels=axis_labels,
)

fig.show()

### 2) What are the top 10 meteorite landings worldwide based on their mass?

In [None]:
# Order all landings from heaviest to lightest with a fresh 0..n-1 index
landings_by_mass = merged_df.sort_values(by=['mass_in_grams'], ascending=False)
landings_by_mass = landings_by_mass.reset_index(drop=True).copy()

# Keep the ten heaviest landings that have a country name, with mass cast to int
df1_by_mass = landings_by_mass.dropna(subset=["country_name"]).head(10)
df1_by_mass['mass_in_grams'] = df1_by_mass['mass_in_grams'].astype(int)
df1_by_mass.head(10)

In [None]:
# Scatter of the ten heaviest landings: year on x, country name on y,
# marker size taken from the mass and marker colour from the country code
fig = px.scatter(
    df1_by_mass,
    x="year",
    y="country_name",
    size='mass_in_grams',
    color="country",
    width=1000,
    height=400,
)

# Title, axis titles and legend title
scatter_layout = dict(
    title="<b>Scatter Plot of Top 10 Heaviest Meteorite Landings</b>",
    xaxis_title="<b>Year of Meteorite Landings</b>",
    yaxis_title="<b>Country</b>",
    legend_title="Country",
)
fig.update_layout(**scatter_layout)
fig.show()

#### Observations:
(1) The heaviest meteorite landing was found in Namibia (1920).  
(2) Out of the top 10 heaviest meteorite landings, Namibia and Mexico recorded 2 each.    
(3) Out of the top 10 heaviest meteorite landings the oldest recorded was in Argentina (1575).

### 3) What is the total count of meteorite landings per year over time?

In [None]:
# Number of landings recorded in each year, ordered by year
landings_per_year = merged_df['year'].value_counts().sort_index().reset_index()

# Green bar chart of yearly counts, with the x-axis limited to 1950-2023
fig = px.bar(landings_per_year, x='year', y='count', width=1000, height=400)
fig.update_traces(marker_color='green')
fig.update_layout(
    title="<b>Bar Plot of Yearly Meteorite Landings</b>",
    xaxis_title="<b>Year of Meteorite Landings</b>",
    yaxis_title="<b>Total Meteorite Landings</b>",
    xaxis_range=[1950, 2023],
)
fig.show()

In [None]:
# Preview the first rows of the heaviest-landings table (df1_by_mass)
df1_by_mass.head()

### 4) What is the distribution of meteorite landings based on their fall/found types?

In [None]:
# Number of landings for each value of the 'fall' column
fall_counts = merged_df['fall'].value_counts().reset_index()

# Pie chart of landings split by fall type
fig = px.pie(fall_counts, names='fall', values='count', width=400, height=400)
fig.update_layout(title="<b>Meteorite Landings By Fall Type</b>")
fig.show()

### 5) What is the distribution of meteorite landings based on their class type?

In [None]:
# Number of landings per meteorite class
df1_class = merged_df['recclass'].value_counts().reset_index()

# Each class's share of all landings, rounded to a whole percent
total_count = df1_class['count'].sum()
df1_class['percentage'] = ((df1_class['count'] / total_count) * 100).round(0)

# Only classes at or above this share (in percent) are shown in the pie
threshold_percentage = 5
filtered_df = df1_class.loc[df1_class['percentage'] >= threshold_percentage]

# Pie chart with the class label and its percentage written inside each slice
fig = px.pie(filtered_df, names='recclass', values='percentage', width=600, height=600)
fig.update_traces(textposition='inside', texttemplate='%{label}<br>%{value}%')
fig.update_layout(title="<b>Meteorite Landings By Class Type</b>")
fig.show()

### 6) What are the top 10 meteorite classes based on their average mass within the class and their class count?

In [None]:
# Only landings with a positive mass contribute to the per-class statistics.
# The aggregation is run on this filtered frame; grouping `merged_df` directly
# would silently discard the filter.
positive_mass = merged_df[merged_df['mass_in_grams'] > 0]

# Mean mass (grams) and number of landings per meteorite class
df1_class = (
    positive_mass.groupby('recclass')['mass_in_grams']
    .agg(['mean', 'count'])
    .reset_index()
    .rename(columns={'mean': 'mass_in_grams'})
)

# Mean mass converted from grams to kilograms, rounded to one decimal
df1_class['mass_in_kg'] = (df1_class['mass_in_grams'] * 0.001).round(1)

# Rank classes by how many landings they contain and show the top ten
df1_class = df1_class.sort_values(by='count', ascending=False).reset_index(drop=True)
df1_class.head(10)

In [None]:
import plotly.graph_objects as go

# The first ten rows of the per-class table feed both traces
top_classes = df1_class.head(10)

# Bars: average mass per class (kg), drawn against the left y-axis
bar_trace = go.Bar(
    x=top_classes['recclass'],
    y=top_classes['mass_in_kg'],
    name='Average Mass of Meteorite (kg)',
)

# Line with markers: landings per class, drawn against the secondary y-axis
scatter_trace = go.Scatter(
    x=top_classes['recclass'],
    y=top_classes['count'],
    mode='markers+lines',
    name='Meteorite Class Count',
    yaxis='y2',
)

fig = go.Figure(data=[bar_trace, scatter_trace])

# Axis titles, right-hand secondary axis overlaid on the first, horizontal legend
figure_layout = dict(
    title="<b>Bar Plot of Average Meteorite Mass (in kg) by Class, Sorted by Class Count</b>",
    xaxis=dict(title='<b>Meteorite Class</b>'),
    yaxis=dict(title='<b>Average Mass of Meteorite (kg)</b>'),
    yaxis2=dict(title='<b>Meteorite Class Count</b>', overlaying='y', side='right'),
    legend=dict(orientation='h', yanchor='top', y=1.15, xanchor='right', x=1),
    width=1000,
    height=400,
)
fig.update_layout(**figure_layout)
fig.show()