In [22]:
import sqlite3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

def _save_and_close(fig, out_file):
    """Write *fig* to *out_file* and release it from pyplot's figure registry."""
    fig.savefig(out_file)
    # plt.clf() would only empty the figure; close() actually frees it so
    # repeated calls do not accumulate open (and, in notebooks, displayed
    # empty) figures.
    plt.close(fig)


def _paired_pearson(frame, x_col, y_col):
    """Compute Pearson's r on the rows where both columns are present.

    Returns:
        ``(r, p_value, pairs)`` where ``pairs`` is the NaN-free two-column
        frame that was correlated, or ``None`` when fewer than two complete
        pairs remain or either column is constant (r is undefined then).
    """
    # pearsonr does not drop missing values by default, so remove incomplete
    # pairs up front instead of letting NaN poison the statistic.
    pairs = frame[[x_col, y_col]].dropna()
    if (
        len(pairs) < 2
        or pairs[x_col].nunique() < 2
        or pairs[y_col].nunique() < 2
    ):
        return None
    r_value, p_value = pearsonr(pairs[x_col], pairs[y_col])
    return r_value, p_value, pairs


def _correlation_scatter(frame, x_col, y_col, pair_label, title, xlabel,
                         ylabel, out_file, fit_ylim=False):
    """Print the correlation of two columns and save an annotated scatter plot.

    When the correlation cannot be computed (see ``_paired_pearson``) a
    message is printed and no plot is written.
    """
    stats = _paired_pearson(frame, x_col, y_col)
    if stats is None:
        print(f"Not enough variation in data to perform correlation analysis between {pair_label}.")
        return

    r_value, p_value, pairs = stats
    print(f'Correlation between {pair_label}: {r_value}, p-value: {p_value}')

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=pairs, x=x_col, y=y_col, ax=ax)
    ax.set_title(f'{title}\nCorrelation: {r_value:.2f}, p-value: {p_value:.2e}')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if fit_ylim:
        # Pin the y-axis to the exact range of the plotted values.
        ax.set_ylim(pairs[y_col].min(), pairs[y_col].max())
    _save_and_close(fig, out_file)


def correlation_analysis_and_visualization(db_file='nasa_data.db'):
    """Run correlation analyses on the NASA SQLite data and save the plots.

    Reads the ``neows_data`` table and the ``meteorite_landings`` table
    (rows with non-null ``year`` and ``mass (g)``), prints Pearson
    correlations to stdout and writes these PNG files to the current
    working directory:

    * ``potentially_hazardous_asteroids.png``
    * ``miss_distance_vs_magnitude.png``
    * ``top_10_meteorite_class_distribution.png``
    * ``mass_vs_latitude_meteorite_landings.png``
    * ``mass_vs_longitude_meteorite_landings.png``

    A correlation (and its scatter plot) is skipped with a printed message
    when the two columns do not have at least two complete, non-constant
    pairs of values.

    Args:
        db_file: Path to the SQLite database file.

    Returns:
        None.
    """
    # The connection is only needed for the two reads; close it right away
    # and even if a query fails, rather than holding it through plotting.
    conn = sqlite3.connect(db_file)
    try:
        neows_data = pd.read_sql('SELECT * FROM neows_data', conn)
        meteorite_data = pd.read_sql('SELECT * FROM meteorite_landings WHERE year IS NOT NULL AND "mass (g)" IS NOT NULL', conn)
    finally:
        conn.close()

    # Distribution of potentially hazardous asteroids
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=neows_data, x='is_potentially_hazardous', ax=ax)
    ax.set_title('Potentially Hazardous Asteroids')
    ax.set_xlabel('Is Potentially Hazardous')
    ax.set_ylabel('Count')
    _save_and_close(fig, 'potentially_hazardous_asteroids.png')

    # Relationship between miss distance and magnitude
    _correlation_scatter(
        neows_data, 'miss_distance_km', 'absolute_magnitude_h',
        pair_label='miss distance and magnitude',
        title='Miss Distance vs Magnitude',
        xlabel='Miss Distance (km)',
        ylabel='Absolute Magnitude',
        out_file='miss_distance_vs_magnitude.png',
    )

    # Frequency distribution of meteorite classes (recclass) - top 10 only
    top_10_classes = meteorite_data['recclass'].value_counts().nlargest(10).index
    top_10_data = meteorite_data[meteorite_data['recclass'].isin(top_10_classes)]

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.countplot(data=top_10_data, y='recclass', order=top_10_classes, ax=ax)
    ax.set_title('Top 10 Most Frequent Meteorite Classes')
    ax.set_xlabel('Count')
    ax.set_ylabel('Meteorite Class')
    _save_and_close(fig, 'top_10_meteorite_class_distribution.png')

    # Correlation between mass and geographic points (latitude and longitude)
    if 'reclat' not in meteorite_data.columns or 'reclong' not in meteorite_data.columns:
        print("Columns 'reclat' and/or 'reclong' not found in meteorite_data, skipping geographic correlation analysis.")
        return

    geographic_axes = (
        ('reclat', 'latitude', 'Latitude', 'mass_vs_latitude_meteorite_landings.png'),
        ('reclong', 'longitude', 'Longitude', 'mass_vs_longitude_meteorite_landings.png'),
    )
    for column, lower_name, display_name, out_file in geographic_axes:
        _correlation_scatter(
            meteorite_data, column, 'mass (g)',
            pair_label=f'mass and {lower_name}',
            title=f'Mass vs {display_name} of Meteorite Landings',
            xlabel=display_name,
            ylabel='Mass (g)',
            out_file=out_file,
            fit_ylim=True,
        )

# Usage example: when this file is executed as the main program (not
# imported), run the full analysis with the function's default db_file.
if __name__ == "__main__":
    correlation_analysis_and_visualization()


Correlation between miss distance and magnitude: -0.3064389721210181, p-value: 0.0004584355653810183
Correlation between mass and latitude: 0.02923451641321672, p-value: 1.1397218213609485e-08
Correlation between mass and longitude: -0.021853853462669874, p-value: 1.9819796184217084e-05


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>