In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the near-earth-objects CSV from the Kaggle input directory and show (rows, columns).
NEO_CSV_PATH = '/kaggle/input/nasa-nearest-earth-objects/neo.csv'
nasa = pd.read_csv(NEO_CSV_PATH)
nasa.shape

In [None]:
# Preview 10 randomly chosen rows. No random_state is passed, so the rows shown differ between runs.
nasa.sample(10)

In [None]:
# Print the column names, dtypes and non-null counts of the dataset.
nasa.info()

In [None]:
# Dropping columns that have only one unique entity.
# The result is re-assigned (no inplace=True) and errors='ignore' is passed so that the cell is
# idempotent: re-running it after the columns are already gone does not raise a KeyError.
nasa = nasa.drop(columns= ['orbiting_body', 'sentry_object'], errors= 'ignore')

In [None]:
# Checking if there are any duplicated records (rows identical in every column) in the dataset.
nasa.duplicated().sum()

In [None]:
# Cast the id column from integers to strings so that describe() does not summarise it as a number.
nasa['id'] = nasa['id'].astype(str)
nasa.describe()

In [None]:
# Determining the number of unique values in each column in the dataset.
# Comparing these counts with the row count shows which columns repeat values across records.
nasa.nunique()

From the output above, it's clear that most of the asteroids orbiting the earth share the same estimated size as other asteroids, in both the minimum and maximum estimated diameter.

In [None]:
# Creating a dataframe of every record whose id occurs more than once.
# keep=False flags all occurrences of a repeated id. The default (keep='first') leaves out the
# first record of each id, so a per-id comparison would miss one observation and an id seen
# exactly twice would be reduced to a single row.
df = nasa[nasa.id.duplicated(keep= False)]

In [None]:
# Determining, for each id in df, the number of distinct values found in every other column.
# A count above 1 means that column varies between the records of the same id.
df.groupby('id').nunique()

From the table above, it can be seen that the only variables that change across all the duplicated id entries are the asteroid's relative velocity and miss distance.

In [None]:
# Creating a year column from the name column: take the piece of text that follows the first '(',
# capture its first run of digits together with the non-word characters right after it,
# and strip the surrounding whitespace from the captured text.
designation = nasa.name.str.split('(', expand= True)[1]
nasa['year'] = designation.str.extract(r'([\d]+(?:[\W]+))', expand= False).str.strip()
nasa.sample(5)

In [None]:
# Determining if there are outliers in the year column by listing every distinct extracted value.
nasa.year.unique()

From the output above, some of the extracted years are odd; I have listed them in the code below. I therefore replace them with `nan` to remove them.

In [None]:
# Blank out (set to NaN) the extracted values in the year column that were identified as odd.
odd_years = ['911', '6743', '898', '4788', '6344', '924']
is_odd_year = nasa['year'].isin(odd_years)
nasa['year'] = nasa['year'].mask(is_odd_year)
nasa.year.unique()

## Univariate Plots of Numeric Type Data

Since the absolute magnitude and the estimated minimum and maximum diameter are unique for every asteroid, I will drop all the duplicated asteroids while keeping the identity of the asteroid that appears first.

In [None]:
# Build a separate frame holding one record per id (the first occurrence); nasa itself is untouched.
nasa2 = nasa.drop_duplicates(subset= 'id').copy()
nasa2.info()

In [None]:
# Importing the various libraries needed for plotting.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Plotting a histogram of the minimum estimated diameter using a log scale on the x axis.
# Bin edges are evenly spaced (0.075) in log10 units and span only the observed data range.
# They are taken from min()/max() directly: describe() also returns the row count and the
# standard deviation, which are not diameters and would stretch the bins far past the data.
log_min = np.log10(nasa2.est_diameter_min.min())
log_max = np.log10(nasa2.est_diameter_min.max())
bins = 10 ** np.arange(log_min, log_max + 0.075, 0.075)
plt.hist(data= nasa2, x= 'est_diameter_min', bins= bins)
# The plotted column is a diameter, so the label and title say "diameter" rather than "distance".
plt.xlabel('Minimum Estimated Diameter')
plt.ylabel('Frequency')
plt.xscale('log')
x_ticks = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
plt.xticks(ticks= x_ticks, labels= x_ticks)
plt.xlim(0.001, 10)
plt.title('Minimum estimated diameter against frequency');

In [None]:
# Plotting a histogram of the maximum estimated diameter using a log scale on the x axis.
# Bin edges are evenly spaced (0.075) in log10 units and span only the observed data range.
# They are taken from min()/max() directly: describe() also returns the row count and the
# standard deviation, which are not diameters and would stretch the bins far past the data.
log_min = np.log10(nasa2.est_diameter_max.min())
log_max = np.log10(nasa2.est_diameter_max.max())
bins = 10 ** np.arange(log_min, log_max + 0.075, 0.075)
plt.hist(data= nasa2, x= 'est_diameter_max', bins= bins)
# The plotted column is a diameter, so the label and title say "diameter" rather than "distance".
plt.xlabel('Maximum Estimated Diameter')
plt.ylabel('Frequency')
plt.xscale('log')
x_ticks = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
plt.xticks(ticks= x_ticks, labels= x_ticks)
plt.xlim(0.003, 30)
plt.title('Maximum estimated diameter against frequency');

In [None]:
# Histogram of the relative velocities over all records, with fixed-width bins of 2500.
velocity = nasa['relative_velocity']
bin_width = 2500
bins = np.arange(velocity.min(), velocity.max() + bin_width, bin_width)
plt.hist(velocity, bins= bins)
plt.xlabel('Relative Velocity')
plt.ylabel('Frequency')
plt.title('Relative Velocity against Frequency');

For the plot above, since there are 90836 unique values, I will use the original dataset to see the distribution of the relative velocities in the dataset.
The plot shows that the relative velocity of the asteroids is skewed to the right, meaning that the mean and the mode differ significantly, with the mean shifted to the right of the mode.

In [None]:
# Histogram of the miss distances over all records, with fixed-width bins of 1,000,000.
miss_distance = nasa['miss_distance']
bin_width = 1_000_000
bins = np.arange(miss_distance.min(), miss_distance.max() + bin_width, bin_width)
plt.hist(miss_distance, bins= bins)
plt.xlabel('Miss Distance')
plt.ylabel('Frequency')
plt.title('Miss Distance against Frequency');

In [None]:
# Histogram of the absolute magnitude in the de-duplicated frame, with fixed-width bins of 0.4.
magnitude = nasa2['absolute_magnitude']
bin_width = 0.4
bins = np.arange(magnitude.min(), magnitude.max() + bin_width, bin_width)
plt.hist(magnitude, bins= bins)
plt.xlabel('Absolute Magnitude')
plt.ylabel('Frequency')
plt.title('Absolute Magnitude against Frequency');

In [None]:
# A bar graph showing how many asteroids are hazardous, with each bar annotated by its percentage.
fig, ax = plt.subplots(figsize= (8, 5))
base_color = sns.color_palette()[0]
# Draw on the axes created above explicitly instead of relying on the implicit current axes.
sns.countplot(data= nasa2, x= 'hazardous', color= base_color, ax= ax)
ax.spines[['top', 'right']].set_visible(False)

# Tick labels are strings, so the counts are indexed by the string form of each category.
# The conversion is done on a temporary Series: nasa2.hazardous keeps its original dtype, so
# drawing this plot has no side effect on the cells that follow.
counts = nasa2.hazardous.astype(str).value_counts()
total = counts.sum()
for loc, label in zip(ax.get_xticks(), ax.get_xticklabels()):
    count = counts[label.get_text()]
    percentage = '{:.2f}%'.format((count * 100) / total)
    # Place the percentage just above the top of the bar.
    ax.text(loc, count, percentage, ha= 'center', va= 'baseline')

ax.set_title('Hazardous Effect against Frequency');

The plot above shows the hazardous effect of the asteroids by percentage. The majority of the asteroids are not hazardous.

## Bivariate Plots

For both the bivariate and multivariate plots, I will use the original dataset to see the relationships across the whole dataset.

In [None]:
# Heatmap of the pairwise correlation coefficients between the numeric variables.
numeric_vars = [
    'est_diameter_min',
    'est_diameter_max',
    'relative_velocity',
    'miss_distance',
    'absolute_magnitude',
]
corr_matrix = nasa[numeric_vars].corr()
sns.heatmap(data= corr_matrix, annot= True, fmt= '.3f', center= 0, cmap= 'vlag_r', linewidths= 0.2)
plt.title('Heatmap Showing the correlationship\nbetween the given Numeric Variables');

In [None]:
# A pairgrid showing the graphical correlation between the numeric variables given above.
# Every panel is coloured by the hazardous flag (hue).
g = sns.PairGrid(data= nasa, vars= numeric_vars, hue= 'hazardous')
# Diagonal panels: 30-bin histograms of each variable.
g.map_diag(plt.hist, bins= 30)
# Off-diagonal panels: translucent scatter plots so overlapping points stay visible.
g.map_offdiag(plt.scatter, alpha= 1/5)
# plt.legend acts on the current axes; bbox_to_anchor positions the legend relative to that axes.
plt.legend(loc= 'center right', title= 'Hazardous\nEffect', bbox_to_anchor= (1.75, 1.75));

In [None]:
# Boolean row masks over nasa: records flagged hazardous and records flagged not hazardous.
hazardous = nasa['hazardous'].eq(True)
hazardous_not = nasa['hazardous'].eq(False)

In [None]:
# Overlaid log-scale histograms of the estimated diameters, split by the hazardous flag.
# Left panel: maximum estimated diameter. Right panel: minimum estimated diameter.
fig, axes = plt.subplots(ncols= 2, figsize= (12, 5))
for ax, column in zip(axes, ['est_diameter_max', 'est_diameter_min']):
    # Bin edges evenly spaced (0.075) in log10 units across the observed range only.
    # min()/max() are used directly because describe() also returns the row count and the
    # standard deviation, which are not diameters and would stretch the bins past the data.
    log_min = np.log10(nasa2[column].min())
    log_max = np.log10(nasa2[column].max())
    bins = 10 ** np.arange(log_min, log_max + 0.075, 0.075)
    # Both groups share the same bins so their bar heights are directly comparable.
    ax.hist(nasa.loc[hazardous, column], bins= bins, color= 'orange', alpha= 1/2, label= 'True')
    ax.hist(nasa.loc[hazardous_not, column], bins= bins, color= 'blue', alpha= 1/5, label= 'False')
    ax.set_xscale('log')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.set_title(column + ' by hazardous flag')
    ax.legend(title= 'Hazardous')

In [None]:
# Overlaid histograms of the miss distance for hazardous and non-hazardous records.
# Both groups share the same fixed-width bins (1,000,000) so their bar heights are comparable.
bins = np.arange(nasa.miss_distance.min(), nasa.miss_distance.max()+1000000, 1000000)
plt.hist(nasa.loc[hazardous, 'miss_distance'], bins= bins, color= 'orange', alpha= 1/2, label= 'True')
plt.hist(nasa.loc[hazardous_not, 'miss_distance'], bins= bins, color= 'blue', alpha= 1/5, label= 'False')
plt.xlabel('Miss Distance')
plt.ylabel('Frequency')
# The legend identifies which colour belongs to which group.
plt.legend(title= 'Hazardous')
plt.title('Miss Distance against Frequency');

From the 2 figures above, it's evident that most of the asteroids, regardless of their size and miss distance, are not hazardous. The number of those that are hazardous in the plot above does not exceed 55.

In [None]:
# Scatter plots showing the relationship between relative velocity and the maximum (left column)
# and minimum (right column) estimated diameters. The bottom row repeats the top row over
# reduced y-limits.
plt.figure(figsize= (12, 8))
panels = [('est_diameter_max', 20), ('est_diameter_min', 5)]
for position, (diameter_col, y_upper) in enumerate(panels, start= 1):
    # Top row: full y range.
    plt.subplot(2, 2, position)
    sns.regplot(data= nasa, x= 'relative_velocity', y= diameter_col, fit_reg= False, scatter_kws= {'alpha' : 1/5})
    # Bottom row: same data on reducing the y-limits.
    plt.subplot(2, 2, position + 2)
    sns.regplot(data= nasa, x= 'relative_velocity', y= diameter_col, fit_reg= False, scatter_kws= {'alpha' : 1/5})
    plt.ylim(0, y_upper)

In [None]:
# Boxplots comparing each numeric variable between the hazardous categories, one panel per variable.
# absolute_magnitude is drawn from the de-duplicated frame (nasa2); the other two use nasa.
fig, ax = plt.subplots(nrows= 3, figsize= (8, 10))
panels = [
    (nasa, 'relative_velocity'),
    (nasa, 'miss_distance'),
    (nasa2, 'absolute_magnitude'),
]
for axis, (frame, column) in zip(ax, panels):
    sns.boxplot(data= frame, x= 'hazardous', y= column, color= base_color, ax= axis)

## Multivariate Plot

In [None]:
# Scatter of absolute magnitude against relative velocity, one marker shape per hazardous category.
markers = ((False, 'o'), (True, 's'))
plt.figure(figsize= (20, 5))

for is_hazardous, marker_shape in markers:
    subset = nasa.loc[nasa.hazardous == is_hazardous]
    sns.regplot(
        data= subset,
        x= 'absolute_magnitude',
        y= 'relative_velocity',
        marker= marker_shape,
        fit_reg= False,
        scatter_kws= {'alpha' : 1/5},
    )

plt.title('Relationship between Absolute Magnitude and Relative Velocity\nwith respect to the Hazardous effects')
# Legend entries follow the plotting order above: False first, then True.
plt.legend(['False', 'True'], title= 'Hazardous\nEffect');

In the plot above, all hazardous asteroids had an absolute magnitude below 22.5. However, most of those asteroids had a wide range in the relative velocities given though the majority of them had a relative velocity below 100,000.