In [5]:
# Load the three binned USA meteorite datasets:
# "USA_meteorite_data_binned_bins100_interpolated.csv",
# "USA_meteorite_data_binned_bins100.csv", and
# "USA_meteorite_data_binned_bins100_guassian.csv"

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# import train test split
from sklearn.model_selection import train_test_split

# import models for predicting count
from sklearn.ensemble       import RandomForestRegressor
from sklearn.linear_model   import LinearRegression
from sklearn.tree           import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors      import KNeighborsRegressor
from sklearn.metrics        import mean_squared_error, r2_score

# Load the three binned meteorite datasets from the shared data directory.
# NOTE(review): the last file name is spelled 'guassian'. The traceback recorded
# at the end of this cell shows the 'gaussian' spelling was not found, so this
# spelling presumably matches the file on disk -- confirm before changing it.
df_interpolated = pd.read_csv('../../data/USA_meteorite_data_binned_bins100_interpolated.csv')
df_uninterpolated = pd.read_csv('../../data/USA_meteorite_data_binned_bins100.csv')
df_gaussian = pd.read_csv('../../data/USA_meteorite_data_binned_bins100_guassian.csv')


def _features_and_target(frame):
    """Return (features, target): all columns except 'count', and 'count' itself."""
    return frame.drop(columns=['count']), frame['count']


# Every dataset gets the same 80/20 split with a fixed seed so the splits are
# reproducible from run to run.
_SPLIT_OPTIONS = {'test_size': 0.2, 'random_state': 42}

# Interpolated data
X_interpolated, y_interpolated = _features_and_target(df_interpolated)
(
    X_train_interpolated,
    X_test_interpolated,
    y_train_interpolated,
    y_test_interpolated,
) = train_test_split(X_interpolated, y_interpolated, **_SPLIT_OPTIONS)

# Uninterpolated data
X_uninterpolated, y_uninterpolated = _features_and_target(df_uninterpolated)
(
    X_train_uninterpolated,
    X_test_uninterpolated,
    y_train_uninterpolated,
    y_test_uninterpolated,
) = train_test_split(X_uninterpolated, y_uninterpolated, **_SPLIT_OPTIONS)

# Gaussian data
X_gaussian, y_gaussian = _features_and_target(df_gaussian)
(
    X_train_gaussian,
    X_test_gaussian,
    y_train_gaussian,
    y_test_gaussian,
) = train_test_split(X_gaussian, y_gaussian, **_SPLIT_OPTIONS)


# Build one independent set of default-configured regressors per dataset, so
# fitting on one dataset never touches the estimators used for another.
def _default_regressors():
    """Return a new dict mapping model name to a fresh, unfitted regressor."""
    return {
        'RandomForestRegressor': RandomForestRegressor(),
        'LinearRegression': LinearRegression(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'MLPRegressor': MLPRegressor(),
        'KNeighborsRegressor': KNeighborsRegressor(),
    }


models_interpolated = _default_regressors()
models_uninterpolated = _default_regressors()
models_gaussian = _default_regressors()

# create a function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Train each model on the training split and score it on the test split.

    models: dict mapping a model name to an estimator exposing
            ``fit(X, y)`` and ``score(X, y)``; each estimator is fitted in place
    X_train: training features
    X_test: testing features
    y_train: training targets
    y_test: testing targets

    Returns a dict mapping each model name to the value returned by that
    model's ``score`` method on the test split (for scikit-learn regressors
    this is the R^2 coefficient of determination).

    Side effect: reseeds NumPy's global random generator with 42.
    """
    # Fix the global NumPy seed before any model is trained.
    np.random.seed(42)

    def _train_then_evaluate(estimator):
        # Fitting must happen before scoring; both use the splits passed in.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    # Models are processed one at a time, in the dict's iteration order.
    return {
        label: _train_then_evaluate(estimator)
        for label, estimator in models.items()
    }

# Fit and score every model on each dataset variant. Each call fits the
# estimators of its own model dict in place and returns {model name: score}.
model_scores_interpolated = fit_and_score(
    models_interpolated,
    X_train_interpolated, X_test_interpolated,
    y_train_interpolated, y_test_interpolated,
)
model_scores_uninterpolated = fit_and_score(
    models_uninterpolated,
    X_train_uninterpolated, X_test_uninterpolated,
    y_train_uninterpolated, y_test_uninterpolated,
)
model_scores_gaussian = fit_and_score(
    models_gaussian,
    X_train_gaussian, X_test_gaussian,
    y_train_gaussian, y_test_gaussian,
)

# Graph the model scores together for comparison: one bar trace per dataset,
# in the order the traces should appear within each group and in the legend.
score_sets = {
    'Interpolated': model_scores_interpolated,
    'Gaussian': model_scores_gaussian,
    'Uninterpolated': model_scores_uninterpolated,
}
fig = go.Figure(data=[
    go.Bar(name=label, x=list(scores.keys()), y=list(scores.values()))
    for label, scores in score_sets.items()
])
# Group the bars side by side per model. The title names all three plotted
# datasets (it previously omitted the Gaussian series).
fig.update_layout(
    barmode='group',
    title='Model Scores for Interpolated, Gaussian, and Uninterpolated Data',
    xaxis_title='Model',
    yaxis_title='Score',
    legend_title='Data',
)
fig.show()


FileNotFoundError: [Errno 2] No such file or directory: '../../data/USA_meteorite_data_binned_bins100_gaussian.csv'