In [22]:
import pandas as pd

import utils

We load our original JSON dataset and transform it the same way we did for the Cassandra database.

In [23]:
# `utils.prepare_dataset` is defined outside this notebook; the cells below
# rely on its result having `coordinates`, `magtype`, `time` and `updated` columns.
merged_df = utils.prepare_dataset(utils.DATASET_PATH)

We then perform one additional transformation: the `coordinates` column is replaced by a `location` field holding a GeoJSON `Point`, the format MongoDB uses for geospatial data.

In [24]:
def transform_to_geojson(df):
    """Return a copy of `df` with `coordinates` replaced by a GeoJSON `location`.

    Each value of the `coordinates` column is sliced to its first two
    elements and wrapped as ``{'type': 'Point', 'coordinates': [...]}``.
    The `coordinates` column is absent from the result.

    The input frame is not modified, so the calling cell can be re-run
    without hitting a ``KeyError`` on an already-dropped column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `coordinates` column whose values support slicing.
        NOTE(review): GeoJSON expects [longitude, latitude] order - confirm
        that this is what the upstream data provides.

    Returns
    -------
    pandas.DataFrame
        New frame with `location` appended and `coordinates` removed.
    """
    location = df['coordinates'].apply(
        lambda coords: {'type': 'Point', 'coordinates': coords[:2]}
    )

    # `assign` and `drop` both return new frames instead of mutating the caller's.
    return df.assign(location=location).drop(columns='coordinates')

# `df` is the name every later cell in this notebook operates on.
df = transform_to_geojson(merged_df)

Next, we notice that the `magtype` column contains values that denote the same type but are spelled differently (letter case, underscores).

In [25]:
df['magtype'].unique()

<StringArray>
[   'Md',    'Ml',    'mb',    'ml',   'Mww', 'mb_Lg',    'Mb',   'Mwb',
    'Me',    'ML',    'Mw',     'H',  'MbLg',   'Mwp',    'Mt',    <NA>]
Length: 16, dtype: string

So let's normalize the `magtype` column (lowercase, underscores removed) so that these variants collapse into a single value.

In [26]:
# Collapse spelling variants (e.g. 'Ml'/'ml'/'ML', 'mb_Lg'/'MbLg') into one
# canonical form: strip literal underscores, then lowercase.
df['magtype'] = df['magtype'].str.replace('_', '', regex=False).str.lower()
df['magtype'].unique()

<StringArray>
['md', 'ml', 'mb', 'mww', 'mblg', 'mwb', 'me', 'mw', 'h', 'mwp', 'mt', <NA>]
Length: 12, dtype: string

We will also wrap the `time` and `updated` values in `{"$date": ...}` objects, the format `mongoimport` expects for dates.

In [27]:
def transform_to_mongodbdate(datetime):
    """Wrap a single date value as ``{"$date": value}`` for `mongoimport`.

    Missing values are passed through as ``None`` rather than being wrapped,
    so they do not end up as a ``{"$date": ...}`` object with an empty
    payload in the exported JSON.

    Parameters
    ----------
    datetime : scalar
        One date value from a column. The parameter name shadows the stdlib
        `datetime` module inside this function only; it is kept unchanged
        for backward compatibility with keyword callers.

    Returns
    -------
    dict or None
        ``{"$date": datetime}`` for a present value, ``None`` for a missing
        one (``None``, ``NaN``, ``NaT`` or ``pd.NA``).
    """
    # pandas represents missing dates as NaT/NaN/NA, none of which `is None`;
    # pd.isna recognises all of them as well as None itself.
    if pd.isna(datetime):
        return None

    return {"$date": datetime}

# Wrap both timestamp columns with the same helper.
# Caution: this cell is not idempotent - running it a second time wraps the
# already-wrapped values again.
for date_column in ('time', 'updated'):
    df[date_column] = df[date_column].apply(transform_to_mongodbdate)

In [28]:
# Export the transformed frame to disk, one JSON object per row (orient='records').
# NOTE(review): presumably consumed by `mongoimport`; if the file is a single
# JSON array, the import needs the `--jsonArray` flag - confirm against the
# actual import command.
df.to_json('earthquakes_transformed.json', orient='records')

Let’s now count the final number of records. 😊

In [29]:
len(df)

7669