Add all necessary imports for your script.
sqlite3 for importing the db file and H2OGeneralizedLinearEstimator for using linear regression

In [None]:
import sqlite3
import pandas as pd
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator
import matplotlib.pyplot as plt

This code is used to initialize our H2O cluster so we can use its functionalities

In [None]:
# Start H2O instance
# This has to run before any later cell creates an H2OFrame or trains a model.
h2o.init()

Here we connect to our database file database_datavis.db using Python's sqlite3 module

In [None]:
# Connect to database file
# `data` is the open sqlite3 connection; it is passed to pandas when the
# query is run and is closed once the table has been loaded.
data = sqlite3.connect("database_datavis.db")

This code writes the query to get our data out of the database

In [None]:
# Query the table
# Selects every column and every row of the `emissions` table; no filtering
# is done in SQL.
emissions_query = "SELECT * FROM emissions"

Here we load the data into a variable so we can begin working with it

In [None]:
# Run the query over the open connection and keep the result as a pandas DataFrame
emissions = pd.read_sql_query(sql=emissions_query, con=data)

We don't need anything anymore from the database so we disconnect from it

In [None]:
# Close the connection
# The query result is already held in the `emissions` DataFrame, so the
# connection is not needed by the rest of the notebook.
data.close()

We have a date column, but it is easier to work with only the year. So we first convert the column to datetime using the day-month-year format. Any value that cannot be converted is set to NaT (this is what errors='coerce' does; without it we ran into parsing errors), and those rows are dropped. Lastly we extract the year from the date column and store it in a new column called Year.

In [None]:
# Create a year column
# Parse the dates exactly once with the explicit day-month-year format.
# Values that do not match the format become NaT instead of raising.
parsed_dates = pd.to_datetime(emissions['Date'], format='%d-%m-%Y', errors='coerce')
valid_rows = parsed_dates.notna()

# Keep only the rows with a valid date. `.copy()` makes the result an
# independent frame, so adding a column does not trigger pandas'
# SettingWithCopyWarning.
emissions = emissions[valid_rows].copy()

# Reuse the already parsed dates: parsing the strings a second time without
# the format could read day-first dates as month-first (or fail outright).
emissions['Year'] = parsed_dates[valid_rows].dt.year

Here we calculate the mean CO2 emissions in metric tons per capita. We group by year so we can see the world average for each year.

In [None]:
# Average the per-capita emissions over all rows that share the same year
per_capita_by_year = emissions.groupby('Year')['Metric_Tons_Per_Capita']
# reset_index() turns the 'Year' group key back into a regular column
mean_emissions = per_capita_by_year.mean().reset_index()

Here we convert our data to an H2O frame so we can use the H2O algorithms

In [None]:
# Hand the pandas DataFrame over to H2O; the model is trained on this frame
h2o_df = h2o.H2OFrame(python_obj=emissions)

y is the column that we want to predict, and x is the list of columns that contain the features used to make the predictions

In [None]:
# Model inputs: the feature columns (x) and the target column (y)
x = ['Year']
y = 'Metric_Tons_Per_Capita'

Here we actually train the model with linear regression

In [None]:
# Fit a generalized linear model of y on x, using the estimator's default settings
# NOTE(review): H2O's GLM defaults may apply regularization; confirm whether
# a plain least-squares fit is intended here.
prediction_model = H2OGeneralizedLinearEstimator()
prediction_model.train(training_frame=h2o_df, y=y, x=x)

We can check the performance of the prediction model with this function

In [None]:
# View the model performance
# model_performance() is called without a frame argument here.
# NOTE(review): presumably this reports the metrics computed on the training
# data, since no separate validation or test frame is used - confirm.
performance = prediction_model.model_performance()
print(performance)

Here we make a list with all the years that need to be predicted

In [None]:
# Set up the forecast horizon: one row per year from 2020 up to and including 2040
future_predictions = []  # filled with one predicted value per year later on
future_years = pd.DataFrame({'Year': range(2020, 2041)})

With this code we predict the future CO2 emissions for each year listed in future_years. The predicted values are collected in future_predictions and then added to future_years as a new column called Predicted_CO2_Emissions.

In [None]:
# Predict for every year in future years
# Score all years in one call: a single H2OFrame upload and a single
# predict() instead of one round trip to the H2O cluster per year.
# Only the 'Year' column is sent, so re-running this cell still works after
# the prediction column has been added to future_years.
future_h2o_df = h2o.H2OFrame(future_years[['Year']])
prediction = prediction_model.predict(future_h2o_df)

# Rebuild the list from scratch so that re-running this cell cannot
# accumulate duplicate entries (which would break the assignment below).
# The first column of the prediction frame holds the predicted values.
future_predictions = prediction.as_data_frame().iloc[:, 0].tolist()

future_years['Predicted_CO2_Emissions'] = future_predictions

Finally we plot the whole data and prediction to show our results.

In [None]:
# Draw the yearly mean (solid line) and the model forecast (dashed line) on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))

# Measured data: mean per-capita emissions per year
ax.plot(
    mean_emissions['Year'],
    mean_emissions['Metric_Tons_Per_Capita'],
    label='Historical CO2 Emissions',
)
# Forecast: model predictions for the future years
ax.plot(
    future_years['Year'],
    future_years['Predicted_CO2_Emissions'],
    label='Predicted CO2 Emissions',
    linestyle='--',
)

ax.set_xlabel('Year')
ax.set_ylabel('CO2 Emissions')
ax.set_title('Historical and Predicted CO2 Emissions')
ax.legend()
plt.show()

Lastly we shut down the H2O cluster

In [None]:
# Shut down the H2O cluster that this notebook has been using.
# The module-level h2o.shutdown() is deprecated; the cluster object's
# shutdown() method is the supported replacement.
h2o.cluster().shutdown()