In [None]:
#  Import libraries

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as pplot
import sklearn as scikit

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Libraries for working with MongoDB

import pymongo as py

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [None]:
# Connect to the MongoDB client.
#
# The connection string and the client certificate/key file path are read from
# the environment so that secrets never have to be pasted into (and saved
# with) the notebook. Both fall back to the empty placeholder used previously
# when the variable is not set.

uri = os.environ.get("MONGODB_URI", "")
tls_cert_key_file = os.environ.get("MONGODB_TLS_CERT_KEY_FILE", "")

client = MongoClient(uri,
                     tls=True,
                     tlsCertificateKeyFile=tls_cert_key_file,
                     server_api=ServerApi('1'))

In [None]:
# Load the carbon emissions collections into DataFrames.

db = client["RegressionsSources"]

# Country-level emissions: collection handle -> cursor -> DataFrame.
collection_country = db["CarbonEmissionsByCountry"]
rows_country = collection_country.find()
emissions_by_country = pd.DataFrame(rows_country)

# State-level emissions: collection handle -> cursor -> DataFrame.
collection_state = db["CarbonEmissionsByState"]
rows_state = collection_state.find()
emissions_by_state = pd.DataFrame(rows_state)

In [None]:
# Filter global data to US

# Keep only the rows whose 'Country' is 'United States'. The explicit .copy()
# makes the result an independent DataFrame: a 'Year' column is assigned to it
# in a later cell, and assigning into a boolean-mask slice of
# emissions_by_country raises pandas' SettingWithCopyWarning.
is_united_states = emissions_by_country['Country'] == 'United States'
ce_global_filter = emissions_by_country[is_united_states].copy()
ce_global_filter

In [None]:
# Data preparation for joining.

# The date is taken from the column at position 3, exactly as before, but via
# .iloc so the lookup is explicitly positional (integer keys on a
# label-indexed row are deprecated in pandas).
# NOTE(review): position 3 is presumably the observation-date column -- confirm
# against the collection schema and prefer selecting it by name.
observation_dates = pd.to_datetime(ce_global_filter.iloc[:, 3])

# Derive 'Year' in one vectorised step and keep only the join columns.
# .assign returns a new frame, so no slice of the source data is mutated.
ce_global_filter = (
    ce_global_filter
    .assign(Year=observation_dates.dt.year)
    .filter(items=['Country', 'Year', 'Kilotons of Co2'])
)

In [None]:
# Enrich the US carbon emissions rows with the state-level "value" column,
# which is renamed to 'Million Metric Tons' below.

# Join the two frames on the year (default inner join).
rows_merged = pd.merge(
    ce_global_filter,
    emissions_by_state,
    left_on='Year',
    right_on='year',
)

# Keep only the columns used by the regression.
regression_columns = ['Year', 'Kilotons of Co2', 'value']
rows_merged_filter = rows_merged.filter(items=regression_columns)

rows_emissions_final = rows_merged_filter.rename(
    columns={'value': 'Million Metric Tons'}
)

In [None]:
# Copy final data rows so the feature/target frames are built from a separate
# object.

data = rows_emissions_final.copy()

# Independent variable (features passed to the regression as X):
# everything except 'Year' and 'Million Metric Tons'.
# The name x_dep is kept because later cells refer to it.

x_dep = data.drop(columns=['Year', 'Million Metric Tons'])

# Dependent variable (target passed to the regression as y):
# everything except 'Kilotons of Co2'.
# NOTE(review): only 'Kilotons of Co2' is dropped, so a 'Year' column, when
# present, stays in the target next to 'Million Metric Tons' -- confirm that
# a two-column target is intended.

y_indep = data.drop(columns=['Kilotons of Co2'])

In [None]:
# Split features and target into train and test sets:
# 10% of the rows are held out, with a fixed seed for a repeatable split.

split_frames = train_test_split(
    x_dep,
    y_indep,
    test_size=0.1,
    random_state=30,
)
x_dep_train, x_dep_test, y_indep_train, y_indep_test = split_frames

In [None]:
# Fit the linear regression on the training split.

linear_regression = LinearRegression()
linear_regression.fit(x_dep_train, y_indep_train)

# fit() returns the estimator itself, so lr is the same fitted object.
lr = linear_regression

lr

In [None]:
# Fitted parameters of the regression: intercept and coefficients.

intercept, coefficients = lr.intercept_, lr.coef_

In [None]:
# Model predictions for the training features and for the test features.

train_indep_pred, test_indep_pred = (
    lr.predict(features) for features in (x_dep_train, x_dep_test)
)

In [None]:
# R^2 (coefficient of determination) of the predictions against the actual
# target values, computed separately for the train and the test split.

train_r2 = r2_score(y_indep_train, train_indep_pred)
test_r2 = r2_score(y_indep_test, test_indep_pred)

In [None]:
# Scatter plot of the training targets (x axis) against the model's
# predictions for the training features (y axis).

pplot.scatter(y_indep_train, train_indep_pred)

# Labelling is applied to the current axes after the points are drawn.
pplot.xlabel("Actual")
pplot.ylabel("Prediction")
pplot.title("Carbon Emissions Regressions Analysis")

pplot.show()