In [1]:
#  Import libraries

import pandas as pd 
import numpy as np
import matplotlib as mplib
import matplotlib.pyplot as pplot
#import seaborn as seab
import sklearn as scikit

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Import carbon emissions dataset.
ce_global = pd.read_csv("data/Carbon_(CO2)_Emissions_by_Country.csv")
print(ce_global.head())

       Country Region        Date  Kilotons of Co2  Metric Tons Per Capita
0  Afghanistan   Asia  01-01-2011           8930.0                    0.31
1  Afghanistan   Asia  01-01-2012           8080.0                    0.27
2  Afghanistan   Asia  01-01-2010           7110.0                    0.25
3  Afghanistan   Asia  01-01-2019           6080.0                    0.16
4  Afghanistan   Asia  01-01-2018           6070.0                    0.17


In [3]:
# Filter global data to US
ce_global_filter = ce_global[ce_global['Country'] == 'United States']
ce_global_filter

Unnamed: 0,Country,Region,Date,Kilotons of Co2,Metric Tons Per Capita
5407,United States,Americas,01-01-2000,5775810.0,20.47
5408,United States,Americas,01-01-2005,5753490.23,19.47
5409,United States,Americas,01-01-2001,5748260.0,20.17
5410,United States,Americas,01-01-2004,5738290.04,19.6
5411,United States,Americas,01-01-2007,5736319.82,19.04
5412,United States,Americas,01-01-2003,5658990.23,19.51
5413,United States,Americas,01-01-2006,5653080.08,18.95
5414,United States,Americas,01-01-1999,5609020.0,20.1
5415,United States,Americas,01-01-2002,5593029.79,19.45
5416,United States,Americas,01-01-1998,5590540.0,20.27


In [4]:
# Data preparation for joining.
ce_global_filter['Year'] = ce_global_filter.apply(lambda x: pd.Series(pd.to_datetime(x[2]).year), axis = 1)
ce_global_filter = ce_global_filter.filter(items=['Country', 'Year', 'Kilotons of Co2'])
ce_global_filter 

  ce_global_filter['Year'] = ce_global_filter.apply(lambda x: pd.Series(pd.to_datetime(x[2]).year), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ce_global_filter['Year'] = ce_global_filter.apply(lambda x: pd.Series(pd.to_datetime(x[2]).year), axis = 1)


Unnamed: 0,Country,Year,Kilotons of Co2
5407,United States,2000,5775810.0
5408,United States,2005,5753490.23
5409,United States,2001,5748260.0
5410,United States,2004,5738290.04
5411,United States,2007,5736319.82
5412,United States,2003,5658990.23
5413,United States,2006,5653080.08
5414,United States,1999,5609020.0
5415,United States,2002,5593029.79
5416,United States,1998,5590540.0


In [5]:
# Import carbon emissions dataset.
ce_US = pd.read_csv("data/CO2_Emissions_US.csv")
print(ce_US.head())

   year state-name                                      sector-name  \
0  1970    Alabama              Industrial carbon dioxide emissions   
1  1970    Alabama              Industrial carbon dioxide emissions   
2  1970    Alabama              Industrial carbon dioxide emissions   
3  1970    Alabama              Industrial carbon dioxide emissions   
4  1970    Alabama  Total carbon dioxide emissions from all sectors   

     fuel-name       value  
0         Coal   26.721507  
1    Petroleum    3.577779  
2  Natural Gas    8.944097  
3    All Fuels   39.243383  
4    All Fuels  102.646851  


In [6]:
# Enrich carbon emissions dataset with million tons of CO2 ("value") by state, sector, and fuel dimensions.
ce_merged = ce_global_filter.merge(ce_US, left_on='Year', right_on= 'year')

ce_merged_filter = ce_merged.filter(items=['Year', 'state-name', 'Kilotons of Co2', 'value'])

ce_rename = ce_merged_filter.rename(columns={'state-name': 'State', 'value': 'Million Metric Tons'})

ce_rename

Unnamed: 0,Year,State,Kilotons of Co2,Million Metric Tons
0,2000,Alabama,5775810.0,0.014109
1,2000,Alabama,5775810.0,18.997993
2,2000,Alabama,5775810.0,85.908083
3,2000,Alabama,5775810.0,142.298577
4,2000,Alabama,5775810.0,25.287666
...,...,...,...,...
34213,1991,Wyoming,4807500.0,9.717897
34214,1991,Wyoming,4807500.0,55.495746
34215,1991,Wyoming,4807500.0,42.912113
34216,1991,Wyoming,4807500.0,7.288282


In [7]:
# Copy joined data. 
data = ce_rename.copy()

#Dependent variable
x_dep = data.drop(columns = {'Year', 'State', 'Million Metric Tons'}, axis = 1)

#Independent variable
y_indep = data.drop('Kilotons of Co2', axis = 1)

print(x_dep.head())

print(y_indep.head())

   Kilotons of Co2
0        5775810.0
1        5775810.0
2        5775810.0
3        5775810.0
4        5775810.0
   Year    State  Million Metric Tons
0  2000  Alabama             0.014109
1  2000  Alabama            18.997993
2  2000  Alabama            85.908083
3  2000  Alabama           142.298577
4  2000  Alabama            25.287666


In [8]:
x_dep.dtypes #select_dtypes(include='object').columns.to_list()

Kilotons of Co2    float64
dtype: object

In [9]:
# Convert categorical values to indicators 

y_indicators = pd.get_dummies(data=y_indep, columns=y_indep.select_dtypes(include='object').columns.to_list(),
              drop_first=True,
              dtype=int)

print(y_indicators.head())

   Year  Million Metric Tons  State_Alaska  State_Arizona  State_Arkansas  \
0  2000             0.014109             0              0               0   
1  2000            18.997993             0              0               0   
2  2000            85.908083             0              0               0   
3  2000           142.298577             0              0               0   
4  2000            25.287666             0              0               0   

   State_California  State_Colorado  State_Connecticut  State_Delaware  \
0                 0               0                  0               0   
1                 0               0                  0               0   
2                 0               0                  0               0   
3                 0               0                  0               0   
4                 0               0                  0               0   

   State_District of Columbia  ...  State_Tennessee  State_Texas  \
0                       

In [10]:
# Train and test data.

x_dep_train, x_dep_test, y_indep_train, y_indep_test = train_test_split(x_pred, y_indicators, test_size=0.1, random_state=30)

print(x_dep_train.head())
print(y_indep_train.head())

NameError: name 'x_pred' is not defined

In [None]:
# Linear regression output

linear_regression = LinearRegression()

lr = linear_regression.fit(x_dep_train, y_indep_train)

lr

In [None]:
intercept = lr.intercept_
coefficients = lr.coef_

print(intercept)
print(coefficients)

In [None]:
train_dep_pred = lr.predict(x_dep_train)
test_dep_pred = lr.predict(x_dep_test)

print(train_dep_pred)
print(t)

In [None]:
train_r2 = r2_score(y_true= y_indep_train, y_pred = train_dep_pred)
test_r2 = r2_score(y_true= y_indep_test, y_pred = test_dep_pred)

print(train_r2)
print(test_r2)