In [9]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score,StratifiedKFold
import sklearn.metrics as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn              import svm

%matplotlib inline

In [10]:
df = pd.read_csv('dataset/CarbonDioxide/owid-co2-data.csv')
df

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_cumulative_other_co2,share_global_flaring_co2,share_global_gas_co2,share_global_luc_co2,share_global_oil_co2,share_global_other_co2,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,0.121,,,,,,
1,Afghanistan,1851,AFG,3769828.0,,,,,,,...,,,,0.118,,,,,,
2,Afghanistan,1852,AFG,3787706.0,,,,,,,...,,,,0.116,,,,,,
3,Afghanistan,1853,AFG,3806634.0,,,,,,,...,,,,0.115,,,,,,
4,Afghanistan,1854,AFG,3825655.0,,,,,,,...,,,,0.114,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46518,Zimbabwe,2017,ZWE,14751101.0,2.194784e+10,0.469,0.032,9.596,-0.937,-8.899,...,,0.0,0.0,0.219,0.026,,115.59,28.30,0.910,9.486
46519,Zimbabwe,2018,ZWE,15052191.0,2.271535e+10,0.558,0.037,11.795,2.199,22.920,...,,0.0,0.0,0.211,0.033,,118.22,30.83,0.771,6.537
46520,Zimbabwe,2019,ZWE,15354606.0,,0.570,0.037,11.115,-0.681,-5.772,...,,0.0,0.0,0.183,0.030,,117.96,30.53,0.978,8.795
46521,Zimbabwe,2020,ZWE,15669663.0,,0.570,0.036,10.608,-0.507,-4.559,...,,0.0,0.0,0.194,0.030,,,,1.006,9.481


In [12]:
df = df.dropna()
df = df.loc[:, df.columns.isin(['country' , 'year' , 'population', 'gdp', 'co2', 'methane', 'nitrous_oxide'])]

df = df[df['country'] == 'China']
# df = df.drop(['index'], axis=1)
df

Unnamed: 0,country,year,population,gdp,co2,methane,nitrous_oxide
8959,China,1990,1153704000.0,3385122000000.0,2484.855,765.84,296.17
8960,China,1991,1170626000.0,3508948000000.0,2606.096,775.53,303.72
8961,China,1992,1183813000.0,3755037000000.0,2730.788,781.19,309.6
8962,China,1993,1195856000.0,4126170000000.0,2921.651,790.54,301.41
8963,China,1994,1207287000.0,4438753000000.0,3100.002,812.96,314.95
8964,China,1995,1218144000.0,4810000000000.0,3357.909,850.31,358.59
8965,China,1996,1228299000.0,5127590000000.0,3503.234,858.79,384.61
8966,China,1997,1237801000.0,5291339000000.0,3510.169,813.78,356.53
8967,China,1998,1246836000.0,5338703000000.0,3360.455,815.85,365.59
8968,China,1999,1255433000.0,5578503000000.0,3552.842,810.42,381.21


In [13]:
X = df['year'].values.reshape(-1, 1)
Y = df['co2']
X

array([[1990],
       [1991],
       [1992],
       [1993],
       [1994],
       [1995],
       [1996],
       [1997],
       [1998],
       [1999],
       [2000],
       [2001],
       [2002],
       [2003],
       [2004],
       [2005],
       [2006],
       [2007],
       [2008],
       [2009],
       [2010],
       [2011],
       [2012],
       [2013],
       [2014],
       [2015],
       [2016],
       [2017],
       [2018]], dtype=int64)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
Data_list = [X_train, X_test, y_train, y_test]

In [15]:
Kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
def Model_Selection(Model,Data_list, name):
    Model.fit(Data_list[0],Data_list[2])
    y_Pred = Model.predict(Data_list[1])

    MAE = round(sm.mean_absolute_error(y_test, y_Pred), 2)
    MSE = round(sm.mean_squared_error(y_test, y_Pred), 2)
    MeAE = round(sm.median_absolute_error(y_test, y_Pred), 2)
    Variance = round(sm.explained_variance_score(y_test, y_Pred), 2)
    R2_Score =  round(sm.r2_score(y_test, y_Pred), 2)

    
    frame = pd.DataFrame({'Model': [name], 'MeanAbsoluteError': [MAE], 'MeanSquaredError': [MSE], 'MedianAbsoluteError': [MeAE], 'Variance': [Variance], 'R2_score':[R2_Score]})
    return frame

In [16]:
model1 = LinearRegression()
model2 = svm.SVR()
model3 = ElasticNet()
model4 = Lasso(alpha=0.1)
model5 = BayesianRidge()
Model_data = pd.concat([
                       Model_Selection(model1, Data_list, 'LinearRegression'),
                       Model_Selection(model2, Data_list, 'SupportVectorRegression'),
                       Model_Selection(model3, Data_list, 'ElasticNet'),
                       Model_Selection(model4, Data_list, 'Lasso'),
                       Model_Selection(model5, Data_list, 'BaysianRidge')],axis = 0).reset_index()
Model_data

Unnamed: 0,index,Model,MeanAbsoluteError,MeanSquaredError,MedianAbsoluteError,Variance,R2_score
0,0,LinearRegression,769.44,752052.12,863.87,0.92,0.92
1,0,SupportVectorRegression,2999.01,11585877.0,2534.46,0.0,-0.25
2,0,ElasticNet,765.28,756279.85,874.31,0.92,0.92
3,0,Lasso,769.44,752054.42,863.88,0.92,0.92
4,0,BaysianRidge,767.77,753635.46,868.07,0.92,0.92


In [17]:
Model = Lasso()
Model.fit(Data_list[0],Data_list[2])
y_Pred = Model.predict([[2024], [2025], [2100]])
y_Pred

array([12606.36471681, 12933.94822124, 37502.7110531 ])