In [None]:


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error



In [None]:
df=pd.read_csv('global-data-on-sustainable-energy.csv')
df=df.rename(columns={'Value_co2_emissions_kt_by_country':'CO2'})

In [None]:
#Perform shifting step on label column
"""year_prediction_window = 19

df = df.groupby("Entity")
entities = []
for entity_group in df.groups.keys():
    year_shifted_entity_group = df.get_group(entity_group)
    year_shifted_entity_group["CO2"] = year_shifted_entity_group["CO2"].shift(-1 * year_prediction_window)
    entities.append(year_shifted_entity_group)
df = pd.concat(entities)"""




'year_prediction_window = 19\n\ndf = df.groupby("Entity")\nentities = []\nfor entity_group in df.groups.keys():\n    year_shifted_entity_group = df.get_group(entity_group)\n    year_shifted_entity_group["CO2"] = year_shifted_entity_group["CO2"].shift(-1 * year_prediction_window)\n    entities.append(year_shifted_entity_group)\ndf = pd.concat(entities)'

In [None]:

#Fill in missing values in the C02 column with the last known value
df['CO2'].fillna(method='ffill', inplace=True)


In [None]:
plt.figure(figsize=(30,30))
Corr_Matrix=df.corr()
sns.heatmap(df.corr(), annot=True, fmt=".2f")
#plt.show()

In [None]:
print('Top 5 Most Positively Correlated to the Target Variable')
Corr_Matrix['CO2'].sort_values(ascending=False).head(5)

Top 5 Most Positively Correlated to the Target Variable


CO2                                    1.000000
Electricity from fossil fuels (TWh)    0.989334
Electricity from renewables (TWh)      0.858493
Land Area(Km2)                         0.648807
Electricity from nuclear (TWh)         0.603900
Name: CO2, dtype: float64

In [None]:
print('Top 5 Most Negatively Correlated to the Target Variable')
Corr_Matrix['CO2'].sort_values(ascending=True).head(5)

Top 5 Most Negatively Correlated to the Target Variable


Renewable energy share in the total final energy consumption (%)   -0.121950
Renewables (% equivalent primary energy)                           -0.068848
Low-carbon electricity (% electricity)                             -0.045195
Year                                                                0.022515
gdp_growth                                                          0.030372
Name: CO2, dtype: float64

In [None]:
columns_to_drop = [col for col in Corr_Matrix.columns if abs(Corr_Matrix.loc['CO2', col]) < 0.5]
columns_to_drop
df = df.drop(columns_to_drop, axis=1)
df=df.dropna()

In [None]:
df.rename(columns={'Density\\n(P/Km2)': 'New_Density'}, inplace=True)
df['New_Density'] = df['New_Density'].str.replace(',', '').astype(int)
df.rename(columns={'Land Area(Km2)': 'Land'}, inplace=True)
#df

In [None]:
le = LabelEncoder()
df.Entity = le.fit_transform(df.Entity)
#print(df.Entity)

In [None]:
"""X = df.drop(columns=['CO2'])
y = df['CO2']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)"""
#Split data into train/test split

#valid_entities = df.groupby("Entity").filter(lambda x: len(x) >= 20)
#split_index = valid_entities.groupby("Entity").head(10).index.max() + 1  #19 rows for training, 1 for testing

split_index = df.groupby("Entity").size().min()  #Take the minimum number of rows for any entity
#in this case the minimum is 8, so its predicting 16 years in the future

train_data = df.iloc[:split_index]
test_data = df.iloc[split_index:]

X_train = train_data.drop(columns=['CO2'])
y_train = train_data['CO2']
X_test = test_data.drop(columns=['CO2'])
y_test = test_data['CO2']

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(8, 6) (8,)
(3514, 6) (3514,)


In [None]:
print(X_train)

   Entity  Electricity from fossil fuels (TWh)  \
0       0                                 0.16   
1       0                                 0.09   
2       0                                 0.13   
3       0                                 0.31   
4       0                                 0.33   
5       0                                 0.34   
6       0                                 0.20   
7       0                                 0.20   

   Electricity from nuclear (TWh)  Electricity from renewables (TWh)  \
0                             0.0                               0.31   
1                             0.0                               0.50   
2                             0.0                               0.56   
3                             0.0                               0.63   
4                             0.0                               0.56   
5                             0.0                               0.59   
6                             0.0            

In [None]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
}
best_model = None
best_r2 = 0

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred= model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    submit = pd.DataFrame()
    submit['Actual CO2'] = y_test
    submit['Predict_CO2'] = y_pred
    submit = submit.reset_index()
    r2 = r2_score(y_test, y_pred)
    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print(submit.head(5))

    print('----------------------------------------')
print(f"The best performing model is: {best_model} with r2: {best_r2:.2f}")

Linear Regression:
R2 Score: 0.77
Mean Absolute Error (MAE): 80484.27
Root Mean Squared Error (RMSE): 367746.48
   index   Actual CO2  Predict_CO2
0      8  3559.999943  1145.039979
1      9  4880.000114  1729.268717
2     10  7110.000134  1671.365724
3     11  8930.000305  1289.656630
4     12  8079.999924  1617.491929
----------------------------------------
Random Forest:
R2 Score: -0.04
Mean Absolute Error (MAE): 147756.69
Root Mean Squared Error (RMSE): 781600.73
   index   Actual CO2  Predict_CO2
0      8  3559.999943  1061.999980
1      9  4880.000114  1585.599985
2     10  7110.000134  1695.799985
3     11  8930.000305  1443.599994
4     12  8079.999924  1577.899985
----------------------------------------
The best performing model is: LinearRegression with r2: 0.77


In [None]:
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print(f'Cross-Validation R2 Scores: {cv_scores}')

Cross-Validation R2 Scores: [-6.38067522e+02 -7.47446088e+00  1.65209751e-01             nan
             nan]


