In [None]:
import pandas as pd
# Load the raw dataset and summarise how much of each column is missing.
data = pd.read_csv('global-data-on-sustainable-energy.csv')
length = len(data)

# Absolute count of missing cells per column, and the same as a percentage of rows.
missing_values = data.isna().sum()
missing_percentage = missing_values.div(length).mul(100)
print(missing_values)

In [None]:
# Per-column share of missing values, in percent (shown as the cell output).
missing_percentage

In [None]:
# Column dtypes as inferred by pandas; kept in `data_types` and printed as plain text.
data_types = data.dtypes
print(data_types)

In [None]:
def remove_functuation(density):
    """Return ``density`` with every comma removed.

    Parameters
    ----------
    density : str
        Text that may contain commas (e.g. ``'1,234'``).

    Returns
    -------
    str
        The same text without any ``','`` characters.
    """
    # Splitting on the comma and re-joining drops each comma and nothing else.
    return ''.join(density.split(','))
# Entity is handled as a plain string label.
data['Entity'] = data['Entity'].astype(str)

# Density: cast to text, strip commas with remove_functuation, then parse as float.
data['Density\\n(P/Km2)'] = (
    data['Density\\n(P/Km2)']
    .astype(str)
    .apply(remove_functuation)
    .astype(float)
)

In [None]:
#https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns
#https://www.geeksforgeeks.org/remove-multiple-elements-from-a-list-in-python/
columns = list(data.columns)

# These columns are excluded from mean imputation; rows that still hold a
# missing value in any column afterwards are dropped below.
_not_imputed = ('Entity', 'Year', 'Primary energy consumption per capita (kWh/person)')
feature_with_null = [name for name in columns if name not in _not_imputed]

# Fill gaps in the selected columns with each column's own mean (in place).
_column_means = data[feature_with_null].mean()
data.fillna(value=_column_means, inplace=True)

# Whatever is still missing cannot be imputed here, so remove those rows.
data.dropna(inplace=True)

# Re-count to confirm nothing is left missing.
missing_values = data.isna().sum()
print(missing_values)

In [None]:
# Work on an independent copy rather than an alias: later cells standardise and
# label-encode `data` in place, and with `df = data` those changes would
# silently show up in `df` as well.
df = data.copy()
df.head()

In [None]:
def rmOutliers(df, columns):
    """Drop rows lying outside the 1.5 * IQR fences, one column at a time.

    Columns are processed in the order given, and each filter is applied to
    the rows that survived the previous ones, so the quartiles of later
    columns are computed on the already-trimmed frame. Both fences are
    inclusive. Rows whose value is NaN in a checked column fail the
    comparison and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : iterable of str
        Names of the columns to screen.

    Returns
    -------
    pandas.DataFrame
        The filtered frame (``df`` itself when ``columns`` is empty).
    """
    kept = df
    for name in columns:
        values = kept[name]
        q_low, q_high = values.quantile(0.25), values.quantile(0.75)
        spread = q_high - q_low
        floor, ceiling = q_low - 1.5 * spread, q_high + 1.5 * spread
        # between() is inclusive on both ends, i.e. floor <= value <= ceiling.
        kept = kept[values.between(floor, ceiling)]
    return kept

# Columns screened for outliers by rmOutliers.
columns_to_check = [
    'Access to electricity (% of population)',
    'Access to clean fuels for cooking',
    'Renewable-electricity-generating-capacity-per-capita',
    'Financial flows to developing countries (US $)',
    'Renewable energy share in the total final energy consumption (%)',
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Primary energy consumption per capita (kWh/person)',
    'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
    'Value_co2_emissions_kt_by_country',
    'Renewables (% equivalent primary energy)',
    'gdp_growth',
    'gdp_per_capita',
]
print(columns_to_check)

# Filter each Entity group separately, so the IQR fences are computed per
# entity, then flatten the result back to a plain 0..n-1 index.
df_cleaned = (
    df.groupby('Entity')
    .apply(rmOutliers, columns_to_check)
    .reset_index(drop=True)
)

In [None]:
# Difference in non-null counts per column between the original and the
# outlier-filtered frame.
_removed_per_column = df.count() - df_cleaned.count()
print(f"Total Outliers Removed considering for each country are:{_removed_per_column}")

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Standardise every float/integer column of `data` and write the result back
# into the same columns.
scaler = StandardScaler()
columns = data.select_dtypes(include=['float', 'integer']).columns
_standardised = scaler.fit_transform(data[columns])
data[columns] = _standardised

In [None]:
# Replace the Entity labels with integer codes; the fitted encoder is kept so
# the codes can be mapped back with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
_entity_codes = label_encoder.fit_transform(data['Entity'])
data['Entity'] = _entity_codes

In [None]:
# Columns used as regression targets by the modelling cells.
target_variables = [
    'Value_co2_emissions_kt_by_country',
    'Renewable energy share in the total final energy consumption (%)',
]


In [None]:
# importing all the necessary models that are required to perform the regression
#https://stackoverflow.com/questions/59489830/select-best-parameters-for-regression-model-using-gridsearch
#https://www.kdnuggets.com/hyperparameter-tuning-gridsearchcv-and-randomizedsearchcv-explained
#https://www.analyticsvidhya.com/blog/2022/11/hyperparameter-tuning-using-randomized-search/
#https://dev.to/newbie_coder/decision-tree-regression-a-comprehensive-guide-with-python-code-examples-and-hyperparameter-tuning-1f0f
#https://stats.stackexchange.com/questions/269053/how-to-select-hyperparameters-for-svm-regression-after-grid-search
#https://www.geeksforgeeks.org/random-forest-hyperparameter-tuning-in-python/
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint,uniform

# Candidate regressors, all created with their default hyperparameters.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree Regression', DecisionTreeRegressor()),
    ('Support Vector Regressor', SVR()),
    ('Random Forest Regressor', RandomForestRegressor()),
])

# Search space per model, keyed by the same names as `models`.
# scipy conventions: randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_grids = {
    'Linear Regression': dict(
        fit_intercept=[True, False],
    ),
    'Decision Tree Regression': dict(
        max_depth=[None, 5, 10],
        min_samples_split=randint(2, 10),
        min_samples_leaf=randint(1, 4),
    ),
    'Support Vector Regressor': dict(
        kernel=('linear', 'rbf', 'poly'),
        C=uniform(1.5, 10),
        gamma=uniform(1e-7, 1e-4),
        epsilon=uniform(0.1, 0.5),
    ),
    'Random Forest Regressor': dict(
        n_estimators=randint(25, 150),
        max_features=['sqrt', 'log2', None],
        max_depth=randint(3, 9),
        max_leaf_nodes=randint(3, 9),
    ),
}

In [None]:
# Preview the first rows of `data` as it stands at this point in the notebook.
data.head()

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

def evaluate_model_with_base_parameters(models_dict, random_state=None):
    """Tune and score every model in ``models_dict`` for each target variable.

    Reads the module-level ``data``, ``target_variables`` and ``param_grids``.
    For each target and each ``(name, model)`` pair the function:

    1. chooses the feature columns: for the ``'Support Vector Regressor'``
       entry, the numeric columns whose absolute correlation with the target
       exceeds 0.3; for every other entry, the 10 columns selected by RFE
       (with ``'Entity'`` added if RFE did not pick it);
    2. makes an 80/20 train/test split;
    3. fits ``model`` once with its current parameters;
    4. runs a 4-iteration, 3-fold ``RandomizedSearchCV`` scored on R2 over
       ``param_grids[name]``;
    5. prints the best parameters plus MSE, MAE and R2 of the best estimator
       on the test split.

    Parameters
    ----------
    models_dict : dict
        Maps a display name (also a key of ``param_grids``) to an estimator.
    random_state : int or None, optional
        Seed passed to both ``train_test_split`` and ``RandomizedSearchCV``.
        The default ``None`` leaves them unseeded, as before.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # NOTE(review): feature selection below (correlation and RFE) is computed
    # on all rows of `data`, i.e. before the train/test split, so the test rows
    # influence which features are chosen. Confirm whether that is acceptable.
    for target in target_variables:
        print(f"Below are the metrics for the target variable: {target}")
        y = data[target]

        for name, model in models_dict.items():
            if name == 'Support Vector Regressor':
                # RFE is skipped here: it needs coef_ or feature_importances_,
                # which SVR with its default kernel does not expose.
                correlation_matrix = data.select_dtypes("number").corr()
                target_features = correlation_matrix[target].drop(target)
                target_features = target_features[abs(target_features) > 0.3].index.tolist()
                # Keep the correlated columns. Dropping them instead would
                # leave the target column itself among the features.
                X = data[target_features]
            else:
                # Recursive feature elimination over every non-target column.
                rfe = RFE(estimator=model, n_features_to_select=10)
                X = data.drop(columns=target_variables)
                rfe.fit(X, y)
                # Rank 1 marks the features RFE selected.
                final_features = X.columns[rfe.ranking_ == 1].tolist()
                if 'Entity' not in final_features:
                    final_features.append('Entity')
                X = X[final_features]

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_state
            )

            # Fit with the model's current parameters. The search below tunes
            # its own clone, so this only leaves the passed-in estimator fitted.
            model.fit(X_train, y_train)

            grid_search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_grids[name],
                cv=3,
                scoring='r2',
                n_iter=4,
                random_state=random_state,
            )
            grid_search.fit(X_train, y_train)

            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            print(f"Model: {name}")
            print(f"Best Parameters: {grid_search.best_params_}")
            print(f"Mean squared error is: {mse}")
            print(f"Mean absolute error is: {mae}")
            print(f"R2 score error is: {r2}")
            print("\n")


# Run the tune-and-evaluate loop for every estimator in `models`; the metrics
# are printed per target variable and per model.
evaluate_model_with_base_parameters(models)


# Neural Network Training

In [None]:
%pip install tensorflow

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Features for the neural network: numeric columns whose absolute correlation
# with the first target is above 0.3 (the target itself is excluded).
_nn_target = target_variables[0]
correlation_matrix = data.select_dtypes(include="number").corr()
_corr_with_target = correlation_matrix[_nn_target].drop(labels=_nn_target)
target_features = _corr_with_target[_corr_with_target.abs() > 0.3].index.tolist()

In [None]:
# Feature matrix and target vector for the neural network.
X = data.loc[:, target_features]
y = data[target_variables[0]]

# Fixed-seed 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Size the input layer from the training matrix instead of hard-coding 4: the
# number of features comes from a correlation filter, so it depends on the data,
# and a mismatch would make fit() fail on the input shape.
n_features = X_train.shape[1]

# Small fully connected regressor: one hidden ReLU layer and a single linear output.
neural_network = keras.Sequential([
    layers.Input(shape=(n_features,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])

In [None]:
# Train with Adam on mean squared error; no extra metrics are tracked.
neural_network.compile(
    loss='mean_squared_error',
    optimizer='adam',
)

In [None]:
#https://www.tensorflow.org/tutorials/keras/keras_tuner
# Disabled Keras Tuner sketch: it is wrapped in a bare string literal, so none
# of it executes.
# NOTE(review): before enabling, check the Hyperband arguments against the
# Keras Tuner docs. It is handed the already-built `neural_network` and uses
# the objective 'val_accuracy', while the model is compiled with only an MSE
# loss and trained without validation data. Confirm what the tuner expects.
'''
import keras_tuner as kt
tuner = kt.Hyperband(neural_network,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')'''

In [None]:
# Train for 40 epochs in mini-batches of 32 samples.
neural_network.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=40,
)

In [None]:
y_pred = neural_network.predict(X_test)

# Score the held-out predictions with the same three metrics used for the
# scikit-learn models.
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean squared error (same as neural_network.evaluate()) is: {mse}")
print(f"Mean absolute error is: {mae}")
print(f"R2 score error is: {r2}")

In [None]:
# Loss on the held-out split, as computed by Keras itself (the compiled MSE loss).
test_loss = neural_network.evaluate(
    x=X_test,
    y=y_test,
)
print(f"Test Loss: {test_loss:.4f}")