In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer


In [34]:
# Read the power-plant dataset from a CSV file located in the working directory
file_path = "global_Power_plant_database.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the parse
print(data.head(n=5))


  country country_long                      name   gppd_idnr  capacity_mw  \
0     IND        India          ACME Solar Tower  WRI1020239          2.5   
1     IND        India       ADITYA CEMENT WORKS  WRI1019881         98.0   
2     IND        India  AES Saurashtra Windfarms  WRI1026669         39.2   
3     IND        India               AGARTALA GT  IND0000001        135.0   
4     IND        India              AKALTARA TPP  IND0000002       1800.0   

   latitude  longitude primary_fuel other_fuel1 other_fuel2  ...  \
0   28.1839    73.2407        Solar         NaN         NaN  ...   
1   24.7663    74.6090         Coal         NaN         NaN  ...   
2   21.9038    69.3732         Wind         NaN         NaN  ...   
3   23.8712    91.3602          Gas         NaN         NaN  ...   
4   21.9603    82.4091         Coal         Oil         NaN  ...   

                     geolocation_source  wepp_id year_of_capacity_data  \
0  National Renewable Energy Laboratory      NaN      

In [9]:
# Report how many NaN entries each column holds before anything is imputed
print(
    "Missing values in each column before imputation:",
    data.isna().sum(),
    sep="\n",
)


Missing values in each column before imputation:
country                       0
country_long                  0
name                          0
gppd_idnr                     0
capacity_mw                   0
latitude                     46
longitude                    46
primary_fuel                  0
other_fuel1                 709
other_fuel2                 907
other_fuel3                 908
commissioning_year          380
owner                       566
source                        0
url                           0
geolocation_source           19
wepp_id                     908
year_of_capacity_data       388
generation_gwh_2013         524
generation_gwh_2014         507
generation_gwh_2015         483
generation_gwh_2016         471
generation_gwh_2017         465
generation_data_source      458
estimated_generation_gwh    908
dtype: int64


In [14]:
# Step 4: Handle Missing Values

# Numeric candidates for imputation: float64 / int64 columns only
numeric_cols = list(data.select_dtypes(include=['float64', 'int64']).columns)
print("Original numeric columns:", numeric_cols)

# Columns without a single observed value have no mean to impute from
all_nan_columns = [name for name in numeric_cols if data[name].isnull().all()]
print("Columns with all NaN values:", all_nan_columns)

# Keep only the numeric columns that have at least one observed value
valid_numeric_cols = [name for name in numeric_cols if name not in all_nan_columns]

# Mean-impute the retained numeric columns.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed further down the notebook.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(data[valid_numeric_cols])
print("Shape of imputed values:", imputed_values.shape)

# Wrap the imputed array back into a DataFrame carrying the retained column names
data_imputed = pd.DataFrame(data=imputed_values, columns=valid_numeric_cols)

# Object-dtype columns are carried over untouched (they are NOT imputed here)
non_numeric_cols = list(data.select_dtypes(include=['object']).columns)

# Place the imputed numeric columns and the object columns side by side
data_final = pd.concat(
    [data_imputed, data[non_numeric_cols].reset_index(drop=True)],
    axis="columns",
)

# Only the numeric columns were imputed, so object columns can still report
# missing values in this summary
print("Missing values in each column after imputation:")
print(data_final.isna().sum())

# Report the dimensions of the combined frame
print(f"Shape of final data after imputation: {data_final.shape}")


Original numeric columns: ['capacity_mw', 'latitude', 'longitude', 'other_fuel3', 'commissioning_year', 'wepp_id', 'year_of_capacity_data', 'generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015', 'generation_gwh_2016', 'generation_gwh_2017', 'estimated_generation_gwh']
Columns with all NaN values: ['other_fuel3', 'wepp_id', 'estimated_generation_gwh']
Shape of imputed values: (908, 10)
Missing values in each column after imputation:
capacity_mw                 0
latitude                    0
longitude                   0
commissioning_year          0
year_of_capacity_data       0
generation_gwh_2013         0
generation_gwh_2014         0
generation_gwh_2015         0
generation_gwh_2016         0
generation_gwh_2017         0
country                     0
country_long                0
name                        0
gppd_idnr                   0
primary_fuel                0
other_fuel1               709
other_fuel2               907
owner                     566
source    

In [15]:
# Re-assemble the working frame: imputed numeric columns first, followed by the
# object-dtype columns taken unchanged from the raw data
non_numeric_cols = list(data.select_dtypes(include=['object']).columns)
data_final = pd.concat(
    [data_imputed, data[non_numeric_cols].reset_index(drop=True)],
    axis="columns",
)

# Report the dimensions of the combined frame
print(f"Shape of final data after combining: {data_final.shape}")

# Per-column NaN counts; object columns were not imputed and may still be non-zero
print("Missing values in each column after combining:")
print(data_final.isna().sum())


Shape of final data after combining: (908, 22)
Missing values in each column after combining:
capacity_mw                 0
latitude                    0
longitude                   0
commissioning_year          0
year_of_capacity_data       0
generation_gwh_2013         0
generation_gwh_2014         0
generation_gwh_2015         0
generation_gwh_2016         0
generation_gwh_2017         0
country                     0
country_long                0
name                        0
gppd_idnr                   0
primary_fuel                0
other_fuel1               709
other_fuel2               907
owner                     566
source                      0
url                         0
geolocation_source         19
generation_data_source    458
dtype: int64


In [36]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of data_final in place, keeping one
# fitted encoder per column so the codes can be mapped back to labels later.
# Values are stringified with astype(str) before fitting, so missing entries
# are encoded as a category of their own instead of remaining NaN.
label_encoders = {}
text_columns = data_final.select_dtypes(include=['object']).columns
for column in text_columns:
    column_as_text = data_final[column].astype(str)
    le = LabelEncoder()
    data_final[column] = le.fit_transform(column_as_text)
    label_encoders[column] = le  # keyed by column name for later inverse_transform


In [37]:
# Model Training for Primary Fuel Prediction
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Build the target vector and feature matrix for the primary-fuel classifier.
# X_fuel / y_fuel were used below without being defined anywhere in the
# notebook, so running it on a fresh kernel failed with NameError.
# NOTE(review): the original feature selection is not recorded in the notebook;
# every non-target column of data_final is used here — confirm this matches
# the intended feature set (identifier-like columns such as name / gppd_idnr /
# url are included as label-encoded integers).
y_fuel = data_final['primary_fuel']
X_fuel = data_final.drop(columns=['primary_fuel'])

# Hold out 20% of the rows; stratify on the target so each fuel class keeps
# the same proportion in the train and test partitions
X_train_fuel, X_test_fuel, y_train_fuel, y_test_fuel = train_test_split(
    X_fuel, y_fuel, test_size=0.2, random_state=42, stratify=y_fuel
)

# Check class distribution in both partitions
print("Training set class distribution:")
print(y_train_fuel.value_counts())
print("\nTest set class distribution:")
print(y_test_fuel.value_counts())

# Create and fit the model; class_weight='balanced' re-weights classes
# inversely to their frequency, random_state fixes the forest's randomness
rf_fuel = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_fuel.fit(X_train_fuel, y_train_fuel)

# Predict on the held-out rows
y_pred_fuel = rf_fuel.predict(X_test_fuel)

# Show which classes the model actually predicts (rare classes may be absent)
print("Predicted class distribution:")
print(pd.Series(y_pred_fuel).value_counts())

# Overall accuracy plus per-class precision/recall; zero_division=0 reports 0
# (instead of warning) for classes that never appear in the predictions
accuracy = accuracy_score(y_test_fuel, y_pred_fuel)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test_fuel, y_pred_fuel, zero_division=0))


Training set class distribution:
primary_fuel
1    207
3    200
6    102
7     98
2     55
0     40
5     17
4      7
Name: count, dtype: int64

Test set class distribution:
primary_fuel
1    52
3    50
7    25
6    25
2    14
0    10
5     4
4     2
Name: count, dtype: int64
Predicted class distribution:
3    61
1    57
7    26
6    25
0     7
2     5
5     1
Name: count, dtype: int64
Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.86      0.60      0.71        10
           1       0.89      0.98      0.94        52
           2       0.80      0.29      0.42        14
           3       0.82      1.00      0.90        50
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         4
           6       1.00      1.00      1.00        25
           7       0.96      1.00      0.98        25

    accuracy                           0.88       182
   macro avg       0.67      0.61      0.62       182


In [29]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 8: stratified 80/20 split of the fuel features and target
fuel_split = train_test_split(
    X_fuel,
    y_fuel,
    test_size=0.2,
    random_state=42,
    stratify=y_fuel,
)
X_train_fuel, X_test_fuel, y_train_fuel, y_test_fuel = fuel_split

# Fit a class-balanced Random Forest on the training partition
rf_fuel = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_fuel.fit(X_train_fuel, y_train_fuel)

# Score the held-out partition
y_pred_fuel = rf_fuel.predict(X_test_fuel)
accuracy = accuracy_score(y_test_fuel, y_pred_fuel)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1; undefined ratios are reported as 0
fuel_report = classification_report(y_test_fuel, y_pred_fuel, zero_division=0)
print(fuel_report)


Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.86      0.60      0.71        10
           1       0.89      0.98      0.94        52
           2       0.80      0.29      0.42        14
           3       0.82      1.00      0.90        50
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         4
           6       1.00      1.00      1.00        25
           7       0.96      1.00      0.98        25

    accuracy                           0.88       182
   macro avg       0.67      0.61      0.62       182
weighted avg       0.86      0.88      0.86       182



In [30]:
# Importing joblib to save the model
import joblib

# Persist the fitted Random Forest to disk in joblib format
model_filename = 'random_forest_fuel_prediction_model.joblib'
joblib.dump(value=rf_fuel, filename=model_filename)

# Confirm where the model was written
print(f"Model saved to {model_filename}")


Model saved to random_forest_fuel_prediction_model.joblib


In [31]:
# Round-trip check: restore the model from disk and score it on the same test
# partition used for the in-memory model
loaded_model = joblib.load(filename=model_filename)
loaded_y_pred_fuel = loaded_model.predict(X_test_fuel)

# Accuracy of the restored model on the held-out rows
loaded_accuracy = accuracy_score(y_true=y_test_fuel, y_pred=loaded_y_pred_fuel)
print(f"Loaded model accuracy: {loaded_accuracy:.2f}")


Loaded model accuracy: 0.88


In [32]:
import joblib

# Persist the fitted Random Forest again, this time under a .pkl file name
joblib.dump(value=rf_fuel, filename='random_forest_fuel_model.pkl')

# Confirm the write finished
print("Model saved successfully!")


Model saved successfully!


In [33]:
# Restore the .pkl copy of the model from disk
loaded_model = joblib.load(filename='random_forest_fuel_model.pkl')

# Predict on the held-out test features and print the raw array of encoded labels
y_pred_loaded = loaded_model.predict(X_test_fuel)
print("Predictions from the loaded model:", y_pred_loaded)


Predictions from the loaded model: [7 1 0 7 3 1 3 7 3 6 1 3 3 1 1 3 1 3 1 6 7 7 7 7 6 7 7 7 7 1 3 3 1 1 1 3 5
 1 2 1 0 6 3 1 1 1 3 3 7 3 3 7 3 1 6 3 3 1 3 6 6 3 1 3 7 3 7 3 3 3 1 3 0 7
 3 1 6 1 1 3 3 3 3 3 1 3 3 6 6 1 3 3 2 7 3 3 6 0 1 3 7 3 1 7 2 6 3 7 1 7 1
 1 3 3 1 0 1 6 1 6 1 0 3 1 1 1 6 1 3 1 1 3 3 3 1 7 3 2 1 3 3 6 3 1 1 6 3 0
 6 1 1 7 3 6 1 6 7 1 3 1 1 1 3 7 3 6 6 6 1 1 2 3 6 1 1 3 1 6 3 1 7 3]
