In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error


In [2]:
# Locations of the CSV files to load. All three share one path prefix and
# differ only in the three-letter suffix of the file name.
_base_url = (
    'https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/'
    'Global%20Power%20Plant%20Database/database_'
)
urls = [f'{_base_url}{code}.csv' for code in ('AUS', 'IND', 'USA')]

In [3]:
# Read every CSV listed in `urls`, then stack the resulting frames vertically.
# ignore_index=True discards each file's own row labels so the combined frame
# gets one fresh 0..n-1 index.
df_list = list(map(pd.read_csv, urls))
df = pd.concat(df_list, ignore_index=True)


In [4]:
# These two columns are used as the prediction targets further down, so any
# row missing either of them is removed from `df` (in place).
required_columns = ['primary_fuel', 'capacity_mw']
df.dropna(subset=required_columns, inplace=True)

In [5]:
# Replace the categorical values with integer codes. Each fitted encoder is
# kept in its own variable so the same mapping can be applied again later
# (transform on new inputs, inverse_transform on predictions).
le_country = LabelEncoder().fit(df['country'])
df['country'] = le_country.transform(df['country'])

le_primary_fuel = LabelEncoder().fit(df['primary_fuel'])
df['primary_fuel'] = le_primary_fuel.transform(df['primary_fuel'])

In [6]:
# Feature matrix plus the two targets (encoded fuel class and capacity),
# all taken from the same frame so they share row labels.
feature_columns = ['country', 'latitude', 'longitude', 'commissioning_year']
X = df[feature_columns]
y_fuel, y_capacity = df['primary_fuel'], df['capacity_mw']

In [7]:
# Keep only the rows in which every feature column has a value.
X = X[X.notna().all(axis=1)]

In [8]:
# Select from both targets exactly the row labels that are present in X, so
# features and targets stay aligned row for row.
kept_rows = X.index
y_fuel = y_fuel.loc[kept_rows]
y_capacity = y_capacity.loc[kept_rows]

In [9]:
# Hold out 20% of the rows for testing. X and both targets go through a single
# call so all three are split on the same rows; random_state fixes the shuffle.
(
    X_train, X_test,
    y_fuel_train, y_fuel_test,
    y_capacity_train, y_capacity_test,
) = train_test_split(
    X, y_fuel, y_capacity,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit a random forest classifier on the training rows to predict the encoded
# primary fuel, then predict the held-out rows.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_fuel_train)
y_fuel_pred = rf_clf.predict(X_test)


In [11]:
# Fraction of held-out rows whose predicted fuel class equals the true one,
# reported as a percentage.
accuracy = accuracy_score(y_true=y_fuel_test, y_pred=y_fuel_pred)
print(f'Primary Fuel Prediction Accuracy: {accuracy * 100:.2f}%')

Primary Fuel Prediction Accuracy: 71.64%


In [12]:
# Fit a random forest regressor on the same training rows, this time with
# capacity as the target, then predict the held-out rows.
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_capacity_train)
y_capacity_pred = rf_reg.predict(X_test)

In [13]:
# Mean squared error of the capacity predictions on the held-out rows. The
# value is in squared units of the capacity column, so it is not directly
# comparable to a single plant's capacity.
mse = mean_squared_error(y_true=y_capacity_test, y_pred=y_capacity_pred)
print(f'Capacity Prediction Mean Squared Error: {mse:.2f}')

# One hypothetical plant to run through both models. 'country' is given as the
# raw code here and must be encoded with le_country before predicting.
new_plant = {
    'country': 'USA',
    'latitude': 34.0522,
    'longitude': -118.2437,
    'commissioning_year': 2020,
}
new_data = pd.DataFrame([new_plant])

Capacity Prediction Mean Squared Error: 84887.48


In [None]:
# Encode the country with the encoder that was fitted on the training data and
# store the result in a NEW frame, leaving `new_data` untouched.
# Overwriting new_data['country'] in place would make this cell fail on a
# second run: the column would already hold integer codes, which le_country
# (fitted on the raw country values) rejects as unseen labels.
new_data_encoded = new_data.assign(
    country=le_country.transform(new_data['country'])
)

# assign() keeps the column order of new_data, which matches the feature order
# the models were fitted with.
predicted_fuel = rf_clf.predict(new_data_encoded)
predicted_capacity = rf_reg.predict(new_data_encoded)


In [15]:
# Turn the predicted class code back into its fuel name before printing; the
# capacity prediction is an array with one entry for the single input row.
predicted_fuel_labels = le_primary_fuel.inverse_transform(predicted_fuel)
print(f'Predicted Primary Fuel: {predicted_fuel_labels}')
print(f'Predicted Capacity (MW): {predicted_capacity[0]:.2f}')

Predicted Primary Fuel: ['Solar']
Predicted Capacity (MW): 8.90
