In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.impute import SimpleImputer

In [2]:

power_plants = pd.read_csv("https://github.com/wri/global-power-plant-database/raw/master/source_databases_csv/database_IND.csv")

In [3]:
power_plants

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,year_of_capacity_data,generation_gwh_2013,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_gwh_2019,generation_data_source,estimated_generation_gwh
0,IND,India,ACME Solar Tower,WRI1020239,2.5,28.1839,73.2407,Solar,,,...,,,,,,,,,,
1,IND,India,ADITYA CEMENT WORKS,WRI1019881,98.0,24.7663,74.6090,Coal,,,...,,,,,,,,,,
2,IND,India,AES Saurashtra Windfarms,WRI1026669,39.2,21.9038,69.3732,Wind,,,...,,,,,,,,,,
3,IND,India,AGARTALA GT,IND0000001,135.0,23.8712,91.3602,Gas,,,...,2019.0,,617.789264,843.747000,886.004428,663.774500,626.239128,,Central Electricity Authority,
4,IND,India,AKALTARA TPP,IND0000002,1800.0,21.9603,82.4091,Coal,Oil,,...,2019.0,,3035.550000,5916.370000,6243.000000,5385.579736,7279.000000,,Central Electricity Authority,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,IND,India,YERMARUS TPP,IND0000513,1600.0,16.2949,77.3568,Coal,Oil,,...,2019.0,,,0.994875,233.596650,865.400000,686.500000,,Central Electricity Authority,
903,IND,India,Yelesandra Solar Power Plant,WRI1026222,3.0,12.8932,78.1654,Solar,,,...,,,,,,,,,,
904,IND,India,Yelisirur wind power project,WRI1026776,25.5,15.2758,75.5811,Wind,,,...,,,,,,,,,,
905,IND,India,ZAWAR MINES,WRI1019901,80.0,24.3500,73.7477,Coal,,,...,,,,,,,,,,


In [4]:
power_plants.dropna(subset=['primary_fuel', 'capacity_mw'], inplace=True)

In [5]:
X = power_plants[['country_long', 'name', 'gppd_idnr', 'latitude', 'longitude', 'commissioning_year', 'owner', 'source']]
y_fuel = power_plants['primary_fuel']
y_capacity = power_plants['capacity_mw']

In [6]:
X_encoded = pd.get_dummies(X)

In [7]:
X_train_fuel, X_test_fuel, y_train_fuel, y_test_fuel = train_test_split(X_encoded, y_fuel, test_size=0.2, random_state=42)

In [8]:
X_train_cap, X_test_cap, y_train_cap, y_test_cap = train_test_split(X_encoded, y_capacity, test_size=0.2, random_state=42)

In [11]:
imputer = SimpleImputer(strategy='mean')  
X_train_fuel_imputed = imputer.fit_transform(X_train_fuel)

In [12]:
X_train_fuel.dropna(inplace=True)
y_train_fuel = y_train_fuel[X_train_fuel.index]

In [13]:
fuel_model = RandomForestClassifier(n_estimators=100, random_state=42)
fuel_model.fit(X_train_fuel, y_train_fuel)

In [15]:
imputer = SimpleImputer(strategy='mean')
X_train_cap_imputed = imputer.fit_transform(X_train_cap)

In [16]:
X_train_cap.dropna(inplace=True)
y_train_cap = y_train_cap[X_train_cap.index] 

In [17]:
capacity_model = RandomForestRegressor(n_estimators=100, random_state=42)
capacity_model.fit(X_train_cap, y_train_cap)

In [19]:
imputer = SimpleImputer(strategy='mean')
X_test_fuel_imputed = imputer.fit_transform(X_test_fuel)

In [20]:
X_test_fuel.dropna(inplace=True)
y_test_fuel = y_test_fuel[X_test_fuel.index]

In [22]:
y_pred_fuel = fuel_model.predict(X_test_fuel)
accuracy = accuracy_score(y_test_fuel, y_pred_fuel)
print("Accuracy:", accuracy)

Accuracy: 0.6818181818181818


In [24]:
imputer = SimpleImputer(strategy='mean') 
X_test_cap_imputed = imputer.fit_transform(X_test_cap)

In [25]:
X_test_cap.dropna(inplace=True)
y_test_cap = y_test_cap[X_test_cap.index] 

In [26]:
y_pred_cap = capacity_model.predict(X_test_cap)
mse = mean_squared_error(y_test_cap, y_pred_cap)
print("Mean Squared Error:", mse)

Mean Squared Error: 418261.45195159945
