In [None]:
import pandas as pd
import numpy as np
data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

energy_df = pd.DataFrame(data)
print(energy_df)

cleaned_energy_df = energy_df.dropna()
print(cleaned_energy_df)


print("Original DF : ")
energy_df.head()

  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                       NaN             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
4       Biomass                    2500.0             250.0
Original DF : 


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0


In [None]:
ec_mean = energy_df["Energy Consumption (MWh)"].mean()
#energy_df["Energy Consumption (MWh)"].fillna(ec_mean, inplace=True)
print(ec_mean)

cost_mean = energy_df["Cost (Million $)"].mean()
#energy_df["Cost (Million $)"].fillna(cost_mean, inplace=True)
print(cost_mean)

2450.0
250.0


In [None]:
energy_df["Energy Consumption (MWh)"].fillna(ec_mean, inplace=True)
energy_df["Cost (Million $)"].fillna(cost_mean, inplace=True)
energy_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Energy Consumption (MWh)"].fillna(ec_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Cost (Million $)"].fillna(cost_mean, inplace=True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [None]:
forward_filled_df=energy_df.fillna(method="ffill")
forward_filled_df.head()

  forward_filled_df=energy_df.fillna(method="ffill")


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [None]:
# Create a flag column indicating missing values in 'Energy Consumption (MWh)'
energy_df["Missing Consumption"] = energy_df["Energy Consumption (MWh)"].isna().astype(int)
energy_df["Missing Cost"] = energy_df["Cost (Million $)"].isna().astype(int)

print("\nData with Missing Values Flagged:")
energy_df.head()


Data with Missing Values Flagged:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $),Missing Consumption,Missing Cost
0,Solar,1200.0,200.0,0,0
1,Wind,2450.0,400.0,0,0
2,Hydropower,2900.0,250.0,0,0
3,Geothermal,2450.0,150.0,0,0
4,Biomass,2500.0,250.0,0,0


In [None]:
#min max scaling
# Get the 'Energy Consumption (MWh)' column
energy_consumption = energy_df['Energy Consumption (MWh)']

# Calculate the minimum and maximum values
min_value = energy_consumption.min()
max_value = energy_consumption.max()

# Apply the Min-Max scaling formula
scaled_data_minmax = (energy_consumption - min_value) / (max_value - min_value)

# Add the scaled data as a new column to the DataFrame
energy_df['Scaled Energy Consumption (MinMax)'] = scaled_data_minmax



In [None]:
#standard scaling
# Get the 'Energy Consumption (MWh)' column
energy_consumption = energy_df['Energy Consumption (MWh)']

# Calculate the mean and standard deviation
mean = energy_consumption.mean()
std_dev = energy_consumption.std()

# Apply the Standard scaling formula
scaled_data_standard = (energy_consumption - mean) / std_dev

# Add the scaled data as a new column to the DataFrame
energy_df['Scaled Energy Consumption (Standard)'] = scaled_data_standard

In [None]:
# Display the original and scaled data for 'Energy Consumption (MWh)'
print("Original Data:")
print(energy_df[['Energy Source', 'Energy Consumption (MWh)']])

print("\nData with Min-Max Scaling:")
print(energy_df[['Energy Source', 'Scaled Energy Consumption (MinMax)']])

print("\nData with Standard Scaling:")
print(energy_df[['Energy Source', 'Scaled Energy Consumption (Standard)']])

Original Data:
  Energy Source  Energy Consumption (MWh)
0         Solar                    1200.0
1          Wind                    2450.0
2    Hydropower                    2900.0
3    Geothermal                    2450.0
4       Biomass                    2500.0
5       Nuclear                    3200.0

Data with Min-Max Scaling:
  Energy Source  Scaled Energy Consumption (MinMax)
0         Solar                               0.000
1          Wind                               0.625
2    Hydropower                               0.850
3    Geothermal                               0.625
4       Biomass                               0.650
5       Nuclear                               1.000

Data with Standard Scaling:
  Energy Source  Scaled Energy Consumption (Standard)
0         Solar                             -1.831121
1          Wind                              0.000000
2    Hydropower                              0.659204
3    Geothermal                              0.000000


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assuming your DataFrame is named 'energy_df' and the column to scale is 'Energy Consumption (MWh)'
# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler to your data
scaler.fit(energy_df[['Energy Consumption (MWh)']])  # Use double brackets to select the column as a DataFrame

# Transform the data
scaled_data = scaler.transform(energy_df[['Energy Consumption (MWh)']])

# Add the scaled data as a new column to the DataFrame
energy_df['Scaled Energy Consumption (MinMax)'] = scaled_data

# Display the scaled data
print(energy_df[['Energy Source', 'Scaled Energy Consumption (MinMax)']])

  Energy Source  Scaled Energy Consumption (MinMax)
0         Solar                               0.000
1          Wind                               0.625
2    Hydropower                               0.850
3    Geothermal                               0.625
4       Biomass                               0.650
5       Nuclear                               1.000


In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(energy_df[['Energy Consumption (MWh)']])
energy_df['Scaled Energy Consumption (Standard)'] = scaled_data
print(energy_df[['Energy Source', 'Scaled Energy Consumption (Standard)']])

  Energy Source  Scaled Energy Consumption (Standard)
0         Solar                             -2.005893
1          Wind                              0.000000
2    Hydropower                              0.722121
3    Geothermal                              0.000000
4       Biomass                              0.080236
5       Nuclear                              1.203536
