Pandas

In [2]:
import pandas as pd

# Names of the renewable energy sources, kept as a plain Python list
renewable_sources = ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass"]

# A Series is a one-dimensional labelled array; built from a list it gets
# the default integer labels 0..4
renewable_series = pd.Series(renewable_sources)

# Column-oriented project records: each key is a column name and each list
# holds one value per project (used to build a DataFrame in a later cell)
data = {
    "Project": [
        "Solar Farm A",
        "Wind Turbine X",
        "Hydropower Y",
        "Solar Roof Z",
        "Geothermal Plant P",
    ],
    "Technology": ["Solar", "Wind", "Hydropower", "Solar", "Geothermal"],
    # Installed capacity in megawatts
    "Capacity (MW)": [150, 300, 200, 50, 100],
    # Project cost in millions of dollars
    "Cost (Million $)": [200, 400, 350, 100, 250],
    "Location": ["California", "Texas", "Washington", "Nevada", "Idaho"],
    "Completion Year": [2023, 2024, 2022, 2025, 2023],
}

# Heading and Series on separate lines, exactly as two print() calls would emit
print("Renewable Energy Sources:", renewable_series, sep="\n")

Renewable Energy Sources:
0         Solar
1          Wind
2    Hydropower
3    Geothermal
4       Biomass
dtype: object


In [9]:
# Build the projects table: every key of `data` becomes a column
project_df = pd.DataFrame.from_dict(data)

print("\nGreen Technology Projects DataFrame:")
# .head(n) returns the first n rows; as the last expression of the cell
# the result is rendered as the cell's output
project_df.head(n=5)


Green Technology Projects DataFrame:


Unnamed: 0,Project,Technology,Capacity (MW),Cost (Million $),Location,Completion Year
0,Solar Farm A,Solar,150,200,California,2023
1,Wind Turbine X,Wind,300,400,Texas,2024
2,Hydropower Y,Hydropower,200,350,Washington,2022
3,Solar Roof Z,Solar,50,100,Nevada,2025
4,Geothermal Plant P,Geothermal,100,250,Idaho,2023


In [11]:
# Show the last 3 rows of the table
project_df.tail(n=3)

Unnamed: 0,Project,Technology,Capacity (MW),Cost (Million $),Location,Completion Year
2,Hydropower Y,Hydropower,200,350,Washington,2022
3,Solar Roof Z,Solar,50,100,Nevada,2025
4,Geothermal Plant P,Geothermal,100,250,Idaho,2023


In [14]:
# Row slicing by position: rows 2 and 3 (the end position 4 is excluded)
project_df.iloc[2:4]

Unnamed: 0,Project,Technology,Capacity (MW),Cost (Million $),Location,Completion Year
2,Hydropower Y,Hydropower,200,350,Washington,2022
3,Solar Roof Z,Solar,50,100,Nevada,2025


In [18]:
# Data type of each column: the text columns are stored as `object`,
# the whole-number columns as `int64`
project_df.dtypes

Project             object
Technology          object
Capacity (MW)        int64
Cost (Million $)     int64
Location            object
Completion Year      int64
dtype: object

In [20]:
# Dimensions of the DataFrame as a (number of rows, number of columns) tuple
project_df.shape

(5, 6)

In [22]:
# Column labels of the DataFrame, returned as an Index object
project_df.columns

Index(['Project', 'Technology', 'Capacity (MW)', 'Cost (Million $)',
       'Location', 'Completion Year'],
      dtype='object')

In [24]:
# Concise summary of the DataFrame: index range, non-null count and dtype
# of every column, and memory usage
project_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Project           5 non-null      object
 1   Technology        5 non-null      object
 2   Capacity (MW)     5 non-null      int64 
 3   Cost (Million $)  5 non-null      int64 
 4   Location          5 non-null      object
 5   Completion Year   5 non-null      int64 
dtypes: int64(3), object(3)
memory usage: 372.0+ bytes


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
project_df.describe()

Unnamed: 0,Capacity (MW),Cost (Million $),Completion Year
count,5.0,5.0,5.0
mean,160.0,260.0,2023.4
std,96.17692,119.373364,1.140175
min,50.0,100.0,2022.0
25%,100.0,200.0,2023.0
50%,150.0,250.0,2023.0
75%,200.0,350.0,2024.0
max,300.0,400.0,2025.0


In [30]:
# Selecting a single column with [] gives back a pandas Series
type(project_df["Project"])

pandas.core.series.Series

In [44]:
# Passing a list of column names to [] selects several columns at once
# and returns a DataFrame
project_df[["Project", "Capacity (MW)"]]

Unnamed: 0,Project,Capacity (MW)
0,Solar Farm A,150
1,Wind Turbine X,300
2,Hydropower Y,200
3,Solar Roof Z,50
4,Geothermal Plant P,100


In [45]:
# Position-based selection: the first three rows, and the columns at
# positions 1 and 4 (Technology and Location)
project_df.iloc[0:3, [1, 4]]

Unnamed: 0,Technology,Location
0,Solar,California
1,Wind,Texas
2,Hydropower,Washington


In [41]:
# Boolean-mask filtering: keep only the projects whose capacity is
# greater than 100 MW
high_capacity_project = project_df.loc[project_df["Capacity (MW)"] > 100]

# Heading and table on separate lines, exactly as two print() calls would emit
print("\nProject with Capacity Greater than 100 MW:", high_capacity_project, sep="\n")


Project with Capacity Greater than 100 MW:
          Project  Technology  Capacity (MW)  Cost (Million $)    Location  \
0    Solar Farm A       Solar            150               200  California   
1  Wind Turbine X        Wind            300               400       Texas   
2    Hydropower Y  Hydropower            200               350  Washington   

   Completion Year  
0             2023  
1             2024  
2             2022  


In [42]:
# Feature engineering: derive a new column from two existing ones.
# The column is added to project_df itself, so every cell run after this
# one sees the extra "Cost per MW" column.
project_df["Cost per MW"] = (
    project_df["Cost (Million $)"] / project_df["Capacity (MW)"]
)

print("\nDataFrame with Cost per MW:")
project_df.head(n=5)


DataFrame with Cost per MW:


Unnamed: 0,Project,Technology,Capacity (MW),Cost (Million $),Location,Completion Year,Cost per MW
0,Solar Farm A,Solar,150,200,California,2023,1.333333
1,Wind Turbine X,Wind,300,400,Texas,2024,1.333333
2,Hydropower Y,Hydropower,200,350,Washington,2022,1.75
3,Solar Roof Z,Solar,50,100,Nevada,2025,2.0
4,Geothermal Plant P,Geothermal,100,250,Idaho,2023,2.5


In [47]:
# Count the missing (null) values in each column; a count of 0 means the
# column has no missing values
project_df.isna().sum()

Project             0
Technology          0
Capacity (MW)       0
Cost (Million $)    0
Location            0
Completion Year     0
Cost per MW         0
dtype: int64

Aggregating and grouping data:

In [49]:
# Overall totals: add up every value in the capacity and cost columns
total_capacity = project_df.loc[:, "Capacity (MW)"].sum()
total_cost = project_df.loc[:, "Cost (Million $)"].sum()

# One print call, one line per total (the first is preceded by a blank line)
print(
    f"\nTotal Capacity of all projects: {total_capacity} MW",
    f"Total Cost of all projects: ${total_cost} million",
    sep="\n",
)


Total Capacity of all projects: 800 MW
Total Cost of all projects: $1300 million


In [None]:
# Split the projects by 'Technology', then add up the capacity within
# each group; the result is a Series indexed by technology
grouped_data = (
    project_df
    .groupby("Technology")["Capacity (MW)"]
    .sum()
)

# Heading and result on separate lines, exactly as two print() calls would emit
print("\nTotal Capacity by Technology:", grouped_data, sep="\n")


Total Capacity by Technology:
Technology
Geothermal    100
Hydropower    200
Solar         200
Wind          300
Name: Capacity (MW), dtype: int64
