1. Create a derived table called yearly_mean_temperature that contains the yearly temperature averages for all weather stations from the mean_temperature table. It should contain staid, yearly_temp and year as columns.

2. Bucketize tg values in the mean_temperature table. Use CASE to to return a column that will hold the value hot when the temperature is above 25 degrees, normal when between 10 and 25 and cold when under 10.

3. Using GROUP BY and a subquery show the yearly average of the maximum temperatures of all stations in the mean_temperature table.

In [1]:
import pandas as pd
from sqlalchemy import create_engine, text

In [2]:
from dotenv import dotenv_values

config = dotenv_values(".env")

In [3]:
# define variables for the login
username = config['USER']
password = config['PASS']
host = config['HOST']
port = config['PORT']

In [4]:
url = f'postgresql://{username}:{password}@{host}:{port}/climate'

In [5]:
engine = create_engine(url, echo=False)

#### Create a derived table called yearly_mean_temperature that contains the yearly temperature averages for all weather stations from the mean_temperature table. It should contain staid, yearly_temp and year as columns.

In [6]:
with engine.begin() as conn:
    conn.execute(text("""
    DROP TABLE IF EXISTS yearly_mean_temperature;
    CREATE TABLE yearly_mean_temperature AS
    SELECT staid , 
    AVG(tg) AS yearly_temp, 
    EXTRACT(YEAR FROM date) AS year
    FROM mean_temperature
    GROUP BY staid , year  
    """))
    result = conn.execute(text("SELECT * FROM yearly_mean_temperature"))
    data = result.fetchall()
    df_yearly_mean_temp = pd.DataFrame(data)

In [7]:
df_yearly_mean_temp

Unnamed: 0,staid,yearly_temp,year
0,1,0E-20,1860
1,1,0E-20,1862
2,1,0E-20,1864
3,1,0E-20,1865
4,1,0E-20,1866
...,...,...,...
250574,18317,-4.9726027397260274,2019
250575,18317,6.0737704918032787,2020
250576,18317,-6.6821917808219178,2021
250577,18317,1.9780821917808219,2022


#### Bucketize tg values in the mean_temperature table. Use CASE to to return a column that will hold the value hot when the temperature is above 25 degrees, normal when between 10 and 25 and cold when under 10.

In [8]:
with engine.begin() as conn:
    result= conn.execute(text("""
    SELECT staid, tg,
        CASE
            WHEN tg > 25 THEN 'Hot'
            WHEN tg BETWEEN 10 AND 25 THEN 'normal'
            ELSE 'cold'
        END AS temp_bucket
    FROM mean_temperature
    GROUP BY staid, tg ;
    
    """))
    data = result.all()
    df_temp_bucket = pd.DataFrame(data)

In [9]:
df_temp_bucket

Unnamed: 0,staid,tg,temp_bucket
0,11061,255,Hot
1,4409,19,normal
2,391,283,Hot
3,405,-255,cold
4,1410,231,Hot
...,...,...,...
2009458,8855,141,Hot
2009459,4291,-151,cold
2009460,18254,-74,cold
2009461,7996,-161,cold


#### Using GROUP BY and a subquery show the yearly average of the maximum temperatures of all stations in the mean_temperature table.

In [10]:
# The outer query selects from the result of the inner query. #for count

with engine.begin() as conn:
    result=conn.execute(text("""
    
    SELECT bucket_data.bucket_temp, count(bucket_data.*)
    
        FROM (
                SELECT staid, tg,
                            CASE
                                WHEN tg > 25 THEN 'Hot'
                                WHEN tg BETWEEN 10 AND 25 THEN 'normal'
                                ELSE 'cold'
                            END AS bucket_temp
                FROM mean_temperature
                GROUP BY staid, tg ) bucket_data
                
        GROUP BY bucket_temp;
        
        """))
    data = result.all()
    
pd.DataFrame(data) 

Unnamed: 0,bucket_temp,count
0,Hot,1043810
1,cold,894500
2,normal,71153


In [11]:
with engine.begin() as conn:
    result=conn.execute(text("""
    
    SELECT  year, 
    AVG(max_temp) AS avg_max_temp
    FROM (
            SELECT EXTRACT(YEAR FROM date) AS year, 
            MAX(tg) AS max_temp
            FROM mean_temperature
            GROUP BY EXTRACT(YEAR FROM date), staid 
            ) AS  subquery
                
        GROUP BY year
        ORDER BY avg_max_temp DESC;
        
        """))
    data = result.fetchall()
    
pd.DataFrame(data)

Unnamed: 0,year,avg_max_temp
0,1772,333.0000000000000000
1,1787,287.0000000000000000
2,1811,284.0000000000000000
3,1808,280.0000000000000000
4,1770,270.0000000000000000
...,...,...
261,1762,0E-20
262,1760,0E-20
263,1759,0E-20
264,1757,0E-20
