In [2]:
# Setup
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import ipywidgets as widgets
import cartopy, os, re

from filter import code_keydic
from IPython.display import display

# Global Climate Change

## Background
Climate change is defined as the long-term change in Earth's climate and weather patterns. Climate is often defined as the average weather at a particular place, or as the mean state and variability of features such as temperature, precipitation, and humidity over some extended time period. Shifts in the climate can be natural, such as changes in the sun's activity, or they can be anthropogenic. The extent of the current climate change can largely be attributed to human activity such as the burning of fossil fuels like coal, oil, and gas. The burning of fossil fuels generates greenhouse gases, which trap the sun's heat and therefore raise the temperature. The average temperature of the Earth's surface is now about 1.1 °C warmer than it was before the industrial revolution. Climate change means more than just warmer temperatures; it also includes, among others, intense droughts, severe fires, rising sea levels, and more intense storms.

## Goals
The goal of this project is to examine the temperature and precipitation aspects of climate data to understand global and regional trends and the relation between climate variables and greenhouse gas emissions. Furthermore, it aims to model potential future scenarios based on current trends over different time horizons.
    
1. Are there discernible trends in temperature and precipitation over the given time period, globally and across different regions/countries?
2. Are there regions/countries that are experiencing more drastic changes compared to others?
3. Are there any extreme weather events that show a significant increase in frequency or intensity over time?
4. Are there any discernible trends in temperature and precipitation that differ between developed and developing countries?
5. Can the data be compared with greenhouse gas emissions data to assess the relationship between emissions and climate variables?
6. If the current trends shown in the data continue without any significant change, where would we be in 10/50/100 years from now?


# Data Gathering
## Dataset
The main dataset used in this project is the Global Historical Climatology Network - Daily (GHCN-Daily), Version 3, sourced from the National Oceanic and Atmospheric Administration. The dataset is available for public use, with the only limitations being that the "*NOAA and NCEI cannot provide any warranty as to the accuracy, reliability, or completeness of furnished data. Users assume responsibility to determine the usability of these data. The user is responsible for the results of any application of this data for other than its intended purpose.*"

### Dataset Metadata
File Size - 131 GB  <br>
Number of Files - 125,391

#### Measurements:
1. Precipitations (tenths of mm, PRCP)
2. Temperature Max (tenths of degrees C, TMAX)
3. Temperature Min (tenths of degrees C, TMIN)
4. Temperature Average (tenths of degrees C, TAXN)
5. Snowfall (mm, SNOW)
6. Snow Depth (mm, SNWD)

### Data Range
>Most Frequent Start Date: 1901-01-01      *(count: 1715)* <br>
>Most Frequent End Date: 2024-02-08    *(count: 11137)*
>
>
>Date Range: 1901-01-01 ~ 2023-12-31

# Data Cleaning

## Filtering Out Irrelevant and Empty Columns

In [None]:

def filter_csv(input_file, columns):
    """Load a CSV as text and keep only the requested columns it contains.

    Parameters
    ----------
    input_file : path or buffer handed straight to ``pd.read_csv``.
    columns : iterable of column names to keep. Names the file does not
        have are skipped rather than raising.

    Returns
    -------
    pandas.DataFrame
        The surviving columns, ordered as in ``columns``. Every value is
        read with ``dtype=str``.
    """
    raw = pd.read_csv(input_file, dtype=str)
    available = raw.columns.to_list()

    # Walk the caller's list (not the file's) so the output follows the
    # requested ordering.
    wanted = []
    for name in columns:
        if name in available:
            wanted.append(name)

    return raw[wanted]

def check_empty_columns(df):
    """Return a copy of ``df`` without the columns whose values are all missing.

    Columns holding at least one non-missing value are kept unchanged; the
    input frame itself is not modified.
    """
    return df.dropna(axis="columns", how="all")


def filter_columns():
    """Strip every CSV in ``Data`` down to the columns this project uses.

    For each file in the input directory the function keeps only the
    measurement/date/station columns listed below (via ``filter_csv``), drops
    columns that are entirely empty (via ``check_empty_columns``) and writes
    the result under the same file name into ``Filtered Data``.

    Paths are built with ``os.path.join`` instead of concatenating a
    backslash-terminated string, so the step also runs on non-Windows hosts.
    The output directory is created when missing.

    NOTE(review): a later cell in this notebook defines another
    ``filter_columns(df)`` with a different signature, which replaces this
    function once that cell has run. Consider renaming one of the two.
    """
    # column names to keep from each csv file
    col = ["TMAX", 'TMIN', 'TAXN', 'PRCP', 'SNOW', 'SNWD', 'DATE', 'STATION']

    input_loc = 'Data'
    output_loc = 'Filtered Data'

    # DataFrame.to_csv does not create missing parent directories
    os.makedirs(output_loc, exist_ok=True)

    for item in os.listdir(input_loc):
        df = filter_csv(os.path.join(input_loc, item), col)

        filter_df = check_empty_columns(df)

        filter_df.to_csv(os.path.join(output_loc, item), index=False)


## Filtering Out Files Without Requisite Data

In [None]:
def filter_columns(df):
    """Classify a station frame by the measurement columns it carries.

    "Temperature" means either both ``TMAX`` and ``TMIN`` are present, or
    ``TAXN`` is present.

    Returns
    -------
    str
        ``'pt'`` - temperature plus all of ``PRCP``, ``SNOW`` and ``SNWD``.
        ``'tp'`` - temperature, but at least one of those three is missing.
        ``'pr'`` - no temperature columns (returned regardless of which
        other columns exist).
    """
    present = set(df.columns.to_list())

    has_temperature = {"TMAX", "TMIN"} <= present or "TAXN" in present
    if not has_temperature:
        return 'pr'

    has_precip_and_snow = {"PRCP", 'SNOW', 'SNWD'} <= present
    return 'pt' if has_precip_and_snow else 'tp'

def run_filter():
    """Sort every CSV in ``Data`` into a sub-folder of ``Filtered Data``.

    Each file is read as text, classified with ``filter_columns(df)`` and
    written unchanged (same file name, no index column) into the folder that
    matches its classification key:

    * ``'pt'`` -> ``Filtered Data/Perfect``
    * ``'tp'`` -> ``Filtered Data/Temperature``
    * ``'pr'`` -> ``Filtered Data/Precipitation``

    The three destination folders are created when missing. A file whose key
    is none of the above is not written anywhere.
    """
    input_loc = 'Data'
    output_loc = 'Filtered Data'

    # classification key -> destination sub-folder
    folder_for_key = {
        'pt': 'Perfect',
        'tp': 'Temperature',
        'pr': 'Precipitation',
    }

    # DataFrame.to_csv does not create missing parent directories
    for folder in folder_for_key.values():
        os.makedirs(os.path.join(output_loc, folder), exist_ok=True)

    for item in os.listdir(input_loc):
        filter_df = pd.read_csv(os.path.join(input_loc, item), dtype=str)

        key = filter_columns(filter_df)

        folder = folder_for_key.get(key)
        if folder is None:
            continue

        filter_df.to_csv(os.path.join(output_loc, folder, item), index=False)

## Filtering out Data Over Defined Threshold

In [None]:
def code_keydic(code_loc=None):
    """Build a ``{country code: country name}`` lookup from a text file.

    Each usable line has the form ``"<code> <name>"``: everything before the
    first space is the code, the rest is the name. Bracketed annotations
    (``[...]``) are removed from the name, runs of whitespace left behind are
    collapsed to one space, and surrounding whitespace is stripped.

    Blank lines and lines without a space-separated name are skipped instead
    of aborting the whole load with a ``ValueError``.

    Parameters
    ----------
    code_loc : str, optional
        Path of the code table. Defaults to ``Other/ghcnd-countries.txt``
        relative to the working directory.

    Returns
    -------
    dict
        Maps each code to its cleaned country name. When a code occurs more
        than once, the last occurrence wins.
    """
    if code_loc is None:
        code_loc = os.path.join('Other', 'ghcnd-countries.txt')

    code_key = {}

    with open(code_loc) as f:
        for line in f:
            if not line.strip():
                continue

            parts = line.split(' ', 1)
            if len(parts) != 2:
                # a code with no name after it - nothing to map
                continue
            code, country = parts

            if '[' in country:
                country = re.sub(r'\[[^\]]*\]', '', country)
                country = re.sub(r'\s{2,}', ' ', country)

            code_key[code] = country.strip()

    return code_key

def code_to_country(txt, key):
    """Look up ``txt`` in ``key`` using only its first two characters.

    Parameters
    ----------
    txt : str
        Any string whose first two characters are the lookup code.
    key : mapping
        Lookup table indexed by that two-character prefix.

    Raises
    ------
    KeyError
        If the prefix is not present in ``key``.
    """
    return key[txt[:2]]

def filter_threshold():
    """Move station files whose extremes stay within hemisphere records.

    For every file in ``Filtered Data/Perfect`` the country is derived from
    the first two characters of the file name. The country is then matched
    against three name lists (both hemispheres, northern, southern - checked
    in that order) to pick the record limits from
    ``Other/Table_Extreme_Records_Hemisphere.csv``:

    * northern / southern countries use that hemisphere's TMAX, TMIN and PRCP
      ``Value``;
    * countries in both hemispheres use the larger TMAX, the smaller TMIN and
      the larger PRCP of the two.

    A file is moved to ``Filtered Data/Perfect Threshold`` only when its
    ``TMAX`` maximum and ``PRCP`` maximum do not exceed the limit and its
    ``TMIN`` minimum does not fall below the limit. File values are divided
    by 10 before the comparison. Files that fail, or whose country is in none
    of the three lists, are left where they are.

    Limits are stored per hemisphere by name, so the outcome no longer
    depends on whether the Northern rows precede the Southern rows in the
    table. If a hemisphere/characteristic pair is listed more than once, the
    first row is used.

    Raises
    ------
    KeyError
        If a file name's country code is unknown, if a file lacks one of the
        ``TMAX``/``TMIN``/``PRCP`` columns, or if the table has no row for a
        required hemisphere/characteristic pair.
    """
    data_loc = os.path.join('Filtered Data', 'Perfect')
    output_loc = os.path.join('Filtered Data', 'Perfect Threshold')
    thershold_datapath = os.path.join('Other', 'Table_Extreme_Records_Hemisphere.csv')

    def read_names(filename):
        # one country name per line; a set is enough for the membership tests
        with open(os.path.join('Other', filename)) as f:
            return {line.strip() for line in f}

    # countries in each hemisphere
    whole = read_names('both_hemisphere.txt')
    nth_hs = read_names('northern_hemisphere.txt')
    sth_hs = read_names('southern_hemisphere.txt')

    # country code dic
    countrycode_dic = code_keydic()

    # limits[characteristic][hemisphere] -> record value from the table
    thres_df = pd.read_csv(thershold_datapath)
    limits = {'TMAX': {}, 'TMIN': {}, 'PRCP': {}}

    for _, row in thres_df.iterrows():
        characteristic, hemisphere = row['Characteristic'], row['Hemisphere']
        if characteristic in limits and hemisphere in ('Northern', 'Southern'):
            limits[characteristic].setdefault(hemisphere, row['Value'])

    def limits_for(country_name):
        # Returns (tmax_limit, tmin_limit, prcp_limit), or None when the
        # country appears in none of the hemisphere lists.
        if country_name in whole:
            return (max(limits['TMAX'].values()),
                    min(limits['TMIN'].values()),
                    max(limits['PRCP'].values()))
        if country_name in nth_hs:
            hemisphere = 'Northern'
        elif country_name in sth_hs:
            hemisphere = 'Southern'
        else:
            return None
        return (limits['TMAX'][hemisphere],
                limits['TMIN'][hemisphere],
                limits['PRCP'][hemisphere])

    # os.replace needs the destination directory to exist
    os.makedirs(output_loc, exist_ok=True)

    for item in os.listdir(data_loc):
        code = item[:2]
        country_name = code_to_country(code, countrycode_dic)

        bounds = limits_for(country_name)
        if bounds is None:
            continue
        tmax_limit, tmin_limit, prcp_limit = bounds

        orgi_loc = os.path.join(data_loc, item)
        df = pd.read_csv(orgi_loc)

        if df['TMAX'].max()/10 > tmax_limit: continue
        if df['TMIN'].min()/10 < tmin_limit: continue
        if df['PRCP'].max()/10 > prcp_limit: continue

        # move files after data is validated
        new_loc = os.path.join(output_loc, item)
        os.replace(orgi_loc, new_loc)

## Handling Missing Data

In [None]:
def handle_missing_data(data_loc=None, output_loc=None, max_nan_percentage=50):
    """Move files whose columns are all sufficiently complete.

    Every CSV in ``data_loc`` is read and the percentage of missing values is
    computed per column. The file is moved to ``output_loc`` only when no
    column exceeds ``max_nan_percentage``; otherwise it stays in place. A
    file with no rows is also left in place, since it holds no data to keep.

    Parameters
    ----------
    data_loc : str, optional
        Directory to scan. Defaults to
        ``Filtered Example Data/Perfect Threshold``.
    output_loc : str, optional
        Directory receiving accepted files; created when missing. Defaults to
        ``Filtered Example Data/Ready Data``.
    max_nan_percentage : float, optional
        Largest tolerated share of missing values in any one column, in
        percent. A column at exactly this value still passes. Default 50.

    NOTE(review): the default directories live under
    ``Filtered Example Data`` while the neighbouring pipeline stages read and
    write under ``Filtered Data`` - confirm which one is intended, or pass
    the directories explicitly.
    """
    if data_loc is None:
        data_loc = os.path.join('Filtered Example Data', 'Perfect Threshold')
    if output_loc is None:
        output_loc = os.path.join('Filtered Example Data', 'Ready Data')

    # os.replace needs the destination directory to exist
    os.makedirs(output_loc, exist_ok=True)

    for item in os.listdir(data_loc):
        orgi_loc = os.path.join(data_loc, item)
        new_loc = os.path.join(output_loc, item)

        df = pd.read_csv(orgi_loc)

        # without rows the missing-share is undefined and would compare as
        # "not above the threshold"
        if df.empty:
            continue

        # share of missing values per column, in percent
        nan_percentage = df.isna().mean() * 100
        if (nan_percentage > max_nan_percentage).any():
            continue

        os.replace(orgi_loc, new_loc)

## Data Aggregation (Temporal and Spatial)

In [None]:
def find_first_last_occurrence(lst):
    """Locate each run of items sharing the same two-character prefix.

    Parameters
    ----------
    lst : sequence of str
        Items already ordered so that equal prefixes are adjacent (e.g. a
        sorted file listing). If a prefix shows up in several separate runs,
        only its last run is reported.

    Returns
    -------
    dict
        ``{prefix: (start, end)}`` where ``lst[start:end]`` is exactly the
        run for that prefix. ``end`` is exclusive for every prefix, including
        the last one, so the pair can be used directly as slice bounds.
        An empty input yields an empty dict.
    """
    occurrences = {}
    current, first = None, None

    for idx, item in enumerate(lst):
        prefix = item[:2]
        if prefix != current:
            # a new run starts here, so the previous one ends just before idx
            if current is not None:
                occurrences[current] = (first, idx)
            current, first = prefix, idx

    # close the final run with the same exclusive-end convention
    if current is not None:
        occurrences[current] = (first, len(lst))

    return occurrences

def monthy_yearly_data():
    """Aggregate station files into per-country monthly and yearly averages.

    The files in ``Filtered Data/Ready Data`` are sorted by name and grouped
    by their two-character prefix using ``find_first_last_occurrence``; each
    group is taken as the slice ``[start:end]`` of the sorted listing.

    For every file the measurement columns that are present (``TMAX``,
    ``TMIN``, ``TAXN``, ``PRCP``, ``SNOW``, ``SNWD``) are averaged per
    (year, month) and per year, with the year and month derived from the
    ``DATE`` column. The per-file averages of one country are then averaged
    again, giving every file equal weight.

    Columns recorded in tenths of a unit (``TMAX``, ``TMIN``, ``TAXN``,
    ``PRCP``) are divided by 10, and all values are rounded to 4 decimals.

    Results are written to ``Filtered Data/Data Aggregation`` as
    ``monthly_avg_<code>.csv`` (index columns ``Year``, ``Month``) and
    ``yearly_avg_<code>.csv`` (index column ``Year``). The output directory
    is created when missing.

    Measurement columns are selected by name, so the result does not depend
    on where ``DATE``/``STATION`` sit in the input files, and a measurement
    that is absent for a country is simply left out of its output.
    """
    data_loc = os.path.join('Filtered Data', 'Ready Data')
    output_loc = os.path.join('Filtered Data', 'Data Aggregation')
    os.makedirs(output_loc, exist_ok=True)

    measurement_cols = ['TMAX', 'TMIN', 'TAXN', 'PRCP', 'SNOW', 'SNWD']
    # stored as tenths of a degree C / tenths of a mm
    tenths_cols = ['TMAX', 'TMIN', 'TAXN', 'PRCP']

    def to_reporting_units(frame):
        # convert tenths to whole units, then round every measurement
        frame = frame.copy()
        scaled = [c for c in tenths_cols if c in frame.columns]
        if scaled:
            frame[scaled] = frame[scaled] / 10
        return frame.round(4)

    lst_of_csv = sorted(os.listdir(data_loc))
    country_code_dic = find_first_last_occurrence(lst_of_csv)

    for ccode, (startIdx, endIdx) in country_code_dic.items():
        monthly_lst, yearly_lst = [], []

        for item in lst_of_csv[startIdx:endIdx]:
            df = pd.read_csv(os.path.join(data_loc, item))

            # grouping keys derived from the date column
            dates = pd.to_datetime(df['DATE'])
            year = dates.dt.year.rename('Year')
            month = dates.dt.month.rename('Month')

            # keep the file's own column order, measurements only
            values = df[[c for c in df.columns if c in measurement_cols]]

            monthly_lst.append(values.groupby([year, month]).mean())
            yearly_lst.append(values.groupby(year).mean())

        # average the per-file averages of this country
        country_monthly_avg = pd.concat(monthly_lst).groupby(level=['Year', 'Month']).mean()
        country_yearly_avg = pd.concat(yearly_lst).groupby(level='Year').mean()

        country_monthly_avg = to_reporting_units(country_monthly_avg)
        country_yearly_avg = to_reporting_units(country_yearly_avg)

        # save to csv file
        country_monthly_avg.to_csv(os.path.join(output_loc, f'monthly_avg_{ccode}.csv'))
        country_yearly_avg.to_csv(os.path.join(output_loc, f'yearly_avg_{ccode}.csv'))

# Data Exploration and Feature Engineering

### Trends
Are there discernible trends in temperature and precipitation over the given time period, globally and across different regions/countries?
Are there any extreme weather events that show a significant increase in frequency or intensity over time?

In [23]:
# path to data
data_loc = os.path.join('Filtered Data', 'Data Aggregation')

# country code -> country name lookup, built once instead of once per file
key_dic = code_keydic()

# yearly DataFrame of every country, keyed by country name
dfs = {}

# sorted so the legend order does not depend on the directory listing order
for file in sorted(os.listdir(data_loc)):
    if 'yearly' in file:
        # the file stem ends with the two-character country code
        country, ext = os.path.splitext(file)
        country_name = key_dic[country[-2:]]

        file_path = os.path.join(data_loc, file)
        dfs[country_name] = pd.read_csv(file_path)

# (CSV column, dropdown label / figure title, y-axis title) per selectable view
measurements = [
    ('TMAX', 'Maximum Temperature', 'Maximum Temperature in degree °C'),
    ('TMIN', 'Minimum Temperature', 'Minimum Temperature in degree °C'),
    ('PRCP', 'Precipitation', 'Precipitation in mm'),
    ('SNOW', 'Snowfall', 'Snowfall in mm'),
    ('SNWD', 'Snow Depth', 'Snow Depth in mm'),
]

# One line per country and measurement. Each line is plotted against the
# years present in that country's own file, so countries with shorter
# records are not shifted onto the wrong years. Only the first measurement
# starts visible, matching the initial y-axis title.
traces_by_column = {}
for position, (column, label, axis_title) in enumerate(measurements):
    traces_by_column[column] = [
        go.Scatter(x=df['Year'], y=df[column], mode='lines',
                   name=country_name, visible=(position == 0))
        for country_name, df in dfs.items()
    ]

tmax_traces = traces_by_column['TMAX']
tmin_traces = traces_by_column['TMIN']
precp_traces = traces_by_column['PRCP']
snow_traces = traces_by_column['SNOW']
snowD_traces = traces_by_column['SNWD']

# Every trace of the figure, grouped by measurement in dropdown order
initial_traces = tmax_traces + tmin_traces + precp_traces + snow_traces + snowD_traces

# One dropdown entry per measurement: show its block of traces, hide the rest
buttons = []
for position, (column, label, axis_title) in enumerate(measurements):
    visible = [shown == position
               for shown in range(len(measurements))
               for _ in dfs]
    buttons.append(dict(label=label,
                        method='update',
                        args=[{'visible': visible},
                              {'title': label,
                               'yaxis': {'title': axis_title}}]))

# Define the layout with dropdown menu
graph_layout = go.Layout(title='Temperature Country Line Graph',
                         xaxis=dict(title='Year'),
                         yaxis=dict(title='Maximum Temperature in degree °C'),
                         updatemenus=[
                             dict(
                                 buttons=buttons,
                                 direction='down',
                                 pad={'r': 10, 't': 10},
                                 showactive=True,
                                 x=0,
                                 xanchor='left',
                                 y=1.2,
                                 yanchor='top'
                             ),
                         ])

# Create the figure
fig = go.Figure(data=initial_traces, layout=graph_layout)

# Show the figure
fig.show()

### Regional Trend Comparison
Are there regions/countries that are experiencing more drastic changes compared to others?
Are there any discernible trends in temperature and precipitation that differ between developed and developing countries?

### Data Comparison
Can the data be compared with greenhouse gas emissions data to assess the relationship between emissions and climate variables?

# Model Development

If the current trends shown in the data continue without any significant change, where would we be in 10/50/100 years from now?

# Model Evaluation and Selection

# Conclusion