In [1]:
import pandas as pd
import math
import numpy as np

pd.options.mode.copy_on_write = True
pd.set_option("future.no_silent_downcasting", True)
#start and end years to analyze
year_start=2020
year_end=2025
#list of all years as strings
year_list=list(map(str,range(year_start,year_end+1)))

def avg_yearly_thunder(station,station_index):
    """Return yearly thunder counts for one station, followed by two averages.

    Reads the module-level names ``year_start``, ``year_end``,
    ``weather_df_dict`` (year -> DataFrame with 'NAME' and 'FRSHTT' columns)
    and ``total_stations_list`` (only used in the progress message).

    Args:
        station: value matched against the 'NAME' column of each year's data.
        station_index: zero-based position of the station; only used to print
            progress as ``station_index+1``.

    Returns:
        A list holding, in order:
        * one entry per year from ``year_start`` to ``year_end`` inclusive:
          the number of the station's rows flagged as thunder, or ``pd.NA``
          when the station has no rows in that year;
        * the plain mean of the yearly counts over the years that have rows;
        * the mean of the yearly counts weighted by the number of rows the
          station has in each year.
        Both averages are NaN when the station has no rows in any year.
    """
    thunder_list=[]
    #number of rows the station has in each year; doubles as the weight of that year
    days_reported=[]
    for year in range(year_start,year_end+1):
        print(f'\rCalculating thunder data for station# {station_index+1}/{total_stations_list} in {year}           ', end = "", flush = True)
        #all rows of this station in the current year
        year_data=weather_df_dict[year]
        station_data_in_year=year_data[year_data['NAME']==station]
        days_reported.append(len(station_data_in_year))
        #no rows at all: mark the year as missing rather than as zero thunder
        if station_data_in_year.empty:
            thunder_list.append(pd.NA)
            continue
        #a row counts as thunder when the tens digit of FRSHTT is odd
        #NOTE(review): presumably FRSHTT is a 0/1 flag per digit with thunder in the tens place - confirm against the data source
        thunder_flag=(station_data_in_year['FRSHTT']//10)%2==1
        thunder_list.append(int(thunder_flag.sum()))

    #only years with at least one row take part in the averages; missing years carry a
        #weight of 0, so leaving them out does not change the weighted result
    reported_years=[(count,days) for count,days in zip(thunder_list,days_reported) if days>0]
    if reported_years:
        average=sum(count for count,_ in reported_years)/len(reported_years)
        weighted_average=sum(count*days for count,days in reported_years)/sum(days for _,days in reported_years)
    else:
        #station never reported: avoid dividing by a total weight of zero
        average=weighted_average=float('nan')
    thunder_list.append(average)
    thunder_list.append(weighted_average)
    return thunder_list

#per-year weather data keyed by integer year; avg_yearly_thunder reads this dict as a global
weather_df_dict={}
year_list=list(map(str,range(year_start,year_end+1)))
#columns that avg_yearly_thunder fills in: one per year, then the two averages
result_columns=year_list+['AVERAGE']+['WEIGHTED_AVERAGE']
#station names seen so far, in load order, so the final table keeps first-seen ordering
station_names=[]
for year in range(year_start,year_end+1):
    print(f"\rLoading file: {year}_data.csv          ", end = "", flush = True)
    #read in weather data for the year and drop every row that has a missing value in any column
    weather_df_dict[year]=pd.read_csv(f'Weather_data/{year}_data.csv').dropna(ignore_index=True)
    #remember the unique station names of this year
    station_names.extend(weather_df_dict[year]['NAME'].drop_duplicates(keep='first'))
print('Complete')

#station dataframe has 1 row per unique station: a column for the station name, a column for each
    #year's thunder count, and the two averages across all years (empty until computed below).
    #dict.fromkeys drops stations repeated across years while keeping the first occurrence's position
stations_lightning_info=pd.DataFrame({'NAME':list(dict.fromkeys(station_names))},columns=['NAME']+result_columns)
#module-level global used by avg_yearly_thunder for its progress message
total_stations_list=len(stations_lightning_info)
#calculate yearly thunder instances for each station plus both averages; x.name is the row's
    #position in the table, and result_type='expand' turns each returned list into one column per value
stations_lightning_info[result_columns]=stations_lightning_info.apply(lambda x:
                                                                      avg_yearly_thunder(x['NAME'],x.name),axis=1,result_type='expand')
print('Complete')
#save data to a csv
output_file=f'Thunder_data_{year_start}_{year_end}.csv'
stations_lightning_info.to_csv(output_file,index=False)
print(f'{output_file} save complete.')

Loading file: 2025_data.csv          Complete
Calculating thunder data for station# 13514/13514 in 2025           Complete
Thunder_data_2020_2025.csv save complete.
