In [12]:
import requests
import pandas as pd
import time
import os
from config import token
import sqlite3
import warnings
import calendar

In [18]:
def fetch_save_data(url, headers, params, csv_filename):
    results = []
    
    while True:
        response = requests.get(url, headers=headers, params=params)
        
        if response.status_code == 200:
            data = response.json()
            df = pd.DataFrame(data['results'])
            results.append(df)
            
            if len(df) < params['limit']:
                break
            
            params['offset'] += params['limit']
        else:
            print("Error:", response.status_code)
            break
    
    df_name = pd.concat(results, ignore_index=True)
    response_count = df_name.shape[0]
    print(f"Response Count for{csv_filename}: {response_count}")
    
    df_name.to_csv(csv_filename, index=False)
    return df_name

# Define the API endpoint URL, headers, pagination parameters, and base CSV filename
api_url = "https://www.ncei.noaa.gov/cdo-web/api/v2/data"
api_headers = {"token": token}
limit = 1000
offset= 0
datasetid = 'GSOM'
datatypeids = ['TMAX','TAVG','TMIN'] #'TMAX','TAVG','TMIN' 'PRCP'
locationid = 'FIPS:US'  # US FIPS code, california is 06

# Define the start and end dates for the range of months you want to fetch
start_year = 2020
start_month =1
end_year = 2020
end_month = 12  # Adjust this based on how many months you want to fetch

# Specify the number of designated iterations (months)
iterations = end_month - start_month + 1

# Loop through the specified range of months
for _ in range(iterations):
    start_date = f"{start_year}-{start_month:02d}-01"
    end_day = calendar.monthrange(start_year, start_month)[1]
    end_date = f"{start_year}-{start_month:02d}-{end_day:02d}"
    
    api_params = {
        'offset': offset,
        'datasetid': datasetid,
        'startdate': start_date,
        'enddate': end_date,
        'locationid': locationid,
        'limit': limit,
        'datatypeid': ','.join(datatypeids)
    }
    
    output_csv_filename = f"Outputs/US_data/temp_{start_year}_{start_month:02d}.csv" #change between temp and prcp and others
    
    # Call the function with the defined values
    temp_df = fetch_save_data(api_url, api_headers, api_params, output_csv_filename)
    
    # Increment the start_month and start_year for the next iteration
    start_month += 1
    if start_month > 12:
        start_month = 1
        start_year += 1


Response Count forOutputs/US_data/temp_2020_01.csv: 10432
Response Count forOutputs/US_data/temp_2020_02.csv: 10468
Response Count forOutputs/US_data/temp_2020_03.csv: 10071
Response Count forOutputs/US_data/temp_2020_04.csv: 10110
Response Count forOutputs/US_data/temp_2020_05.csv: 10073
Response Count forOutputs/US_data/temp_2020_06.csv: 10079
Response Count forOutputs/US_data/temp_2020_07.csv: 10068
Response Count forOutputs/US_data/temp_2020_08.csv: 10046
Response Count forOutputs/US_data/temp_2020_09.csv: 10074
Error: 503
Response Count forOutputs/US_data/temp_2020_10.csv: 4000
Response Count forOutputs/US_data/temp_2020_11.csv: 9967
Response Count forOutputs/US_data/temp_2020_12.csv: 9980


In [3]:
# Combine all daily CSV files into a single DataFrame
combined_df = pd.concat([pd.read_csv(os.path.join("Outputs/US_data", file)) for file in os.listdir("Outputs/US_data")])

# Save the combined DataFrame as a CSV file
combined_csv_filename = 'Outputs/combined_data.csv'
combined_df.to_csv(combined_csv_filename, index=False)

combined_df.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:AQW00061705,",,,W",390.3
1,2018-01-01T00:00:00,PRCP,GHCND:CA001018611,",,,C",117.6
2,2018-01-01T00:00:00,PRCP,GHCND:CA001135126,",,,C",67.6
3,2018-01-01T00:00:00,PRCP,GHCND:CA005020881,",,,C",3.6
4,2018-01-01T00:00:00,PRCP,GHCND:CA006020559,",,,C",21.6


In [4]:
combined_df.shape

(617457, 5)

In [101]:
def fetch_save_data(url, headers, params, csv_filename):
    results = []
    
    while True:
        response = requests.get(url, headers=headers, params=params)
        
        if response.status_code == 200:
            data = response.json()
            df = pd.DataFrame(data['results'])
            results.append(df)
            
            if len(df) < params['limit']:
                break
            
            params['offset'] += params['limit']
        else:
            print("Error:", response.status_code)
            break
    
    df_name = pd.concat(results, ignore_index=True)
    response_count = df_name.shape[0]
    print(f"Response Count: {response_count}")
    
    df_name.to_csv(csv_filename, index=False)
    return df_name


In [None]:
# Define the API endpoint URL, headers, pagination parameters, and CSV filename
api_url = "https://www.ncei.noaa.gov/cdo-web/api/v2/stations"
api_headers = {"token": token}
api_params = {'offset': 0, 
              'limit': 1000,
              'locationid': 'FIPS:US' #CA code is 06
             }
output_csv_filename = 'Outputs/full_station_list.csv'

# Call the function with the defined values
stations_df = fetch_save_data(api_url, api_headers, api_params, output_csv_filename)

stations_df.tail()

In [None]:
# Define the API endpoint URL, headers, pagination parameters, and CSV filename
api_url = "https://www.ncei.noaa.gov/cdo-web/api/v2/data"
api_headers = {"token": token}

# Set up pagination parameters
offset = 0
limit = 1000
datasetid = 'GSOM'
datatypeids = ['TAVG','TMIN','TMAX','PRCP']
startdate = '2018-01-01'
enddate = '2018-01-31'
locationid = 'FIPS:US'  # California FIPS code

api_params = {
    'datasetid': datasetid,
    'startdate': startdate,
    'locationid': locationid,
    'enddate': enddate,
    'offset': offset,
    'limit': limit
}

# Combine datatypeids into a comma-separated string
api_params['datatypeid'] = ','.join(datatypeids)

output_csv_filename = 'f(Outputs/US_data/prcp_{year}_{month}.csv)

# Call the function with the defined values
temp_df = fetch_save_data(api_url, api_headers, api_params, output_csv_filename)

# Display the first few rows of the DataFrame
temp_df.tail(10)

In [27]:
# Read the data from your CSV or Excel file
file_path = 'Outputs/combined_data.csv'
data_df = pd.read_csv(file_path)

# Convert 'value' column to float, handling invalid entries as NaN
data_df['value'] = pd.to_numeric(data_df['value'], errors='coerce')

# Initialize empty lists to store the data
dates = []
stations = []
tmax_values = []
tmin_values = []
tavg_values = []
prcp_values = []

# Iterate through the rows of the original DataFrame
for index, row in data_df.iterrows():
    date = row['date']
    station = row['station']
    datatype = row['datatype']
    value = row['value']
    
    if datatype == 'TMAX':
        tmax_values.append(value)
        tmin_values.append(None)
        tavg_values.append(None)
        prcp_values.append(None)
    elif datatype == 'TMIN':
        tmax_values.append(None)
        tmin_values.append(value)
        tavg_values.append(None)
        prcp_values.append(None)
    elif datatype == 'TAVG':
        tmax_values.append(None)
        tmin_values.append(None)
        tavg_values.append(value)
        prcp_values.append(None)
    elif datatype == 'PRCP':
        tmax_values.append(None)
        tmin_values.append(None)
        tavg_values.append(None)
        prcp_values.append(value)
    
    dates.append(date)
    stations.append(station)

# Create a new DataFrame
new_data = {
    'date': dates,
    'station': stations,
    'TMAX': tmax_values,
    'TMIN': tmin_values,
    'TAVG': tavg_values,
    'PRCP': prcp_values
}

new_df = pd.DataFrame(new_data)

# Convert all columns except 'date' and 'station' to float
float_columns = new_df.columns.difference(['date', 'station'])
new_df[float_columns] = new_df[float_columns].astype(float)

# Group by date and station and keep non-null values
grouped_df = new_df.groupby(['date', 'station']).first().reset_index()

grouped_df = grouped_df.dropna(subset=['TMAX', 'TMIN', 'TAVG','PRCP'])

# Write the grouped DataFrame to a CSV file
csv_filename = 'Outputs/grouped_df.csv'
grouped_df.to_csv(csv_filename, index=False)

# Print the first few rows of the grouped DataFrame
grouped_df.head(10)

Unnamed: 0,date,station,TMAX,TMIN,TAVG,PRCP
0,2018-01-01T00:00:00,GHCND:AQW00061705,30.58,25.12,27.85,390.3
1,2018-01-01T00:00:00,GHCND:CA001018611,7.98,4.63,6.3,117.6
2,2018-01-01T00:00:00,GHCND:CA001135126,1.1,-5.44,-2.17,67.6
3,2018-01-01T00:00:00,GHCND:CA005020881,-8.93,-17.87,-13.4,3.6
4,2018-01-01T00:00:00,GHCND:CA006020559,-8.5,-19.77,-14.14,21.6
5,2018-01-01T00:00:00,GHCND:CQC00914080,27.69,23.23,25.46,273.8
6,2018-01-01T00:00:00,GHCND:CQC00914801,28.95,24.59,26.77,57.0
7,2018-01-01T00:00:00,GHCND:CQC00914855,30.82,24.71,27.77,59.1
9,2018-01-01T00:00:00,GHCND:GQW00041415,31.1,24.99,28.05,23.8
25,2018-01-01T00:00:00,GHCND:RQC00662801,29.05,20.31,24.68,108.1


In [28]:
grouped_df.describe()


Unnamed: 0,TMAX,TMIN,TAVG,PRCP
count,82189.0,82189.0,82189.0,82189.0
mean,18.255401,5.625559,11.940702,84.738435
std,10.941046,10.380864,10.53705,73.186178
min,-17.21,-28.71,-22.79,0.0
25%,9.69,-2.41,3.6,28.0
50%,19.36,5.77,12.63,69.2
75%,27.67,14.41,21.04,122.1
max,49.19,35.39,42.29,950.1


In [None]:
grouped_df = grouped_df.dropna(subset=['TMAX', 'TMIN', 'TAVG','PRCP'])

# Write the grouped DataFrame to a CSV file
csv_filename = 'Outputs/grouped_df_detailed.csv'
grouped_df.to_csv(csv_filename, index=False)

grouped_df.head()

In [None]:
# Folder containing the CSV files
folder = 'Outputs/US_data'

# List of CSV filenames
filenames = ['ca_station_temps_2018_2019_1.csv','ca_station_temps_2018_2019.csv']

# Initialize an empty DataFrame to store the combined data
combined_df = pd.DataFrame()

# Loop through the filenames and read each CSV into a DataFrame
for filename in filenames:
    file_path = os.path.join(folder, filename)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    else:
        print(f"File not found: {file_path}")

# Remove duplicates from the combined DataFrame
combined_df = combined_df.drop_duplicates()

# Save the combined and deduplicated DataFrame to a new CSV file
output_filename = os.path.join('Outputs/combined_temps.csv')
combined_df.to_csv(output_filename, index=False)

print(f"Combined CSVs and removed duplicates. Saved as '{output_filename}'.")

In [None]:
# Read the data from your CSV or Excel file
file_path = 'Outputs/combined_temps.csv'
data_df = pd.read_csv(file_path)

# Convert 'value' column to float, handling invalid entries as NaN
data_df['value'] = pd.to_numeric(data_df['value'], errors='coerce')

# Initialize empty lists to store the data
dates = []
stations = []
tmax_values = []
tmin_values = []
tavg_values = []
prcp_values = []

# Iterate through the rows of the original DataFrame
for index, row in data_df.iterrows():
    date = row['date']
    station = row['station']
    datatype = row['datatype']
    value = row['value']
    
    if datatype == 'TMAX':
        tmax_values.append(value)
        tmin_values.append(None)
        tavg_values.append(None)
        prcp_values.append(None)
    elif datatype == 'TMIN':
        tmax_values.append(None)
        tmin_values.append(value)
        tavg_values.append(None)
        prcp_values.append(None)
    elif datatype == 'TAVG':
        tmax_values.append(None)
        tmin_values.append(None)
        tavg_values.append(value)
        prcp_values.append(None)
    elif datatype == 'PRCP':
        tmax_values.append(None)
        tmin_values.append(None)
        tavg_values.append(None)
        prcp_values.append(value)
    
    dates.append(date)
    stations.append(station)

# Create a new DataFrame
new_data = {
    'date': dates,
    'station': stations,
    'TMAX': tmax_values,
    'TMIN': tmin_values,
    'TAVG': tavg_values,
    'PRCP': prcp_values
}

new_df = pd.DataFrame(new_data)

# Convert all columns except 'date' and 'station' to float
float_columns = new_df.columns.difference(['date', 'station'])
new_df[float_columns] = new_df[float_columns].astype(float)

# Group by date and station and keep non-null values
grouped_df = new_df.groupby(['date', 'station']).first().reset_index()

# Drop rows with NaN values in 'TMAX', 'TMIN', and 'TAVG' columns
subset_columns = ['TMAX', 'TMIN', 'TAVG']
grouped_df = grouped_df.dropna(subset=subset_columns, how='all')

# Write the grouped DataFrame to a CSV file
csv_filename = 'Outputs/grouped_df.csv'
grouped_df.to_csv(csv_filename, index=False)

# Print the first few rows of the grouped DataFrame
grouped_df.head()

In [6]:
# Load the stations.csv file into stations_df
stations_df = pd.read_csv('Outputs/full_station_list.csv')
grouped_df= pd.read_csv('Outputs/grouped_df.csv')

# Merge the two DataFrames based on 'station' using a left join
grouped_df_detailed = pd.merge(grouped_df, stations_df, 
                               left_on='station', right_on='id', how='left')

# Drop the redundant columns (from stations_df)
grouped_df_detailed.drop(columns=['id'], inplace=True)

grouped_df_detailed=grouped_df_detailed[['station','name','latitude','longitude','elevation','date','maxdate','mindate','TAVG','TMAX','TMIN','PRCP']]

# Write the grouped DataFrame to a CSV file
csv_filename = 'Outputs/grouped_df_detailed.csv'
grouped_df_detailed.to_csv(csv_filename, index=False)

grouped_df_detailed.head()

Unnamed: 0,station,name,latitude,longitude,elevation,date,maxdate,mindate,TAVG,TMAX,TMIN,PRCP
0,GHCND:AQW00061705,"PAGO PAGO WEATHER SERVICE OFFICE AIRPORT, US",-14.33056,-170.71361,3.7,2018-01-01T00:00:00,2023-07-31,1945-08-01,27.85,30.58,25.12,390.3
1,GHCND:CA001018611,"VICTORIA GONZALES CS, WA US",48.0333,-123.3333,70.0,2018-01-01T00:00:00,2023-07-31,1973-01-01,6.3,7.98,4.63,117.6
2,GHCND:CA001135126,"MIDWAY, WA US",49.0,-118.7667,580.0,2018-01-01T00:00:00,2023-06-11,1987-06-01,-2.17,1.1,-5.44,67.6
3,GHCND:CA005020881,"EMERSON AUTO, ND US",49.0,-97.2333,242.0,2018-01-01T00:00:00,2023-07-31,2009-07-01,-13.4,-8.93,-17.87,3.6
4,GHCND:CA006020559,"BARWICK, MN US",48.6333,-93.9667,335.0,2018-01-01T00:00:00,2023-07-31,1978-12-01,-14.14,-8.5,-19.77,21.6


In [3]:
wildfire=pd.read_csv('Resources/data.csv')

In [11]:
wildfire.columns

Index(['OBJECTID', 'Shape', 'FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE',
       'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID',
       'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT',
       'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID',
       'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME',
       'ICS_209_PLUS_INCIDENT_JOIN_ID', 'ICS_209_PLUS_COMPLEX_JOIN_ID',
       'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME', 'FIRE_YEAR',
       'DISCOVERY_DATE', 'DISCOVERY_DOY', 'DISCOVERY_TIME',
       'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE',
       'NWCG_CAUSE_AGE_CATEGORY', 'CONT_DATE', 'CONT_DOY', 'CONT_TIME',
       'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LATITUDE', 'LONGITUDE', 'OWNER_DESCR',
       'STATE', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME'],
      dtype='object')

In [4]:
wildfires=wildfire[['LATITUDE', 'LONGITUDE', 'FIRE_SIZE','NWCG_REPORTING_UNIT_NAME', 'FIRE_SIZE_CLASS', 'FIRE_YEAR',
       'FPA_ID', 'FIRE_CODE', 'NWCG_CAUSE_CLASSIFICATION',
       'NWCG_GENERAL_CAUSE', 'FIRE_NAME', 'DISCOVERY_DATE', 'CONT_DATE',
       'DISCOVERY_TIME', 'CONT_TIME', 'STATE', 'COUNTY', 'FIPS_CODE'
       ]]
wildfires=wildfires[
    (wildfires['FIRE_YEAR'].isin([2018,2019]))
]
wildfires.head()

Unnamed: 0,LATITUDE,LONGITUDE,FIRE_SIZE,NWCG_REPORTING_UNIT_NAME,FIRE_SIZE_CLASS,FIRE_YEAR,FPA_ID,FIRE_CODE,NWCG_CAUSE_CLASSIFICATION,NWCG_GENERAL_CAUSE,FIRE_NAME,DISCOVERY_DATE,CONT_DATE,DISCOVERY_TIME,CONT_TIME,STATE,COUNTY,FIPS_CODE
2045714,46.275833,-114.379167,0.1,Bitterroot National Forest,A,2018,FS-6911076,EKS4,Natural,Natural,BLODGETT,8/22/2018,8/22/2018,1625.0,1740.0,MT,81,30081.0
2045715,46.404167,-113.921944,0.1,Bitterroot National Forest,A,2018,FS-6908885,L1RX,Human,Equipment and vehicle use,CORLEY GULCH,7/26/2018,7/28/2018,1225.0,1653.0,MT,81,30081.0
2045716,46.245833,-114.308889,1.0,Bitterroot National Forest,B,2018,FS-6898061,L49X,Human,Recreation and ceremony,CANYON CREEK,9/21/2018,9/23/2018,1305.0,1241.0,MT,81,30081.0
2045717,45.784722,-114.033056,0.1,Bitterroot National Forest,A,2018,FS-6890683,EKS4,Natural,Natural,MAYNARD CREEK,8/17/2018,,1723.0,,MT,81,30081.0
2045718,45.986944,-113.807222,0.1,Bitterroot National Forest,A,2018,FS-6888073,EKS4,Natural,Natural,BLUE,8/12/2018,8/12/2018,1031.0,1334.0,MT,81,30081.0


In [13]:
wildfires.shape

(144417, 18)

In [30]:
#Takes 4-10 minutes to run 1 year, 25-30  minutes for 2 years
warnings.filterwarnings("ignore")

wildfires['DISCOVERY_DATE'] = pd.to_datetime(wildfires['DISCOVERY_DATE'])
grouped_df_detailed['date'] = pd.to_datetime(grouped_df_detailed['date'])

def find_nearest_match(row, df, date_col, lat_col, lon_col):
    date_diff = abs((df[date_col] - row['DISCOVERY_DATE']).dt.total_seconds())
    lat_diff = abs(df[lat_col] - row['LATITUDE'])
    lon_diff = abs(df[lon_col] - row['LONGITUDE'])
    total_diff = date_diff + lat_diff + lon_diff
    nearest_idx = total_diff.idxmin()
    return df.loc[nearest_idx]

# Create an empty list to hold the merged rows
merged_rows = []

# Iterate over each row in the filtered wildfires DataFrame
for idx, row in wildfires.iterrows():
    nearest_row = find_nearest_match(row, grouped_df_detailed, 'date', 'latitude', 'longitude')
    merged_row = pd.concat([row, nearest_row])
    merged_rows.append(merged_row)

# Concatenate the list of merged rows into a DataFrame
merged_results = pd.concat(merged_rows, axis=1).T

merged_results=merged_results[['LATITUDE', 'LONGITUDE','COUNTY', 'FIPS_CODE','FIRE_SIZE', 'FIRE_SIZE_CLASS',
       'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE', 'FIRE_NAME',
       'DISCOVERY_DATE', 'CONT_DATE', 'DISCOVERY_TIME', 'CONT_TIME', 'STATE',
       'station', 'name', 'latitude', 'longitude', 'elevation', 'date', 'TAVG',
       'TMAX', 'TMIN','PRCP']]

csv_filename = 'Outputs/merged_results.csv'
merged_results.to_csv(csv_filename, index=False)

merged_results.head(10)

Unnamed: 0,LATITUDE,LONGITUDE,COUNTY,FIPS_CODE,FIRE_SIZE,FIRE_SIZE_CLASS,NWCG_CAUSE_CLASSIFICATION,NWCG_GENERAL_CAUSE,FIRE_NAME,DISCOVERY_DATE,...,station,name,latitude,longitude,elevation,date,TAVG,TMAX,TMIN,PRCP
0,46.275833,-114.379167,81,30081.0,0.1,A,Natural,Natural,BLODGETT,2018-08-22,...,GHCND:USC00243885,"HAMILTON, MT US",46.24622,-114.16794,1092.7,2018-09-01,12.89,21.95,3.82,3.5
1,46.404167,-113.921944,81,30081.0,0.1,A,Human,Equipment and vehicle use,CORLEY GULCH,2018-07-26,...,GHCND:USC00247894,"STEVENSVILLE, MT US",46.5137,-114.091,1028.7,2018-08-01,18.29,28.05,8.53,8.6
2,46.245833,-114.308889,81,30081.0,1.0,B,Human,Recreation and ceremony,CANYON CREEK,2018-09-21,...,GHCND:USC00243885,"HAMILTON, MT US",46.24622,-114.16794,1092.7,2018-10-01,6.96,13.28,0.65,33.7
3,45.784722,-114.033056,81,30081.0,0.1,A,Natural,Natural,MAYNARD CREEK,2018-08-17,...,GHCND:USC00242221,"DARBY, MT US",46.0263,-114.1763,1182.6,2018-09-01,11.77,20.79,2.74,9.1
4,45.986944,-113.807222,81,30081.0,0.1,A,Natural,Natural,BLUE,2018-08-12,...,GHCND:USC00247967,"SULA 14 NE, MT US",45.911,-113.7394,1571.2,2018-08-01,15.1,26.09,4.12,19.4
5,46.023056,-113.799722,81,30081.0,0.1,A,Natural,Natural,POLLYWOG,2018-08-11,...,GHCND:USC00247967,"SULA 14 NE, MT US",45.911,-113.7394,1571.2,2018-08-01,15.1,26.09,4.12,19.4
6,45.913611,-114.6675,49,16049.0,0.1,A,Natural,Natural,CEDAR,2018-07-25,...,GHCND:USC00108246,"SELWAY LODGE, ID US",46.0081,-114.8442,786.4,2018-08-01,19.38,31.89,6.88,21.1
7,46.121111,-114.239167,81,30081.0,0.1,A,Natural,Natural,DOUBLE STRIKE,2018-06-08,...,GHCND:USC00242221,"DARBY, MT US",46.0263,-114.1763,1182.6,2018-06-01,14.62,21.15,8.09,102.7
8,45.868333,-113.804167,81,30081.0,0.25,A,Natural,Natural,MEADOW,2018-08-17,...,GHCND:USC00247967,"SULA 14 NE, MT US",45.911,-113.7394,1571.2,2018-09-01,9.69,20.79,-1.42,11.2
9,45.914444,-114.635278,49,16049.0,0.1,A,Natural,Natural,MT GEORGE 2,2018-07-24,...,GHCND:USC00108246,"SELWAY LODGE, ID US",46.0081,-114.8442,786.4,2018-08-01,19.38,31.89,6.88,21.1


In [6]:
merged_results['name'].value_counts()




name
LITCHFIELD PARK, AZ US           1692
VICTORIA GONZALES CS, WA US      1480
EAST MESA, AZ US                 1258
TEMPE ASU, AZ US                 1071
TOHONO CHUL, AZ US               1009
                                 ... 
HACHITA 1 W, NM US                  1
VAN HORN, TX US                     1
FITTSTOWN 6 SW MESONET, OK US       1
HEALDTON 3 E, OK US                 1
SAINT FRANCIS, KS US                1
Name: count, Length: 3526, dtype: int64

In [2]:
# Suppress warnings
warnings.filterwarnings("ignore")

# Read the CSV file
merged_results = pd.read_csv('Outputs/merged_results.csv')

# Select specific columns from the DataFrame
us_data_2018_test = merged_results[['FIRE_NAME', 'STATE', 'FIPS_CODE', 'LATITUDE', 'LONGITUDE', 'DISCOVERY_DATE', 'CONT_DATE', 'name', 'latitude', 'longitude', 'date',
                                    'NWCG_CAUSE_CLASSIFICATION', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
                                    'elevation', 'TAVG', 'TMAX', 'TMIN', 'PRCP']]

# Rename the columns
us_data_2018_test.columns = ['FIRE_NAME', 'STATE', 'FIPS_CODE', 'FIRE_LATITUDE', 'FIRE_LONGITUDE', 'FIRE_DATE', 'CONTAIN_DATE', 'CLOSEST_STATION', 'STATION_LAT', 'STATION_LON', 'READINGS_DATE',
                             'CAUSE_CLASSIFICATION', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
                             'ELEVATION', 'TAVG', 'TMAX', 'TMIN', 'PRCP']

# Convert 'CONTAIN_DATE' and 'FIRE_DATE' columns to datetime
us_data_2018_test['CONTAIN_DATE'] = pd.to_datetime(us_data_2018_test['CONTAIN_DATE'])
us_data_2018_test['FIRE_DATE'] = pd.to_datetime(us_data_2018_test['FIRE_DATE'])

# Calculate the difference between 'CONTAIN_DATE' and 'FIRE_DATE'
us_data_2018_test['DAYS_TO_CONTAIN'] = (us_data_2018_test['CONTAIN_DATE'] - us_data_2018_test['FIRE_DATE']).dt.days

# Replace any NaN values in 'DAYS_TO_CONTAIN' with 1
us_data_2018_test['DAYS_TO_CONTAIN'].fillna(0, inplace=True)

# Define float and int columns
float_columns = {
    'FIRE_LATITUDE': float,
    'FIRE_LONGITUDE': float,
    'STATION_LAT': float,
    'STATION_LON': float,
    'FIRE_SIZE': float,
    'ELEVATION': float,
    'TAVG': float,
    'TMAX': float,
    'TMIN': float,
    'PRCP': float
}

int_columns = {
    'DAYS_TO_CONTAIN': int,
}

# Convert columns to the specified data types
us_data_2018_test = us_data_2018_test.astype({**float_columns, **int_columns})

# Save the DataFrame to a CSV file
csv_filename = 'Outputs/us_data_2018_2019.csv'
us_data_2018_test.to_csv(csv_filename, index=False)

# Display the last 20 rows of the DataFrame
us_data_2018_test.tail(10)


Unnamed: 0,FIRE_NAME,STATE,FIPS_CODE,FIRE_LATITUDE,FIRE_LONGITUDE,FIRE_DATE,CONTAIN_DATE,CLOSEST_STATION,STATION_LAT,STATION_LON,READINGS_DATE,CAUSE_CLASSIFICATION,FIRE_SIZE,FIRE_SIZE_CLASS,ELEVATION,TAVG,TMAX,TMIN,PRCP,DAYS_TO_CONTAIN
144407,BARREN HILL,ID,16049.0,46.236944,-114.9828,2019-07-01,NaT,"SELWAY LODGE, ID US",46.0081,-114.8442,2019-07-01,Natural,1592.0,F,786.4,19.28,31.05,7.51,23.4,0
144408,SAN RAFAEL,AZ,4023.0,31.42304,-110.571,2019-07-01,NaT,"PATAGONIA PATON CENTER, AZ US",31.53923,-110.76028,2019-07-01,Natural,438.0,E,1232.6,25.41,35.49,15.34,65.8,0
144409,ROCK,CA,6099.0,37.472222,-121.249444,2019-06-25,NaT,"TRACY CARBONA, CA US",37.6819,-121.3466,2019-07-01,Missing data/not specified/undetermined,2422.0,F,41.1,24.63,34.24,15.02,0.0,0
144410,CONNEX WF,FL,12091.0,30.523333,-86.781667,2019-10-07,NaT,"NICEVILLE, FL US",30.5316,-86.4928,2019-10-01,Human,970.0,E,22.6,21.55,28.44,14.66,146.7,0
144411,BEAVER POND,MS,28153.0,31.49333,-88.74028,2019-06-13,NaT,"WAYNESBORO 2 W, MS US",31.6773,-88.6709,2019-06-01,Human,168.0,D,61.0,26.21,32.35,20.06,91.8,0
144412,BEN HOWARD HOLLOW,KY,21013.0,36.850278,-83.506667,2019-09-21,NaT,"HARLAN 3 S, KY US",36.8058,-83.3441,2019-10-01,Human,272.0,D,378.0,15.24,22.17,8.31,104.5,0
144413,2019-3354,KS,20139.0,36.71164,-96.74075,2019-11-25,NaT,"RALSTON, OK US",36.5044,-96.7438,2019-12-01,Human,1000.0,F,251.5,5.92,12.9,-1.06,10.2,0
144414,WALKER,CA,6063.0,40.05325,-120.6689,2019-09-04,NaT,"SUSANVILLE 2 SW, CA US",40.4167,-120.6631,2019-09-01,Missing data/not specified/undetermined,54608.0,G,1283.8,14.81,23.62,5.99,35.6,0
144415,OK 745,AL,1007.0,32.99723,-87.30439,2019-09-18,NaT,"BANKHEAD LOCK AND DAM, AL US",33.4527,-87.3572,2019-10-01,Missing data/not specified/undetermined,413.0,E,85.3,19.26,25.34,13.17,174.7,0
144416,204 COW,OR,41001.0,44.28505,-118.4598,2019-08-09,NaT,"JOHN DAY, OR US",44.4233,-118.9594,2019-08-01,Natural,9668.0,G,933.6,19.26,29.83,8.69,8.7,0


In [None]:
warnings.filterwarnings("ignore")

us_data_2018 = merged_results[['FIRE_NAME', 'STATE', 'FIPS_CODE', 'LATITUDE', 'LONGITUDE', 'DISCOVERY_DATE', 'CONT_DATE', 'name', 'latitude', 'longitude', 'date',
                                    'NWCG_CAUSE_CLASSIFICATION', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
                                    'elevation', 'TAVG', 'TMAX', 'TMIN']]

# Rename the columns
us_data_2018.columns = ['FIRE_NAME', 'STATE', 'FIPS_CODE', 'FIRE_LATITUDE', 'FIRE_LONGITUDE', 'FIRE_DATE', 'CONTAIN_DATE', 'CLOSEST_STATION', 'STATION_LAT', 'STATION_LON', 'READINGS_DATE',
                               'CAUSE_CLASSIFICATION', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
                             'ELEVATION', 'TAVG', 'TMAX', 'TMIN']


# Calculate the difference between 'CONTAIN_DATE' and 'FIRE_DATE'
us_data_2018['CONTAIN_DATE'] = pd.to_datetime(us_data_2018['CONTAIN_DATE'])
us_data_2018['DAYS_TO_CONTAIN'] = (us_data_2018['CONTAIN_DATE'] - us_data_2018['FIRE_DATE']).dt.days

# Replace any NaN values in 'DAYS_TO_CONTAIN' with 1
us_data_2018['DAYS_TO_CONTAIN'].fillna(0, inplace=True)

float_columns = {
    'FIRE_LATITUDE': float,
    'FIRE_LONGITUDE': float,
    'STATION_LAT': float,
    'STATION_LON': float,
    'FIRE_SIZE': float,
    'ELEVATION': float,
    'TAVG': float,
    'TMAX': float,
    'TMIN': float
}

int_columns = {
    'DAYS_TO_CONTAIN': int,
}

us_data_2018 = us_data_2018.astype({**float_columns, **int_columns})

csv_filename = 'Outputs/us_data_2018_2019.csv'
us_data_2018.to_csv(csv_filename, index=False)

us_data_2018.head(20)

In [17]:
# Ignore warnings
warnings.filterwarnings("ignore")

# Read your data
wildfires = pd.read_csv('Resources/data.csv')
grouped_df_detailed = pd.read_csv('Outputs/grouped_df_detailed.csv')

# Convert date columns to datetime
wildfires['DISCOVERY_DATE'] = pd.to_datetime(wildfires['DISCOVERY_DATE'])
grouped_df_detailed['date'] = pd.to_datetime(grouped_df_detailed['date'])

# Define the find_nearest_match function
def find_nearest_match(row, df, date_col, lat_col, lon_col):
    date_diff = abs((df[date_col] - row['DISCOVERY_DATE']).dt.total_seconds())
    lat_diff = abs(df[lat_col] - row['LATITUDE'])
    lon_diff = abs(df[lon_col] - row['LONGITUDE'])
    total_diff = date_diff + lat_diff + lon_diff
    nearest_idx = total_diff.idxmin()
    return df.loc[nearest_idx]

# Create a list to hold the merged rows
merged_rows = []

# Iterate over each row in wildfires
for idx, row in wildfires.iterrows():
    nearest_row = find_nearest_match(row, grouped_df_detailed, 'date', 'latitude', 'longitude')
    
    # Add a 'fire?' column to the nearest_row DataFrame and set it to 1
    nearest_row['fire?'] = 1
    
    # Combine the original wildfire row with the matched nearest_row
    merged_row = pd.concat([row, nearest_row])
    
    # Append the merged row to merged_rows list
    merged_rows.append(merged_row)

# Create a temporary DataFrame to hold unmatched rows
unmatched_rows = []

# Create the merged_results_test DataFrame from the list of merged rows
merged_results_test = pd.concat(merged_rows, axis=1).T

# Iterate over each row in grouped_df_detailed
for idx, row in grouped_df_detailed.iterrows():
    if idx not in merged_results_test.index:
        # Add a 'fire?' column to the row and set it to 0
        row['fire?'] = 0
        unmatched_rows.append(row)

# Concatenate unmatched_rows with merged_rows
merged_rows += unmatched_rows

# Reorder columns
merged_results_test = merged_results_test[['LATITUDE', 'LONGITUDE','COUNTY', 'FIPS_CODE','FIRE_SIZE', 'FIRE_SIZE_CLASS',
       'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE', 'FIRE_NAME',
       'DISCOVERY_DATE', 'CONT_DATE', 'DISCOVERY_TIME', 'CONT_TIME', 'STATE',
       'station', 'name', 'latitude', 'longitude', 'elevation', 'date', 'TAVG',
       'TMAX', 'TMIN','PRCP', 'fire?']]

# Save to CSV
csv_filename = 'Outputs/merged_results_test.csv'
merged_results_test.to_csv(csv_filename, index=False)


KeyboardInterrupt: 

In [8]:
merged_results_test.head()

Unnamed: 0,LATITUDE,LONGITUDE,COUNTY,FIPS_CODE,FIRE_SIZE,FIRE_SIZE_CLASS,NWCG_CAUSE_CLASSIFICATION,NWCG_GENERAL_CAUSE,FIRE_NAME,DISCOVERY_DATE,...,station,name,latitude,longitude,elevation,date,TAVG,TMAX,TMIN,PRCP
0,46.275833,-114.379167,81,30081.0,0.1,A,Natural,Natural,BLODGETT,2018-08-22,...,GHCND:USC00243885,"HAMILTON, MT US",46.24622,-114.16794,1092.7,2018-09-01,12.89,21.95,3.82,3.5
1,46.404167,-113.921944,81,30081.0,0.1,A,Human,Equipment and vehicle use,CORLEY GULCH,2018-07-26,...,GHCND:USC00247894,"STEVENSVILLE, MT US",46.5137,-114.091,1028.7,2018-08-01,18.29,28.05,8.53,8.6
2,46.245833,-114.308889,81,30081.0,1.0,B,Human,Recreation and ceremony,CANYON CREEK,2018-09-21,...,GHCND:USC00243885,"HAMILTON, MT US",46.24622,-114.16794,1092.7,2018-10-01,6.96,13.28,0.65,33.7
3,45.784722,-114.033056,81,30081.0,0.1,A,Natural,Natural,MAYNARD CREEK,2018-08-17,...,GHCND:USC00242221,"DARBY, MT US",46.0263,-114.1763,1182.6,2018-09-01,11.77,20.79,2.74,9.1
4,45.986944,-113.807222,81,30081.0,0.1,A,Natural,Natural,BLUE,2018-08-12,...,GHCND:USC00247967,"SULA 14 NE, MT US",45.911,-113.7394,1571.2,2018-08-01,15.1,26.09,4.12,19.4


In [10]:
merged_results_test.columns

Index(['LATITUDE', 'LONGITUDE', 'COUNTY', 'FIPS_CODE', 'FIRE_SIZE',
       'FIRE_SIZE_CLASS', 'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE',
       'FIRE_NAME', 'DISCOVERY_DATE', 'CONT_DATE', 'DISCOVERY_TIME',
       'CONT_TIME', 'STATE', 'station', 'name', 'latitude', 'longitude',
       'elevation', 'date', 'TAVG', 'TMAX', 'TMIN', 'PRCP'],
      dtype='object')

In [9]:
merged_results_test.shape

(144417, 24)