In [2]:
import gdown
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from google.colab import drive
from google.colab import files
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.preprocessing import StandardScaler



In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Loading in data from file uploaded to google drive,

weather_data_file_id = "1-mUFVBKl69Gex8VkqdCVeNF-AmwlQX-k"
url = f'https://drive.google.com/uc?id={weather_data_file_id}'

output = 'merged_weather_data.csv'
gdown.download(url, output, quiet=False)

weather_data_df = pd.read_csv(output)


Downloading...
From: https://drive.google.com/uc?id=1-mUFVBKl69Gex8VkqdCVeNF-AmwlQX-k
To: /content/merged_weather_data.csv
100%|██████████| 67.0M/67.0M [00:00<00:00, 121MB/s]
  weather_data_df = pd.read_csv(output)


In [5]:
# This code processes the 'WND' column to extract valid wind speed data.
# It first filters rows where 'WND' has exactly 5 comma-separated parts and a numeric wind speed (2nd element).
# Then, it extracts the wind speed (as an integer) from those valid rows and assigns it to a new column.

valid_wnd = weather_data_df['WND'].str.split(',')
valid_rows_mask = valid_wnd.apply(lambda x: isinstance(x, list) and len(x) == 5 and x[1].isdigit())

weather_data_df_valid = weather_data_df[valid_rows_mask].copy()

weather_data_df_valid['wind_speed'] = weather_data_df_valid['WND'].str.split(',').str[1].astype(int)

weather_data_df['wind_speed'] = weather_data_df_valid['wind_speed']


In [6]:
# This code defines a function to extract and clean temperature data from the 'TMP' column.
# It splits each value at the comma, converts the first part to an integer, and checks for the placeholder value 9999.
# If valid, it converts the temperature to Celsius by dividing by 10; otherwise, it returns None.
# The function is applied to create a new 'temperature_c' column with cleaned temperature values.

def extract_temp(value):
    try:
        temp_str = value.split(',')[0]
        temp = int(temp_str)
        return None if temp == 9999 else temp / 10
    except:
        return None

weather_data_df['temperature_c'] = weather_data_df['TMP'].apply(extract_temp)




In [7]:
def extract_coverage(df, ga1_col='GA1', new_col='sky_coverage'):
    """
    Extracts sky coverage code from GA1 column and stores it in a new column.

    Parameters:
    - df: DataFrame containing the GA1 column
    - ga1_col: column name with GA1 values
    - new_col: name of the new column to store the coverage codes

    Returns:
    - DataFrame with an added column for sky coverage
    """
    def get_coverage(code):
        if isinstance(code, str):
            return code.split(',')[0] if ',' in code else None
        return None

    df[new_col] = df[ga1_col].apply(get_coverage)
    return df

weather_data_df = extract_coverage(weather_data_df)

In [8]:
def extract_precipitation_amount(df, aa1_col='AA1', new_col='precipitation_mm'):
    """
    Extracts the precipitation amount in millimeters from the AA1 code.

    Parameters:
    - df: DataFrame containing the AA1 column
    - aa1_col: column name containing AA1 code strings
    - new_col: name of the output column for precipitation amounts in mm

    Returns:
    - Modified DataFrame with an additional column for precipitation amounts in mm
    """
    def extract_amount(code):
        if isinstance(code, str):
            try:
                # The second field is the precipitation amount in millimeters
                precip_mm = int(code.split(',')[1])
                return precip_mm
            except (IndexError, ValueError):
                return float('nan')
        return float('nan')

    df[new_col] = df[aa1_col].apply(extract_amount)
    return df


weather_data_df = extract_precipitation_amount(weather_data_df)


In [16]:
irrelevant_cols = [
    'STATION', 'SOURCE', 'REPORT_TYPE', 'CALL_SIGN',
    'QUALITY_CONTROL', 'source_file',
      'RH1', 'DEW', 'LONGITUDE', 'LATITUDE',
    'ELEVATION',  'WND', 'TMP', 'weather_code', 'AA1', 'GA1', 'weather_code'
]

weather_df_clean = weather_data_df.drop(columns=irrelevant_cols)
