# Team Raj Datathon 2025

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

In [13]:
# Load the raw weekly visitation table and the raw climate table
# from the local data/ folder.
visitation_df = pd.read_csv('data/visitation_data.csv')
climate_df = pd.read_csv('data/climate_data.csv')

In [None]:

# cleaning visitors data
DATA_DIR = Path("data")
# DATA_DIR already is the "data" folder, so only the file name is appended
# (appending "data/visitation_data.csv" would point at data/data/...).
VIS_CSV = DATA_DIR / "visitation_data.csv"
OUT_TIDY = DATA_DIR / "visitation_tidy_imputed.csv"

# ---------- 1) load ----------
df = pd.read_csv(VIS_CSV)

# ---------- 2) normalise column names ----------
df.columns = (
    df.columns.str.strip()
              .str.replace(r"\s+", "_", regex=True)
              .str.replace(".", "", regex=False)  # remove literal dots from "Mt. Buller"
              .str.lower()
)
# expect: year, week, mt_baw_baw, mt_stirling, ..., charlotte_pass

# ---------- 3) reshape to tidy ----------
# One row per (year, week, resort) instead of one column per resort.
id_cols = ["year", "week"]
value_cols = [c for c in df.columns if c not in id_cols]
vis = df.melt(id_vars=id_cols, value_vars=value_cols,
              var_name="resort", value_name="visitors")

# Clean resort labels ("mt_buller" -> "Mt Buller").
vis["resort"] = vis["resort"].str.replace("_", " ").str.title()

# Ensure numeric visitors: unparseable cells become NaN, negatives are
# clipped to 0, and the column is kept as float64 for the imputation below.
vis["visitors"] = (
    pd.to_numeric(vis["visitors"], errors="coerce")
      .clip(lower=0)
      .astype("float64")
)

# ---------- 4) impute zeros from the previous year ----------
# Replace zero values (e.g. the 2020 W10-15 closure) with the average of
# the previous year's same week and the previous year's week before, for
# the same resort. Note the rule is applied to EVERY zero that has
# previous-year data, not only to 2020.

# Base lookup table
base = vis[["year", "week", "resort", "visitors"]].copy()

# 4a) Prev year, same week
prev_same = base.rename(
    columns={"year": "py_year", "week": "py_week", "visitors": "v_prev_same"}
)
m1 = vis.copy()
m1["year_minus1"] = m1["year"] - 1
m1 = m1.merge(
    prev_same,
    left_on=["resort", "year_minus1", "week"],
    right_on=["resort", "py_year", "py_week"],
    how="left",
).drop(columns=["py_year", "py_week"])

# 4b) Prev year, week-1  (week 1 -> 0 won't match; that's OK)
prev_before = base.rename(
    columns={"year": "py_year2", "week": "py_week2", "visitors": "v_prev_before"}
)
m1["week_minus1"] = m1["week"] - 1
m2 = m1.merge(
    prev_before,
    left_on=["resort", "year_minus1", "week_minus1"],
    right_on=["resort", "py_year2", "py_week2"],
    how="left",
).drop(columns=["py_year2", "py_week2"])

# 4c) Row-wise mean of whichever previous-year values are available
#     (NaNs are skipped; the result is NaN only when both are missing).
m2["imputed_prev_year_mean"] = m2[["v_prev_same", "v_prev_before"]].mean(axis=1)

# 4d) Replace zeros where we have an imputed value (keep dtype float here)
mask_zero = m2["visitors"].eq(0)
mask_have = m2["imputed_prev_year_mean"].notna()
m2.loc[mask_zero & mask_have, "visitors"] = m2.loc[
    mask_zero & mask_have, "imputed_prev_year_mean"
]

# Commit back to vis (still float), dropping the helper columns.
vis = m2[["year", "week", "resort", "visitors"]].copy()

# FOR THE FIRST YEAR, DONT HAVE PREVIOUS YEAR DATA, SO TOOK AVERAGE OF THE PAST 2 WEEKS FOR CURRENT WEEK
def fill_first_year_with_prev2weeks(df: pd.DataFrame) -> pd.DataFrame:
    """Fill zero-visitor weeks of the earliest year from the two prior weeks.

    The earliest year in ``df`` has no previous year to borrow from, so each
    zero in that year is replaced by the mean of the same resort's visitors
    in the one or two weeks immediately before it (same year).

    Rows are processed in (resort, year, week) order and written back as they
    are filled, so a value filled for one week can feed into the fill of the
    following weeks.

    Args:
        df: Tidy table with ``resort``, ``year``, ``week`` and ``visitors``
            columns.

    Returns:
        A sorted copy of ``df`` (original index labels kept) with the fills
        applied. The input frame is not modified. A zero is left unchanged
        when there are no prior weeks, or when the prior weeks hold only
        missing values.
    """
    df = df.sort_values(["resort", "year", "week"]).copy()
    first_year = df["year"].min()
    in_first_year = df["year"] == first_year  # loop-invariant
    target = in_first_year & (df["visitors"] == 0)

    for idx, resort, week in df.loc[target, ["resort", "week"]].itertuples(name=None):
        week = int(week)
        window = (
            in_first_year
            & (df["resort"] == resort)
            & df["week"].between(week - 2, week - 1)
        )
        # Drop missing values first: averaging an all-NaN window would
        # overwrite a known 0 with NaN.
        prior = df.loc[window, "visitors"].dropna()
        if not prior.empty:
            df.at[idx, "visitors"] = prior.mean()   # float assign is OK
    return df

vis = fill_first_year_with_prev2weeks(vis)

# ---------- FINALIZE DTYPE ----------
# Only after all imputations: round and cast to Int64 (nullable), so that
# rows that are still missing stay <NA> instead of breaking the cast.
vis["visitors"] = vis["visitors"].round().astype("Int64")
print("Remaining zeros after imputation:", int((vis["visitors"] == 0).sum()))

# ---------- DIFF LOG: which zeros changed? ----------
# Re-melt the raw table with the same resort labelling as `vis` so the two
# can be joined on (year, week, resort).
before = df.melt(id_vars=["year", "week"],
                 value_vars=[c for c in df.columns if c not in ["year", "week"]],
                 var_name="resort", value_name="visitors_before")
before["resort"] = before["resort"].str.replace("_", " ").str.title()
# Coerce the same way `vis` was coerced, so the "== 0" test below is a
# numeric comparison even if the raw column was read as text.
before["visitors_before"] = pd.to_numeric(before["visitors_before"], errors="coerce")

chg = vis.merge(before, on=["year", "week", "resort"], how="left")
# A replacement is a cell that was 0 in the raw data and is now a non-zero,
# non-missing value.
changed = chg[
    (chg["visitors_before"] == 0)
    & (chg["visitors"].notna())
    & (chg["visitors"] != 0)
]
print(f"Replacements made: {len(changed)}")
print(changed.head(20)[["resort", "year", "week", "visitors_before", "visitors"]])

# Write to the output path declared next to the input path (OUT_TIDY is
# data/visitation_tidy_imputed.csv) instead of repeating the literal.
vis.to_csv(OUT_TIDY, index=False)

In [None]:
# cleaning climate data

# trim pre-2014 data
climate_filtered_df = climate_df[climate_df['Year'] >= 2014]

# get ONLY snow seasons for each year: 9 June through 21 September
in_snow_season = (
    ((climate_filtered_df['Month'] == 6) & (climate_filtered_df['Day'] >= 9))
    | climate_filtered_df['Month'].isin([7, 8])
    | ((climate_filtered_df['Month'] == 9) & (climate_filtered_df['Day'] <= 21))
)
# Take an explicit copy: columns are added below, and assigning onto a
# filtered slice of climate_df raises SettingWithCopyWarning.
climate_filtered_df = climate_filtered_df[in_snow_season].copy()

# datetime processing
# A fixed dummy year (2000) is used so the date only encodes the position
# within the season, independent of the observation year.
climate_filtered_df['Date'] = pd.to_datetime(
    dict(year=2000, month=climate_filtered_df['Month'], day=climate_filtered_df['Day'])
)

# adding a week column for each date: 9 June is day 0 -> Week 1,
# and every further 7 days starts the next week
start_date = pd.Timestamp(year=2000, month=6, day=9)
climate_filtered_df['DaysSinceJune9'] = (climate_filtered_df['Date'] - start_date).dt.days
climate_filtered_df['Week'] = (climate_filtered_df['DaysSinceJune9'] // 7) + 1
# the helper columns are no longer needed once Week exists
climate_filtered_df = climate_filtered_df.drop(columns=['DaysSinceJune9', 'Date'])

# measurement columns that are averaged per week and later imputed
cols_to_fill = [
    'Maximum temperature (Degree C)',
    'Minimum temperature (Degree C)',
    'Rainfall amount (millimetres)'
]

# averaging over each week (per station and year)
week_av_clim_df = climate_filtered_df.groupby(
    ['Bureau of Meteorology station number', 'Year', 'Week']
)[cols_to_fill].mean().reset_index()

# deal with remaining NaN that weren't removed by averaging over each week
# imputing on median (reduces impact of outlier weather conditions):
# each gap is filled with the median of the same station and week
# across all years
station_week_clim_avg = week_av_clim_df.groupby(
    ['Bureau of Meteorology station number', 'Week']
)[cols_to_fill].transform('median')
week_av_clim_df[cols_to_fill] = week_av_clim_df[cols_to_fill].fillna(station_week_clim_avg)

1210
