# Run

In [None]:
# Install the packages listed in requirements.txt using IPython's %pip magic.
%pip install -r requirements.txt

In [1]:
import os
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from utils.load_dotenv import load_dotenv

In [3]:
# Call the project's load_dotenv helper (imported from utils.load_dotenv).
# NOTE(review): presumably this loads a .env file into the process environment
# that the os.getenv calls below read from — confirm against the helper.
load_dotenv()

True

In [4]:
# Directory locations are supplied through environment variables.
RAW_DIR = os.environ.get("RAW_DIR")
PROCESSED_DIR = os.environ.get("PROCESSED_DIR")
# NOTE: the environment variable name is plural ("DATASETS_DIR") while the
# Python constant is singular.
DATASET_DIR = os.environ.get("DATASETS_DIR")

# Location of the weather CSV; both names are bound to the same path.
output_path = os.path.join(DATASET_DIR, "weather_sj_2010_2017.csv")
WEATHER_DATASET = output_path

### Data Processing

In [None]:
# Merge every raw CSV export in RAW_DIR into one chronologically sorted dataset.
# An unset RAW_DIR (None) is reported the same way as a missing directory
# instead of surfacing as a TypeError from os.path.exists.
if not RAW_DIR or not os.path.exists(RAW_DIR):
    raise FileNotFoundError(f"Directory {RAW_DIR} not found")

# exist_ok=True already tolerates an existing directory, so no pre-check is needed.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# glob returns matches in arbitrary order; sort so the load order is reproducible.
csv_files = sorted(glob.glob(os.path.join(RAW_DIR, "*.csv")))
if not csv_files:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {RAW_DIR}")

dfs = [pd.read_csv(file) for file in csv_files]
merged_df = pd.concat(dfs, ignore_index=True)
merged_df['datetime'] = pd.to_datetime(merged_df['datetime'])  # Adjust column name if different
merged_df = merged_df.sort_values('datetime')

# Write to the path that is announced below and read back by the validation
# cells. The previous target, PROCESSED_FILE, was never defined (NameError).
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
merged_df.to_csv(output_path, index=False)

print(f"Successfully merged CSVs saved to {output_path}")

### Data Validation

In [None]:
# Load the CSV at WEATHER_DATASET into weather_df, the frame used by the checks below.
weather_df = pd.read_csv(WEATHER_DATASET)

In [None]:
# info() prints its summary and returns None, so run it as a statement and
# leave head() as the cell's final expression; a tuple of the two would be
# displayed as "(None, <frame>)".
weather_df.info()
weather_df.head()

In [None]:
# Heatmap of the boolean NULL mask (rows x columns) to spot missing values
null_mask = weather_df.isnull()
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("NULL Values Heatmap")
plt.show()

In [None]:
# Count the rows that exactly repeat an earlier row
duplicate_row_mask = weather_df.duplicated()
duplicates = duplicate_row_mask.sum()
print(f"Number of duplicated rows: {duplicates}")

In [None]:
# Count the "datetime" values that repeat an earlier entry
repeated_datetime_mask = weather_df["datetime"].duplicated()
duplicate_dates = repeated_datetime_mask.sum()
print(f"Number of duplicated dates: {duplicate_dates}")

In [None]:
# Check for skipped dates
# Parse the column once and work on real datetimes: min()/max() on the raw
# string column are lexicographic and only correct for ISO-formatted text.
# normalize() drops any time-of-day so the comparison is per calendar day.
observed_dates = pd.to_datetime(weather_df["datetime"]).dt.normalize()
start_date = observed_dates.min()
end_date = observed_dates.max()

# Every calendar day between the first and last observation, inclusive.
date_range = pd.date_range(start=start_date, end=end_date, freq="D")
missing_dates = date_range[~date_range.isin(observed_dates)]
missing_dates_list = missing_dates.strftime("%Y-%m-%d").tolist()

print(f"Missing dates: {len(missing_dates_list)}")

In [None]:
# Check for features with outliers, using the 1.5 * IQR fences per numeric column

outlier_features = []
for col in weather_df.select_dtypes(include=["number"]).columns:
    q1 = weather_df[col].quantile(0.25)
    q3 = weather_df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # NaN compares False on both sides, so missing values are never flagged.
    if ((weather_df[col] < lower_bound) | (weather_df[col] > upper_bound)).any():
        outlier_features.append(col)

print(f"Features with outliers: {outlier_features}")

# Only plot when something was flagged: with no features the figure width
# would be 0 and boxplot would receive no data.
if outlier_features:
    # Roughly one inch per feature, but never so narrow that one or two
    # boxes become unreadable.
    plt.figure(figsize=(max(6, len(outlier_features)), 5))

    plt.boxplot([weather_df[col].dropna() for col in outlier_features], tick_labels=outlier_features)
    plt.xticks(rotation=45)
    plt.title('Features with Outliers')
    plt.ylabel('Value')
    plt.grid(axis='y')

    plt.tight_layout()
    plt.show()