In [2]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Input and output folders.
# Raw strings keep the backslashes literal: the plain "data\All" spelling relies
# on an invalid escape sequence, which Python reports with a warning and which
# is scheduled to become an error.
input_folder = r"data\All"
output_folder = r"data\cleaned_All"

# Make sure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Positions of the columns to drop (0-indexed, i.e., 1st=0, 2nd=1, etc.)
cols_to_drop = [0, 1, 4, 6, 8, 9]

# Process every CSV file in the input folder; sorting gives a stable order
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(input_folder, file_name)
    df = pd.read_csv(file_path)

    # A file with too few columns cannot be cleaned by position; fail with the
    # offending file name instead of a bare positional IndexError.
    if max(cols_to_drop) >= df.shape[1]:
        raise IndexError(
            f"'{file_path}' has only {df.shape[1]} columns; "
            f"cannot drop column positions {cols_to_drop}"
        )

    # Drop the specified columns (resolved from position to label)
    df = df.drop(columns=df.columns[cols_to_drop])

    # No further preprocessing (categorical conversion, standardisation) is
    # applied here; the cleaned file only differs by the dropped columns.

    # Save cleaned CSV to output folder under the same file name
    output_path = os.path.join(output_folder, file_name)
    df.to_csv(output_path, index=False)

print("Processing complete. Cleaned files are saved in:", output_folder)


  input_folder = "data\All"
  output_folder = "data\cleaned_All"


Processing complete. Cleaned files are saved in: data\cleaned_All


In [5]:
import os
import pandas as pd

# Input and output folders
input_folder = r"data\cleaned_station_data"
output_folder = r"data\cleaned_station_combined_data"

# Sub-folders of input_folder to process; each yields one combined CSV
input_folders = ["-27.75_151.95", "-27.60_151.95", "-27.45_151.95",
                 "-27.90_152.10", "-27.75_152.10", "-27.60_152.10",
                 "-27.45_152.10", "-27.90_152.25", "-27.75_152.25",
                 "-27.60_152.25", "-27.45_152.25", "-28.05_152.40",
                 "-27.90_152.40", "-27.75_152.40", "-27.60_152.40",
                 "-27.45_152.40", "-27.60_152.55", "-27.45_152.55"]

# Make sure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# For each folder, average the numeric columns element-wise across all of its
# CSV files. Non-numeric columns are taken from the first file (sorted order).
for folder_name in input_folders:
    folder_path = os.path.join(input_folder, folder_name)
    combined_df = None
    num_files = 0

    if not os.path.exists(folder_path):
        print(f"Folder '{folder_name}' does not exist.")
        continue

    # Sorted so the "first file" (source of non-numeric columns) is stable
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))
    num_files = len(csv_files)

    if num_files == 0:
        # Previously this case was skipped without any message
        print(f"Folder '{folder_name}' contains no CSV files; nothing to combine.")
        continue

    for file_name in csv_files:
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)

        if combined_df is None:
            # First file seeds the running sum and supplies non-numeric columns
            combined_df = df.copy()
            continue

        # The running sum aligns on row index and column labels, so a file with
        # a different layout would silently distort the result; make it visible.
        if df.shape != combined_df.shape or list(df.columns) != list(combined_df.columns):
            print(
                f"Warning: '{file_path}' does not match the layout of the first "
                f"file in '{folder_name}' (shape {df.shape} vs {combined_df.shape}); "
                "the element-wise average may be wrong or contain NaN."
            )

        numeric_part = df.select_dtypes(include='number')
        combined_df[numeric_part.columns] += numeric_part

    # Turn the running sum into a mean over the files read
    numeric_cols = combined_df.select_dtypes(include='number').columns
    combined_df[numeric_cols] = combined_df[numeric_cols] / num_files

    output_file = os.path.join(output_folder, f"{folder_name}_combined.csv")
    combined_df.to_csv(output_file, index=False)
    print(f"Combined CSV for folder '{folder_name}' saved to '{output_file}'")


Combined CSV for folder '-27.75_151.95' saved to 'data\cleaned_station_combined_data\-27.75_151.95_combined.csv'
Combined CSV for folder '-27.60_151.95' saved to 'data\cleaned_station_combined_data\-27.60_151.95_combined.csv'
Combined CSV for folder '-27.45_151.95' saved to 'data\cleaned_station_combined_data\-27.45_151.95_combined.csv'
Combined CSV for folder '-27.90_152.10' saved to 'data\cleaned_station_combined_data\-27.90_152.10_combined.csv'
Combined CSV for folder '-27.75_152.10' saved to 'data\cleaned_station_combined_data\-27.75_152.10_combined.csv'
Combined CSV for folder '-27.60_152.10' saved to 'data\cleaned_station_combined_data\-27.60_152.10_combined.csv'
Combined CSV for folder '-27.45_152.10' saved to 'data\cleaned_station_combined_data\-27.45_152.10_combined.csv'
Combined CSV for folder '-27.90_152.25' saved to 'data\cleaned_station_combined_data\-27.90_152.25_combined.csv'
Combined CSV for folder '-27.75_152.25' saved to 'data\cleaned_station_combined_data\-27.75_152.

In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from scipy import stats

# Load the combined dataset (replace with your file path)
df = pd.read_csv(r"data\1958-2024\-27.45_151.95_combined.csv")

# --- 1. Missing values ---
# Count nulls in every column before running any detector.
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

# --- 2. Z-score anomalies ---
# Both detectors below operate on the numeric columns only.
numeric_df = df.select_dtypes(include=[np.number])

threshold = 3  # Common threshold for anomalies
z_scores = np.abs(stats.zscore(numeric_df))
# True wherever a value lies more than `threshold` standard deviations from
# its column mean.
anomalies_zscore = z_scores > threshold

print("\nAnomalies detected using Z-score:")
print(anomalies_zscore.sum())

# --- 3. Isolation Forest anomalies ---
# Fixed random_state keeps the labelling reproducible between runs.
iso = IsolationForest(contamination=0.05, random_state=42)
pred = iso.fit_predict(numeric_df)

# Store the label per row: -1 = anomaly, 1 = normal
df['anomaly_isoforest'] = pred

isoforest_counts = df['anomaly_isoforest'].value_counts()
print("\nIsolation Forest anomaly counts:")
print(isoforest_counts)

# Write only the rows flagged as anomalies to a new CSV
is_anomaly = df['anomaly_isoforest'] == -1
df[is_anomaly].to_csv("anomalies.csv", index=False)
print("\nAnomalies saved to anomalies.csv")


Missing values per column:
YYYY-MM-DD    0
daily_rain    0
max_temp      0
min_temp      0
dtype: int64

Anomalies detected using Z-score:
550

Isolation Forest anomaly counts:
anomaly_isoforest
 1    23248
-1     1224
Name: count, dtype: int64

Anomalies saved to anomalies.csv
