In [31]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.svm import SVC # "Support vector classifier"

In [32]:
# Method to load .csv files
def load_data(filename, low_memory=False):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.
    low_memory : bool, default False
        Forwarded to ``pandas.read_csv``. pandas' own default (True) parses
        the file in chunks and infers each chunk's dtype separately, which
        can leave a single column holding mixed types and emits a
        ``DtypeWarning``. Reading with ``low_memory=False`` infers one dtype
        per column from the whole file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(filename, low_memory=low_memory)

# Data Cleaning

In [33]:
# Read the three raw CSV files into DataFrames (paths are relative file names).
summary_weather = load_data(filename="Summary of Weather.csv")
weather_stations = load_data(filename="Weather Station Locations.csv")
weather_aus = load_data(filename="weatherAUS.csv")

  data = pd.read_csv(filename)


### "Summary of Weather.csv" cleaning

In [34]:
# Summary of Weather.csv: parse the Date column first. With errors='coerce',
# entries that cannot be parsed as dates become NaT instead of raising.
summary_weather['Date'] = pd.to_datetime(summary_weather['Date'], errors='coerce')

# Probe every remaining column by casting it to float. A column that casts
# cleanly is stored back as float; a column that raises ValueError is reported
# and left unchanged.
for column in summary_weather.columns:
    if column == "Date":
        # Date was handled above and is intentionally not cast to float.
        continue
    try:
        summary_weather[column] = summary_weather[column].astype(float)
    except ValueError:
        print(f"Column {column} contains non-numeric values.")



Column Precip contains non-numeric values.
Column Snowfall contains non-numeric values.
Column PoorWeather contains non-numeric values.
Column PRCP contains non-numeric values.
Column SNF contains non-numeric values.
Column TSHDSBRSGF contains non-numeric values.


In [35]:
# Fixing/removing columns

# Precip and PRCP: replace the 'T' marker with 0 so the columns can be cast to
# float afterwards.
# NOTE(review): 'T' presumably denotes trace precipitation — confirm against the dataset description.
summary_weather['Precip'] = summary_weather['Precip'].replace(to_replace='T', value=0)
summary_weather['PRCP'] = summary_weather['PRCP'].replace(to_replace='T', value=0)

# Columns that are removed outright rather than repaired:
#   PoorWeather, Snowfall, SNF, TSHDSBRSGF, STA, YR, DA
# After that, drop every column that contains only missing values (axis=1, how='all').
summary_weather = summary_weather.drop(
    columns=['PoorWeather', 'Snowfall', 'SNF', 'TSHDSBRSGF', 'STA', 'YR', 'DA']
).dropna(axis=1, how='all')

In [36]:
# Re-run the float cast on every column except Date and remember which columns
# still fail, so that success is only reported when every cast worked.
non_numeric_columns = []
for column in summary_weather.columns:
    if column == "Date":
        continue
    try:
        summary_weather[column] = summary_weather[column].astype(float)
    except ValueError:
        print(f"Column {column} contains non-numeric values.")
        non_numeric_columns.append(column)

# Only claim the CSV is fixed when no column raised during conversion.
if non_numeric_columns:
    print(f"CSV not fixed: {len(non_numeric_columns)} column(s) are still non-numeric.")
else:
    print("CSV fixed")

CSV fixed


### "Weather Station Locations.csv" cleaning

In [37]:
# Probe each weather_stations column by casting it to float: columns that cast
# cleanly are stored back as float, columns that raise ValueError are reported
# and left unchanged.
for column in list(weather_stations.columns):
    try:
        weather_stations[column] = weather_stations[column].astype(float)
    except ValueError:
        print(f"Column {column} contains non-numeric values.")

Column NAME contains non-numeric values.
Column STATE/COUNTRY ID contains non-numeric values.
Column LAT contains non-numeric values.
Column LON contains non-numeric values.


In [38]:
# Remove the columns that are not needed downstream, in a single drop call:
# WBAN, NAME, STATE/COUNTRY ID, LAT and LON.
weather_stations = weather_stations.drop(
    columns=['WBAN', 'NAME', 'STATE/COUNTRY ID', 'LAT', 'LON']
)


In [39]:
# Re-run the float cast on every remaining column and remember which columns
# still fail, so that success is only reported when every cast worked.
non_numeric_columns = []
for column in weather_stations.columns:
    try:
        weather_stations[column] = weather_stations[column].astype(float)
    except ValueError:
        print(f"Column {column} contains non-numeric values.")
        non_numeric_columns.append(column)

# Only claim the CSV is fixed when no column raised during conversion.
if non_numeric_columns:
    print(f"CSV not fixed: {len(non_numeric_columns)} column(s) are still non-numeric.")
else:
    print("CSV fixed")

CSV fixed


### "weatherAUS.csv" cleaning

In [40]:
# Probe each weather_aus column by casting it to float: columns that cast
# cleanly are stored back as float, columns that raise ValueError are reported
# and left unchanged.
for column in list(weather_aus.columns):
    try:
        weather_aus[column] = weather_aus[column].astype(float)
    except ValueError:
        print(f"Column {column} contains non-numeric values.")

Column Date contains non-numeric values.
Column Location contains non-numeric values.
Column WindGustDir contains non-numeric values.
Column WindDir9am contains non-numeric values.
Column WindDir3pm contains non-numeric values.
Column RainToday contains non-numeric values.
Column RainTomorrow contains non-numeric values.


In [41]:
# Remove the columns that are not needed downstream, in a single drop call:
# Date, Location, WindGustDir, WindDir9am and WindDir3pm.
weather_aus = weather_aus.drop(
    columns=['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
)

In [42]:
# RainToday and RainTomorrow serve as the ground-truth labels.

# Names of the label columns; they are removed from the feature frame below.
target_columns = ['RainToday', 'RainTomorrow']

# Encode 'Yes' as 1 and 'No' as 0. Any value not covered by the mapping
# (including missing entries) becomes NaN under .map and is then set to 0.
# NOTE(review): this labels rows with a missing target as "no rain" — confirm that is intended.
gt_today = weather_aus['RainToday'].map({'Yes': 1, 'No': 0}).fillna(0)
gt_tomorrow = weather_aus['RainTomorrow'].map({'Yes': 1, 'No': 0}).fillna(0)

# Keep only the feature columns in weather_aus.
weather_aus = weather_aus.drop(columns=target_columns)

In [44]:
# Re-run the float cast on every remaining column and remember which columns
# still fail, so that success is only reported when every cast worked.
non_numeric_columns = []
for column in weather_aus.columns:
    try:
        weather_aus[column] = weather_aus[column].astype(float)
    except ValueError:
        print(f"Column {column} contains non-numeric values.")
        non_numeric_columns.append(column)

# Only claim the CSV is fixed when no column raised during conversion.
if non_numeric_columns:
    print(f"CSV not fixed: {len(non_numeric_columns)} column(s) are still non-numeric.")
else:
    print("CSV fixed")

CSV fixed


### Combining the CSV files into one DataFrame

In [47]:
# Place the three cleaned frames side by side (axis=1). pd.concat aligns rows on
# the index, so positions missing from a shorter frame come out as NaN; every
# NaN is then replaced with 0, including in the Date column.
# NOTE(review): the frames come from three different CSV files and are matched
# purely by row index here — confirm this alignment is intended.
combined_df = pd.concat(
    [summary_weather, weather_stations, weather_aus],
    axis=1,
).fillna(0)

# Inputs for the models: the combined feature frame and the two label series.
print(combined_df)
print(gt_today)
print(gt_tomorrow)

                       Date  Precip  WindGustSpd    MaxTemp    MinTemp  \
0       1942-07-01 00:00:00   1.016          0.0  25.555556  22.222222   
1       1942-07-02 00:00:00   0.000          0.0  28.888889  21.666667   
2       1942-07-03 00:00:00   2.540          0.0  26.111111  22.222222   
3       1942-07-04 00:00:00   2.540          0.0  26.666667  22.222222   
4       1942-07-05 00:00:00   0.000          0.0  26.666667  21.666667   
...                     ...     ...          ...        ...        ...   
145455                    0   0.000          0.0   0.000000   0.000000   
145456                    0   0.000          0.0   0.000000   0.000000   
145457                    0   0.000          0.0   0.000000   0.000000   
145458                    0   0.000          0.0   0.000000   0.000000   
145459                    0   0.000          0.0   0.000000   0.000000   

         MeanTemp   MO  PRCP   DR  SPD  ...  WindSpeed9am  WindSpeed3pm  \
0       23.888889  7.0  0.04  0.0  0