In [30]:
import pandas as pd
file_path = 'weatherAUS.csv'
Aus_weather = pd.read_csv(file_path)
Aus_weather.shape

(145460, 23)

In [40]:
Aus_weather['Date'] = pd.to_datetime(Aus_weather['Date'])

In [41]:
def get_season(date):
    month = date.month
    if month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'Winter'

In [42]:
Aus_weather['Season'] = Aus_weather['Date'].apply(get_season)
df_spring = Aus_weather[Aus_weather['Season'] == 'Spring']
df_summer = Aus_weather[Aus_weather['Season'] == 'Summer']
df_autumn = Aus_weather[Aus_weather['Season'] == 'Autumn']
df_winter = Aus_weather[Aus_weather['Season'] == 'Winter']

In [43]:
len(df_spring), len(df_summer), len(df_autumn), len(df_winter)

(38264, 36737, 35337, 35122)

In [31]:
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [44]:
def preprocess_weather_data(df):
    # Impute missing values for numerical columns
    num_cols = df.select_dtypes(include=[np.number]).columns
    imputer_num = SimpleImputer(strategy='median')
    df[num_cols] = imputer_num.fit_transform(df[num_cols])

    # Drop certain categorical columns
    df = df.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date', 'Season'], axis=1)

    # Impute missing values for remaining categorical columns
    cat_cols = df.select_dtypes(include=['object']).columns
    imputer_cat = SimpleImputer(strategy='most_frequent')
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

    # Encode categorical variables
    label_encoders = {}
    for column in cat_cols:
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column].astype(str))
        label_encoders[column] = label_encoder

    return df, label_encoders

In [None]:
import pandas as pd
from imblearn.under_sampling import TomekLinks

In [48]:
def undersampling(data, target, i):
    data, _ = preprocess_weather_data(data)
    X = data.drop(target, axis=1)
    y = data[target]
    tl = TomekLinks()
    X_resampled, y_resampled = tl.fit_resample(X, y)
    resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
    resampled_data[target] = y_resampled
    resampled_data.to_csv(f'{i}.csv', index=False)


In [None]:
seasonable_datas = [df_spring, df_summer, df_autumn, df_winter]
for i in range(4):
    undersampling(seasonable_datas[i], 'RainTomorrow', i)

In [60]:
resampled_df = pd.read_csv('weatherAUS_resampled.csv')
resampled_df.shape
print(resampled_df[resampled_df['RainTomorrow']==1].shape)
print(resampled_df[resampled_df['RainTomorrow']==0].shape)

(31877, 19)
(107654, 19)


In [61]:
X = resampled_df.drop(['RainTomorrow'], axis=1)
y = resampled_df['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [62]:
best_params = {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
model = RandomForestClassifier(**best_params,random_state=42)
# model.fit(X_train_pca, y_train)
# y_pred_pca = model.predict(X_test_pca)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# accuracy_pca = accuracy_score(y_test, y_pred_pca)
# classification_rep_pca = classification_report(y_test, y_pred_pca)
accuracy_pca = accuracy_score(y_test, y_pred)
classification_rep_pca = classification_report(y_test, y_pred)
print(classification_rep_pca)

              precision    recall  f1-score   support

           0       0.87      0.95      0.91     21436
           1       0.77      0.53      0.63      6471

    accuracy                           0.86     27907
   macro avg       0.82      0.74      0.77     27907
weighted avg       0.85      0.86      0.85     27907



In [77]:
import xgboost as xgb

In [78]:
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=200, learning_rate=0.1, max_depth=5)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# model.fit(X_train_pca, y_train)
# y_pred = model.predict(X_test_pca)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep_pca = classification_report(y_test, y_pred)
print(classification_rep_pca)

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      5397
           1       0.77      0.52      0.62      1371

    accuracy                           0.87      6768
   macro avg       0.83      0.74      0.77      6768
weighted avg       0.86      0.87      0.86      6768



In [72]:
spring_df = pd.read_csv('3.csv')
print(spring_df[spring_df['RainTomorrow']==1].shape)
print(spring_df[spring_df['RainTomorrow']==0].shape)

(6955, 18)
(26884, 18)


In [73]:
X = spring_df.drop(['RainTomorrow'], axis=1)
y = spring_df['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [74]:
best_params = {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
model = RandomForestClassifier(**best_params,random_state=42)
# model.fit(X_train_pca, y_train)
# y_pred_pca = model.predict(X_test_pca)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# accuracy_pca = accuracy_score(y_test, y_pred_pca)
# classification_rep_pca = classification_report(y_test, y_pred_pca)
accuracy_pca = accuracy_score(y_test, y_pred)
classification_rep_pca = classification_report(y_test, y_pred)
print(classification_rep_pca)

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      5397
           1       0.76      0.48      0.58      1371

    accuracy                           0.86      6768
   macro avg       0.82      0.72      0.75      6768
weighted avg       0.85      0.86      0.85      6768



In [76]:
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=200, learning_rate=0.1, max_depth=5)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# model.fit(X_train_pca, y_train)
# y_pred = model.predict(X_test_pca)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep_pca = classification_report(y_test, y_pred)
print(classification_rep_pca)

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      5397
           1       0.77      0.52      0.62      1371

    accuracy                           0.87      6768
   macro avg       0.83      0.74      0.77      6768
weighted avg       0.86      0.87      0.86      6768

