In [322]:
import pandas as pd

In [323]:
file_path = 'weatherAUS.csv'
Aus_weather = pd.read_csv(file_path)

In [324]:
# coastal_cities = ['Adelaide', 'Albany', 'Brisbane', 'Cairns', 'CoffsHarbour', 
#                   'Darwin', 'GoldCoast', 'Hobart', 'Melbourne', 'MelbourneAirport',
#                   'Newcastle', 'NorahHead', 'NorfolkIsland', 'Perth', 'PerthAirport',
#                   'Portland', 'Sydney', 'SydneyAirport', 'Townsville', 'Williamtown', 'Wollongong']
# Aus_weather = Aus_weather[Aus_weather['Location'].isin(coastal_cities)]

In [325]:
Aus_weather.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [326]:
Aus_weather['Date'] = pd.to_datetime(Aus_weather['Date'])

In [327]:
def get_season(date):
    month = date.month
    if month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'Winter'

In [328]:
Aus_weather['Season'] = Aus_weather['Date'].apply(get_season)
df_spring = Aus_weather[Aus_weather['Season'] == 'Spring']
df_summer = Aus_weather[Aus_weather['Season'] == 'Summer']
df_autumn = Aus_weather[Aus_weather['Season'] == 'Autumn']
df_winter = Aus_weather[Aus_weather['Season'] == 'Winter']

In [329]:
len(df_spring), len(df_summer), len(df_autumn), len(df_winter)

(38264, 36737, 35337, 35122)

In [330]:
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [331]:
def preprocess_weather_data(df):
    # Impute missing values for numerical columns
    num_cols = df.select_dtypes(include=[np.number]).columns
    imputer_num = SimpleImputer(strategy='median')
    df[num_cols] = imputer_num.fit_transform(df[num_cols])

    # Drop certain categorical columns
    df = df.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date', 'Season'], axis=1)

    # Impute missing values for remaining categorical columns
    cat_cols = df.select_dtypes(include=['object']).columns
    imputer_cat = SimpleImputer(strategy='most_frequent')
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

    # Encode categorical variables
    label_encoders = {}
    for column in cat_cols:
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column].astype(str))
        label_encoders[column] = label_encoder

    return df, label_encoders
processed_weather_data, encoders = preprocess_weather_data(df_winter)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [332]:
processed_weather_data.columns

Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustDir',
       'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [333]:
# processed_weather_data.drop(['MinTemp','MaxTemp','WindGustDir','WindDir3pm','WindDir9am','Pressure9am',
#        'Pressure3pm','Location'],axis=1)
processed_weather_data.drop(['Location'],axis=1)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,13,44.0,13,14,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,0
1,7.4,25.1,0.0,14,44.0,6,15,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,0
2,12.9,25.7,0.0,15,46.0,13,15,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,0
3,9.2,28.0,0.0,4,24.0,9,0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,0
4,17.5,32.3,1.0,13,41.0,1,7,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145338,19.3,36.7,0.0,9,50.0,0,9,30.0,17.0,29.0,15.0,1019.4,1014.6,26.3,35.0,0,0
145339,18.1,35.9,0.0,0,43.0,2,1,26.0,20.0,29.0,16.0,1017.2,1012.9,25.8,34.3,0,0
145340,19.3,37.5,0.0,2,37.0,0,1,24.0,17.0,23.0,13.0,1013.7,1009.4,27.1,36.3,0,0
145341,17.5,38.0,0.0,10,37.0,0,11,28.0,11.0,18.0,10.0,1013.5,1009.6,26.8,36.8,0,0


In [334]:
X = processed_weather_data.drop(['RainTomorrow'], axis=1)
y = processed_weather_data['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [335]:
best_params = {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
model = RandomForestClassifier(**best_params,random_state=42)
# model.fit(X_train_pca, y_train)
# y_pred_pca = model.predict(X_test_pca)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# accuracy_pca = accuracy_score(y_test, y_pred_pca)
# classification_rep_pca = classification_report(y_test, y_pred_pca)
accuracy_pca = accuracy_score(y_test, y_pred)
classification_rep_pca = classification_report(y_test, y_pred)
print(classification_rep_pca)

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      5676
           1       0.73      0.45      0.55      1349

    accuracy                           0.86      7025
   macro avg       0.80      0.70      0.74      7025
weighted avg       0.85      0.86      0.85      7025



In [336]:
import xgboost as xgb

In [337]:
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=200, learning_rate=0.1, max_depth=5)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
# model.fit(X_train_pca, y_train)
# y_pred = model.predict(X_test_pca)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep_pca = classification_report(y_test, y_pred)
print(classification_rep_pca)

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      5676
           1       0.72      0.47      0.57      1349

    accuracy                           0.86      7025
   macro avg       0.80      0.72      0.75      7025
weighted avg       0.85      0.86      0.85      7025



In [338]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming coastal_data is your preprocessed dataset


# Split the data into training and test sets (70% training, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.astype(np.float32))
y_train_tensor = torch.tensor(y_train.values.astype(np.float32))
X_test_tensor = torch.tensor(X_test_scaled.astype(np.float32))
y_test_tensor = torch.tensor(y_test.values.astype(np.float32))

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model definition
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(X_train_scaled.shape[1], 128)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.sigmoid(self.fc3(x))
        return x

model = NeuralNet()

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs).squeeze()  # Adjusting the output shape
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
y_pred = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs).squeeze()  # Adjusting the output shape
        y_pred.extend(outputs.round().numpy())

# Convert predictions to a format suitable for classification report
y_pred = np.array(y_pred).flatten()

# Compute classification report
classification_rep = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(classification_rep)
