In [1]:
import pandas as pd

# Load the dataset
file_path = 'weatherAUS.csv'
# Loading the dataset again to work with the original 'Location' values (string names)
weather_data = pd.read_csv(file_path)



In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# Preprocessing steps for the whole dataset

# # Dropping 'row ID' as it's not needed for model training
# weather_data = weather_data.drop(['row ID'], axis=1)

# Handling missing values for numerical columns
num_cols = weather_data.select_dtypes(include=[np.number]).columns
imputer_num = SimpleImputer(strategy='median')
weather_data[num_cols] = imputer_num.fit_transform(weather_data[num_cols])

# Handling missing values for categorical columns
# For now, we will drop columns with a high percentage of missing values
weather_data = weather_data.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], axis=1)
cat_cols = weather_data.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
weather_data[cat_cols] = imputer_cat.fit_transform(weather_data[cat_cols])



# Encoding categorical variables
label_encoders = {}
for column in cat_cols:
    label_encoder = LabelEncoder()
    weather_data[column] = label_encoder.fit_transform(weather_data[column].astype(str))
    label_encoders[column] = label_encoder

# Adjusting the provided code to extract data for all coastal cities in Australia

# First, let's define the coastal cities as per the original dataset
coastal_cities = ['Adelaide', 'Albany', 'Brisbane', 'Cairns', 'CoffsHarbour', 
                  'Darwin', 'GoldCoast', 'Hobart', 'Melbourne', 'MelbourneAirport',
                  'Newcastle', 'NorahHead', 'NorfolkIsland', 'Perth', 'PerthAirport',
                  'Portland', 'Sydney', 'SydneyAirport', 'Townsville', 'Williamtown', 'Wollongong']

# Now, let's extract the encoded values for these coastal cities
encoded_coastal_cities = {label_encoders['Location'].transform([city])[0] for city in coastal_cities if city in label_encoders['Location'].classes_}

# Extracting data for these coastal cities
coastal_data = weather_data[weather_data['Location'].isin(encoded_coastal_cities)]

# Overview of the processed coastal cities data
coastal_data.head()




Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
9058,427,11,16.1,31.4,0.0,5,54.0,3,5,7.0,37.0,51.0,58.0,1005.9,1002.3,26.5,28.4,0,0
9059,428,11,22.8,24.7,0.0,11,56.0,11,8,35.0,15.0,68.0,67.0,1010.9,1011.4,23.4,24.4,0,1
9060,429,11,20.0,24.1,4.6,9,35.0,9,0,20.0,19.0,70.0,59.0,1019.3,1018.8,21.7,23.7,1,0
9061,430,11,14.8,25.0,0.8,0,24.0,10,2,7.0,17.0,62.0,45.0,1019.5,1017.0,22.5,24.8,0,0
9062,431,11,15.5,27.3,0.0,4,41.0,7,4,7.0,30.0,54.0,62.0,1015.7,1012.7,24.6,26.1,0,0


In [3]:
# Mapping the encoded integer values back to their original string labels in 'Location' column
location_mappings = {index: label for index, label in enumerate(label_encoders['Location'].classes_)}
# location_mappings


In [4]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into features (X) and target variable (y)
X = coastal_data.drop(columns=['RainTomorrow'])
y = coastal_data['RainTomorrow']

# Splitting the data into training, validation, and test sets
# Using 70% of data for training, 15% for validation, and 15% for testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

# Sizes of each dataset
sizes = {
    "Training set size": X_train.shape[0],
    "Validation set size": X_val.shape[0],
    "Test set size": X_test.shape[0]
}

sizes


{'Training set size': 45365,
 'Validation set size': 9721,
 'Test set size': 9722}

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Initialize Logistic Regression model
logreg = LogisticRegression(solver='liblinear', random_state=42)

# Parameters for tuning
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # Regularization parameter
    'penalty': ['l1', 'l2'] # Norm used in the penalization
}

# Grid search with cross-validation for parameter tuning
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_val, y_val)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Train the model with the best parameters on the full training set
logreg_optimized = LogisticRegression(**best_params, solver='liblinear', random_state=42)
logreg_optimized.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg_optimized.predict(X_test)

# Evaluate the optimized model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Printing the best parameters found by Grid Search
print("Best Parameters from Grid Search:", best_params)

# Printing the best score achieved on the validation set during Grid Search
print("Best Score from Grid Search on Validation Set:", best_score)

# Printing the accuracy of the optimized model on the test set
print("Accuracy of Optimized Model on Test Set:", accuracy)

# Printing the detailed classification report for the test set predictions
print("Classification Report for Test Set:")
print(classification_rep)



In [7]:
# Assuming best_params are already known from your previous GridSearch
best_params = {'C': 0.1, 'penalty': 'l1'}  # Replace with your actual best parameters

# Splitting the dataset into features (X) and target variable (y)
X = coastal_data.drop(columns=['RainTomorrow'])
y = coastal_data['RainTomorrow'].astype('int')  # Ensure y is of integer type

# Splitting the data into a 70% training set and a 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the Logistic Regression model with the best parameters
logreg_optimized = LogisticRegression(**best_params, solver='liblinear', random_state=42)
logreg_optimized.fit(X_train, y_train)

# Predicting on the new test set
y_pred = logreg_optimized.predict(X_test)

# Evaluating the model on the new test set
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Printing the results
print("Accuracy of Optimized Model on New Test Set:", accuracy)
print("Classification Report for New Test Set:")
print(classification_rep)


Accuracy of Optimized Model on New Test Set: 0.8133262823902697
Classification Report for New Test Set:
              precision    recall  f1-score   support

           0       0.83      0.94      0.88      9765
           1       0.73      0.46      0.56      3472

    accuracy                           0.81     13237
   macro avg       0.78      0.70      0.72     13237
weighted avg       0.80      0.81      0.80     13237



In [8]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree Classifier
dtree = DecisionTreeClassifier(random_state=42)

# Parameters for tuning
param_grid_dtree = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with cross-validation for parameter tuning
grid_search_dtree = GridSearchCV(dtree, param_grid_dtree, cv=5, scoring='accuracy')
grid_search_dtree.fit(X_val, y_val)

# Best parameters and best score
best_params_dtree = grid_search_dtree.best_params_
best_score_dtree = grid_search_dtree.best_score_

# Train the model with the best parameters on the full training set
dtree_optimized = DecisionTreeClassifier(**best_params_dtree, random_state=42)
dtree_optimized.fit(X_train, y_train)

# Predict on the test set
y_pred_dtree = dtree_optimized.predict(X_test)

# Evaluate the optimized model
accuracy_dtree = accuracy_score(y_test, y_pred_dtree)
classification_rep_dtree = classification_report(y_test, y_pred_dtree)

# Printing the best parameters found by Grid Search
print("Best Parameters from Grid Search (Decision Tree):", best_params_dtree)

# Printing the best score achieved on the validation set during Grid Search
print("Best Score from Grid Search on Validation Set (Decision Tree):", best_score_dtree)

# Printing the accuracy of the optimized model on the test set
print("Accuracy of Optimized Decision Tree Model on Test Set:", accuracy_dtree)

# Printing the detailed classification report for the test set predictions
print("Classification Report for Test Set (Decision Tree):")
print(classification_rep_dtree)

Best Parameters from Grid Search (Decision Tree): {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Score from Grid Search on Validation Set (Decision Tree): 0.7796931125588873
Accuracy of Optimized Decision Tree Model on Test Set: 0.7995013975976429
Classification Report for Test Set (Decision Tree):
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      9765
           1       0.67      0.46      0.55      3472

    accuracy                           0.80     13237
   macro avg       0.75      0.69      0.71     13237
weighted avg       0.79      0.80      0.79     13237



In [10]:
best_params = {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}  # Replace with your actual best parameters

# Splitting the dataset into features (X) and target variable (y)
X = coastal_data.drop(columns=['RainTomorrow'])
y = coastal_data['RainTomorrow'].astype('int')  # Ensure y is of integer type

# Splitting the data into a 70% training set and a 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the Logistic Regression model with the best parameters
logreg_optimized = DecisionTreeClassifier(**best_params, random_state=42)
logreg_optimized.fit(X_train, y_train)

# Predicting on the new test set
y_pred = logreg_optimized.predict(X_test)

# Evaluating the model on the new test set
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Printing the results
print("Accuracy of Optimized Model on New Test Set:", accuracy)
print("Classification Report for New Test Set:")
print(classification_rep)

Accuracy of Optimized Model on New Test Set: 0.7995013975976429
Classification Report for New Test Set:
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      9765
           1       0.67      0.46      0.55      3472

    accuracy                           0.80     13237
   macro avg       0.75      0.69      0.71     13237
weighted avg       0.79      0.80      0.79     13237



In [11]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Parameters for tuning
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with cross-validation for parameter tuning
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_val, y_val)

# Best parameters and best score
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

# Train the model with the best parameters on the full training set
rf_optimized = RandomForestClassifier(**best_params_rf, random_state=42)
rf_optimized.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_optimized.predict(X_test)

# Evaluate the optimized model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# Printing the best parameters found by Grid Search
print("Best Parameters from Grid Search (Decision Tree):", best_params_rf)

# Printing the best score achieved on the validation set during Grid Search
print("Best Score from Grid Search on Validation Set (Decision Tree):", best_score_rf)

# Printing the accuracy of the optimized model on the test set
print("Accuracy of Optimized Decision Tree Model on Test Set:", accuracy_rf)

# Printing the detailed classification report for the test set predictions
print("Classification Report for Test Set (Decision Tree):")
print(classification_rep_rf)


Best Parameters from Grid Search (Decision Tree): {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Score from Grid Search on Validation Set (Decision Tree): 0.8159555665166367
Accuracy of Optimized Decision Tree Model on Test Set: 0.8261690715418901
Classification Report for Test Set (Decision Tree):
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      9765
           1       0.75      0.50      0.60      3472

    accuracy                           0.83     13237
   macro avg       0.80      0.72      0.75     13237
weighted avg       0.82      0.83      0.81     13237



In [12]:
best_params = {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100} # Replace with your actual best parameters

# Splitting the dataset into features (X) and target variable (y)
X = coastal_data.drop(columns=['RainTomorrow'])
y = coastal_data['RainTomorrow'].astype('int')  # Ensure y is of integer type

# Splitting the data into a 70% training set and a 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the Logistic Regression model with the best parameters
logreg_optimized = RandomForestClassifier(**best_params, random_state=42)
logreg_optimized.fit(X_train, y_train)

# Predicting on the new test set
y_pred = logreg_optimized.predict(X_test)

# Evaluating the model on the new test set
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Printing the results
print("Accuracy of Optimized Model on New Test Set:", accuracy)
print("Classification Report for New Test Set:")
print(classification_rep)

Accuracy of Optimized Model on New Test Set: 0.8261690715418901
Classification Report for New Test Set:
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      9765
           1       0.75      0.50      0.60      3472

    accuracy                           0.83     13237
   macro avg       0.80      0.72      0.75     13237
weighted avg       0.82      0.83      0.81     13237



In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Assuming coastal_data is your preprocessed dataset
X = coastal_data.drop(['RainTomorrow'], axis=1)
y = coastal_data['RainTomorrow']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Building the revised model
model = Sequential()
model.add(Dense(128, input_shape=(X_train_scaled.shape[1],), activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for regularization
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))  # Another dropout layer
model.add(Dense(1, activation='sigmoid'))

# Compiling the model with a different optimizer and learning rate
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training the model with more epochs and early stopping
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, 
                    validation_data=(X_val_scaled, y_val), callbacks=[early_stopping])

# Evaluate the revised model
loss, accuracy = model.evaluate(X_test_scaled, y_test)

# Predictions on the test set
y_pred_probs = model.predict(X_test_scaled)
y_pred_classes = (y_pred_probs > 0.5).astype(int)

# Compute classification report
classification_rep = classification_report(y_test, y_pred_classes)

# Print the accuracy and the detailed classification report
print("Test Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


ModuleNotFoundError: No module named 'tensorflow'

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming coastal_data is your preprocessed dataset
X = coastal_data.drop(['RainTomorrow'], axis=1)
y = coastal_data['RainTomorrow']

# Split the data into training and test sets (70% training, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.astype(np.float32))
y_train_tensor = torch.tensor(y_train.values.astype(np.float32))
X_test_tensor = torch.tensor(X_test_scaled.astype(np.float32))
y_test_tensor = torch.tensor(y_test.values.astype(np.float32))

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model definition
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(X_train_scaled.shape[1], 128)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.sigmoid(self.fc3(x))
        return x

model = NeuralNet()

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs).squeeze()  # Adjusting the output shape
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
y_pred = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs).squeeze()  # Adjusting the output shape
        y_pred.extend(outputs.round().numpy())

# Convert predictions to a format suitable for classification report
y_pred = np.array(y_pred).flatten()

# Compute classification report
classification_rep = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(classification_rep)


Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88      4860
         1.0       0.75      0.46      0.57      1759

    accuracy                           0.82      6619
   macro avg       0.79      0.70      0.73      6619
weighted avg       0.81      0.82      0.80      6619

