In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


url = "https://raw.githubusercontent.com/Insights-Crafter/csvfiles/refs/heads/main/weatherAUS.csv"
df = pd.read_csv(url)
rain_df = pd.DataFrame(df)


rain_df.drop(['Date'], axis=1, inplace=True)


city_direction_map = {
    'Brisbane': 'East', 'Cairns': 'Northeast', 'GoldCoast': 'East', 'Townsville': 'Northeast',
    'CoffsHarbour': 'East', 'Newcastle': 'East', 'NorahHead': 'East', 'NorfolkIsland': 'East',
    'Sydney': 'East', 'SydneyAirport': 'East', 'Penrith': 'East', 'Richmond': 'East',
    'Wollongong': 'East', 'Williamtown': 'East', 'Albury': 'Southeast', 'BadgerysCreek': 'Southeast',
    'WaggaWagga': 'Southeast', 'Canberra': 'Southeast', 'Tuggeranong': 'Southeast',
    'MountGinini': 'Southeast', 'Ballarat': 'Southeast', 'Bendigo': 'Southeast',
    'Sale': 'Southeast', 'Melbourne': 'Southeast', 'MelbourneAirport': 'Southeast',
    'Watsonia': 'Southeast', 'Dartmoor': 'Southeast', 'MountGambier': 'South', 'Nhil': 'South',
    'Portland': 'South', 'Adelaide': 'South', 'Nuriootpa': 'South', 'Woomera': 'South',
    'Perth': 'West', 'PerthAirport': 'West', 'PearceRAAF': 'West', 'Albany': 'Southwest',
    'Walpole': 'Southwest', 'Witchcliffe': 'Southwest', 'SalmonGums': 'West',
    'Darwin': 'North', 'Katherine': 'North', 'AliceSprings': 'Central', 'Uluru': 'Central',
    'Cobar': 'West', 'Moree': 'Northwest'
}
rain_df['Region'] = rain_df['Location'].map(city_direction_map)


rain_df['RainTomorrow'] = rain_df['RainTomorrow'].apply(lambda x: 1 if str(x).strip().lower() == 'yes' else 0)
rain_df['RainToday'] = rain_df['RainToday'].apply(lambda x: 1 if str(x).strip().lower() == 'yes' else 0)

target = rain_df['RainTomorrow']
features = rain_df.drop(columns=['RainTomorrow'])


numerical_cols = features.select_dtypes(include='number')
categorical_cols = features.select_dtypes(include='object')


num_imputer = SimpleImputer(strategy='mean')
numerical_cols_imputed = pd.DataFrame(num_imputer.fit_transform(numerical_cols),
                                      columns=numerical_cols.columns,
                                      index=numerical_cols.index)

cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols_imputed = pd.DataFrame(cat_imputer.fit_transform(categorical_cols),
                                        columns=categorical_cols.columns,
                                        index=categorical_cols.index)


encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical_cols = encoder.fit_transform(categorical_cols_imputed)
encoded_categorical_cols = pd.DataFrame(encoded_categorical_cols,
                                        columns=encoder.get_feature_names_out(categorical_cols_imputed.columns),
                                        index=categorical_cols_imputed.index)


scaler = StandardScaler()
scaled_numerical_cols = pd.DataFrame(scaler.fit_transform(numerical_cols_imputed),
                                     columns=numerical_cols_imputed.columns,
                                     index=numerical_cols_imputed.index)

final_df = pd.concat([scaled_numerical_cols, encoded_categorical_cols], axis=1)


X_temp, X_test, y_temp, y_test = train_test_split(final_df, target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)


log_reg = LogisticRegression(max_iter=1000)

param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}

grid_cv = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_cv.fit(X_train, y_train)


print("Best Hyperparameters:", grid_cv.best_params_)
print("Best Cross-Validation Accuracy:", grid_cv.best_score_)


best_model = grid_cv.best_estimator_


val_preds = best_model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))


test_preds = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print("Test Confusion Matrix:")
print(confusion_matrix(y_test, test_preds))


Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Cross-Validation Accuracy: 0.8484233767543147
Validation Accuracy: 0.8464526330262615
Test Accuracy: 0.847827581465695
Test Confusion Matrix:
[[21500  1172]
 [ 3255  3165]]


In [None]:
categorical_cols.head()

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,Region
0,Albury,W,W,WNW,Southeast
1,Albury,WNW,NNW,WSW,Southeast
2,Albury,WSW,W,WSW,Southeast
3,Albury,NE,SE,E,Southeast
4,Albury,W,ENE,NW,Southeast


In [None]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Columns: 118 entries, MinTemp to Region_West
dtypes: float64(118)
memory usage: 131.0 MB
