<center> <h3> Weather Classification Task - Mahmoud Khaled </h3> </center>

#### Import Needed Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings #ignore warnings
warnings.filterwarnings('ignore') 
%matplotlib inline

### Data Investigation

In [None]:
# Load the raw dataset; the path is relative, so the CSV must sit in the working directory.
df = pd.read_csv("weatherAUS.csv")

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first five records
df.head(n=5)

In [None]:
# Print the per-column dtype / non-null summary of the frame.
df.info()

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Keep only the first occurrence of every fully duplicated row
df = df.drop_duplicates(keep='first')

In [None]:
# Parse the string-encoded 'Date' column into datetime values
df = df.assign(Date=pd.to_datetime(df['Date']))

In [None]:
# Derive the calendar month from 'Date' and store it with object dtype, so the
# dtype-based feature split treats it as categorical instead of numeric.
# Year and day-of-month are deliberately not extracted.
df['Month'] = df['Date'].dt.month.astype('object')

# 'Date' is no longer needed once the month has been derived
df = df.drop(columns=['Date'])


##### Categorical and Numerical Splitting

In [None]:
# Partition the column names by dtype: object columns are treated as categorical,
# int/float columns as numerical.
Categorical_Features = list(df.select_dtypes(include=['object']).columns)
Numerical_Features = list(df.select_dtypes(include=['int', 'float']).columns)

for label, names in (('Categorical_Features are: ', Categorical_Features),
                     ('Numerical_Features are: ', Numerical_Features)):
    print(label, names)

##### Check for Inconsistencies in All Categorical Features

In [None]:
# Print the frequency table of every categorical feature, one after another,
# so misspelled or inconsistent category labels are easy to spot.
separator = '--------------------------------------------------------------------------------'
for feature_name in Categorical_Features:
    print(df[feature_name].value_counts())
    print(separator)

##### Remove rows with a missing target variable

In [None]:
# Discard every row whose target label 'RainTomorrow' is missing
df = df.dropna(subset=['RainTomorrow'])

In [None]:
# Number of missing entries in each column
df.isna().sum()

##### Descriptive Statistics of all Numerical Features

In [None]:
# Summary statistics, one row per numerical feature, rounded to two decimals
df.describe().T.round(2)

#### Outliers Detection and Treatment

##### BoxPlots for Numerical Features to check Outliers

In [None]:
# Draw one horizontal boxplot per numerical feature on a shared grid of subplots.
sns.set_style('dark')

# Size the grid from the number of features instead of hard-coding it, so that
# adding or removing numerical columns can never run past the available axes.
cols = 3
rows = max(1, (len(Numerical_Features) + cols - 1) // cols)  # ceiling division, at least one row

# squeeze=False always yields a 2-D array of axes (even for a single row), and the
# figure height grows with the number of rows so the titles do not collide.
fig, axis = plt.subplots(rows, cols, sharey = True, figsize = (15, 2.5 * rows), squeeze = False)
fig.suptitle('Boxplots for all Numerical Features', size = 25)

# Flatten the axes array for easy pairing with the feature list
axis = axis.flatten()

# Pair each feature with its own axis; `x=` is passed by keyword because the meaning
# of the first positional argument of sns.boxplot differs between seaborn versions.
for ax, feature in zip(axis, Numerical_Features):
    sns.boxplot(x = df[feature], orient = 'h', color = 'lightblue', ax = ax)
    ax.set_title(feature)

# Remove the axes that were not used (works even when there are no features at all)
for ax in axis[len(Numerical_Features):]:
    fig.delaxes(ax)

# Adjust layout to avoid overlap
plt.tight_layout()

# Show the plot
plt.show()


##### Outliers Treatment for all Numerical Features by Winsorization 

In [None]:
# Cap outliers of every numerical feature.
# A value is an outlier when it lies outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# High outliers are replaced by the 90th percentile, low outliers by the 10th percentile.
# All statistics are computed once per feature on the untouched column and applied with
# boolean masks, instead of re-computing percentiles inside a Python loop over every value.
# NOTE(review): for heavily skewed features the 90th percentile can itself lie above the
# upper fence, so capped values may still be flagged by a later boxplot - confirm this is intended.
for Feature in Numerical_Features:
    values = df[Feature]

    Q1, Q3 = np.nanpercentile(values, [25, 75])
    IQR = Q3 - Q1
    lowerbound = Q1 - (1.5 * IQR)
    upperbound = Q3 + (1.5 * IQR)

    # Replacement values, taken from the original (unmodified) distribution
    low_cap, high_cap = np.nanpercentile(values, [10, 90])

    # Build both masks before writing anything; comparisons with NaN are False,
    # so missing values are left for the imputation step.
    too_high = values > upperbound
    too_low = values < lowerbound

    if too_high.any():
        df.loc[too_high, Feature] = high_cap
    if too_low.any():
        df.loc[too_low, Feature] = low_cap

#### Missing-Value Treatment: Mean for Numerical Features and Mode for Categorical Features

In [None]:
# You can loop through all numerical variables
for Feature in Numerical_Features:
    df[Feature].fillna(value = df[Feature].mean(), inplace = True)

In [None]:
# You can loop through all categorical variables
for Feature in Categorical_Features:
    df[Feature].fillna(value = df[Feature].mode()[0], inplace = True)

In [None]:
# Re-count the missing entries per column after imputation
df.isna().sum()

#### Data Splitting to X and y

In [None]:
# Separate the feature columns from the target column
X = df.drop(columns = ['RainTomorrow'])

# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# map() builds a new numeric Series directly instead of relying on replace() to
# downcast an object column; the explicit cast fails loudly if any label other
# than 'Yes'/'No' is present (it would have been mapped to NaN).
y = df['RainTomorrow'].map({'Yes': 1, 'No': 0}).astype('int64')
y

#### Data Transformation by ColumnTransformer

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder 

# Scale the numerical features to [0, 1] and one-hot encode the categorical ones,
# then rebuild a labelled DataFrame from the transformed matrix.
# NOTE(review): the transformers are fitted on the full X before the train/test split,
# so statistics of the test rows leak into the preprocessing - consider fitting on the
# training part only.

# Define categorical and numerical for only X Features
Categorical_variables = X.select_dtypes(include = ['object']).columns.tolist() # list the categorical features
Numerical_variables = X.select_dtypes(include = ['int', 'float']).columns.tolist() # list the numerical features

# Define the transformers
Numerical_transformer = MinMaxScaler()
Categorical_transformer = OneHotEncoder(drop = 'first', handle_unknown = 'ignore')

# Combine transformers using ColumnTransformer.
# sparse_threshold = 0 makes the output always a dense ndarray; with the default threshold
# the result is sparse only when its density is low enough, so an unconditional
# .toarray() call would fail on denser data.
preprocessor = ColumnTransformer(transformers = [('num', Numerical_transformer, Numerical_variables),
                                                 ('cat', Categorical_transformer, Categorical_variables)],
                                 sparse_threshold = 0)

# Fit and transform the data
Transformed_X = preprocessor.fit_transform(X)

# Column names: numerical names first, then the one-hot names, matching the transformer order
encoded_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(Categorical_variables)
feature_names = list(Numerical_variables) + list(encoded_feature_names)

# Keep X's row labels so Transformed_X stays aligned with y (rows were dropped earlier,
# so a fresh 0..n-1 index would not match y's index).
Transformed_X = pd.DataFrame(Transformed_X, columns = feature_names, index = X.index)

# Now 'Transformed_X' contains both normalized numerical features and one-hot encoded categorical features
Transformed_X

##### Data Splitting to Train and Test

In [None]:
from sklearn.model_selection import train_test_split            # split X and y into training and testing sets
# Split the scaled / one-hot encoded matrix rather than the raw X: the raw frame still
# contains string columns that an estimator cannot consume.
X_train, X_test, y_train, y_test = train_test_split(Transformed_X, y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, classification_report, roc_auc_score

# Hold out 20% of the transformed data for evaluation
X_train, X_test, y_train, y_test = train_test_split(Transformed_X, y, test_size = 0.2, random_state = 1)

# max_iter is raised above the default of 100: warnings are silenced globally in this
# notebook, so a solver that stops before converging would otherwise go unnoticed.
Model = LogisticRegression(random_state = 1, max_iter = 1000)
Model.fit(X_train, y_train)
y_pred = Model.predict(X_test)
y_score = Model.predict_proba(X_test)[:, 1]   # predicted probability of the positive class (1)

#Model Evaluation
print('Sensitivity_score = ', recall_score(y_test, y_pred))               # recall of the positive class
print('Specificity_score = ', recall_score(y_test, y_pred, pos_label=0))  # recall of the negative class
print('Accuracy_score = ',accuracy_score(y_test, y_pred))
print('Precision_score = ',precision_score(y_test, y_pred))
print('F_score = ',f1_score(y_test, y_pred))
print('ROC_AUC_score = ', roc_auc_score(y_test, y_score))                 # threshold-independent ranking quality
print('-------------------------------------------------------------------')
print('Classification Report')
print('-------------------------------------------------------------------')
print(classification_report(y_test, y_pred))