In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
import warnings
import numpy as np

In [None]:
# Silence every warning for the rest of the session, then load the raw
# weather observations from the CSV next to the notebook.
warnings.simplefilter("ignore")
data = pd.read_csv(filepath_or_buffer="weatherAUS.csv")

In [None]:
# Quick look at the raw frame.
# Only the last expression of a cell is echoed, so the shape is printed
# explicitly; `info()` prints by itself and `head()` is the cell's output.
print(data.shape)
data.info()
data.head()

In [None]:
# Encode the two Yes/No rain flags as 1/0; missing values stay NaN.
# The result is assigned back to the column: calling
# `replace(..., inplace=True)` on `data[col]` operates on a temporary object
# under pandas copy-on-write and would leave `data` unchanged.
# `pd.to_numeric` pins a numeric dtype regardless of how the installed pandas
# version handles downcasting in `replace`, and raises if any value other
# than "Yes"/"No"/NaN is present.  Re-running the cell is a no-op.
yes_no_codes = {"No": 0, "Yes": 1}
for flag_col in ["RainToday", "RainTomorrow"]:
    data[flag_col] = pd.to_numeric(data[flag_col].replace(yes_no_codes))

In [None]:
# Class balance of the target in the raw data.
# value_counts() orders bars by frequency, so the counts are sorted by label
# to guarantee the bars match the fixed tick labels (0 -> "No", 1 -> "Yes").
fig, ax = plt.subplots(figsize=(10, 6))
data.RainTomorrow.value_counts().sort_index().plot(
    kind="bar", color=["blue", "orange"], ax=ax
)
ax.set_title("Rain Tomorrow")
ax.set_xlabel("Rain Tomorrow")
ax.set_ylabel("Count")
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"], rotation=0)
plt.show()



In [None]:
# Balance the target by drawing rows of the "rain tomorrow" class with
# replacement until there are as many of them as "no rain" rows.
# Rows whose RainTomorrow is neither 0 nor 1 (e.g. NaN) end up in neither
# subset and are therefore absent from `over_sampled`.
rain_no = data.loc[data["RainTomorrow"].eq(0)]
rain_yes = data.loc[data["RainTomorrow"].eq(1)]
yes_oversampled = resample(
    rain_yes,
    n_samples=rain_no.shape[0],  # match the size of the other class
    replace=True,                # draw with replacement
    random_state=123,            # fixed seed for reproducibility
)
over_sampled = pd.concat([rain_no, yes_oversampled])

In [None]:
# Class balance of the target after oversampling.
# Both classes now have the same count, so the frequency ordering of
# value_counts() is a tie; sorting by label guarantees the bars match the
# fixed tick labels (0 -> "No", 1 -> "Yes").
fig, ax = plt.subplots(figsize=(10, 6))
over_sampled.RainTomorrow.value_counts().sort_index().plot(
    kind="bar", color=["blue", "orange"], ax=ax
)
ax.set_title("Rain Tomorrow (Oversampled)")
ax.set_xlabel("Rain Tomorrow")
ax.set_ylabel("Count")
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"], rotation=0)
plt.show()

In [None]:
### Heatmap of missing cells in the raw frame (one row per observation,
### one column per variable; highlighted cells are NaN)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data.isnull(), cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")

In [None]:
# Missing-value summary for the oversampled frame: absolute count and share
# of rows that are NaN per column, largest first (top 20 shown).
null_mask = over_sampled.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing = pd.concat(
    [total, percent.map("{:.3f}".format)],
    axis=1,
    keys=["Total", "Percent"],
)
missing.head(20)

In [None]:
# Fill missing values in the two binary rain flags with the most frequent
# value of the *same* column (each flag uses its own mode).
# The filled Series is assigned back rather than using
# `fillna(..., inplace=True)` on a column selection, which acts on a
# temporary under pandas copy-on-write and would leave the frame unchanged.
for flag_col in ["RainToday", "RainTomorrow"]:
    over_sampled[flag_col] = over_sampled[flag_col].fillna(over_sampled[flag_col].mode()[0])

In [None]:
## Impute the categorical columns with their mode (most frequent value)
mode_filled_cols = ["Date", "WindGustDir", "Location", "WindDir9am", "WindDir3pm"]
for col in mode_filled_cols:
    most_frequent = over_sampled[col].mode()[0]
    over_sampled[col] = over_sampled[col].fillna(most_frequent)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column. One fitted encoder is kept per
# column in `lencoder` so the codes can be mapped back to labels later.
cat_col = list(over_sampled.select_dtypes(include=["object"]).columns)
lencoder = {}
for col in cat_col:
    encoder = LabelEncoder()
    over_sampled[col] = encoder.fit_transform(over_sampled[col])
    lencoder[col] = encoder
# Remaining NaN count per column. Only the rain flags and the categorical
# columns have been filled so far, so other columns can still show
# non-zero counts here.
over_sampled.isna().sum()

In [None]:
# Side-effect import: it must run before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fill the remaining NaNs with iterative (MICE-style) imputation.
# NOTE(review): the imputer is fitted on every column of `over_sampled`,
# including the target "RainTomorrow" — confirm this is intended.
Mice_imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_values = Mice_imputer.fit_transform(over_sampled)

# Write the imputed matrix into a copy so `over_sampled` stays untouched and
# the original index and column labels are kept.
MiceImputes = over_sampled.copy(deep=True)
MiceImputes.iloc[:, :] = imputed_values

In [None]:
# Post-imputation sanity check: remaining missing values per column
# (expected to be 0 everywhere) shown next to each column's dtype.
pd.DataFrame({"missing": MiceImputes.isna().sum(), "dtype": MiceImputes.dtypes})

In [None]:
## Outlier treatment: drop rows that fall outside the 1.5 * IQR fences
# The IQR rule is only meaningful for continuous measurements, so the target,
# the binary RainToday flag and the label-encoded categoricals (`cat_col`,
# built in the encoding cell) are excluded.  For a binary column whose
# quartiles coincide the IQR is 0, and every row of the rarer class would be
# dropped as an "outlier".
non_continuous = set(cat_col) | {"RainToday", "RainTomorrow"}
continuous_cols = [c for c in MiceImputes.columns if c not in non_continuous]

continuous = MiceImputes[continuous_cols]
Q1 = continuous.quantile(0.25)
Q3 = continuous.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is removed when at least one continuous feature lies outside its fences.
is_outlier_row = (continuous.lt(lower_fence) | continuous.gt(upper_fence)).any(axis=1)
MiceImputes = MiceImputes[~is_outlier_row]

In [None]:
# (rows, columns) of MiceImputes at this point in the notebook.
MiceImputes.shape

In [None]:
# Correlation heatmaps of the imputed, outlier-filtered data.
# The matrix is computed once and reused for both figures.
corr = MiceImputes.corr()

# Figure 1: full annotated correlation matrix.
plt.figure(figsize=(20, 20))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Figure 2: lower triangle only (the upper triangle and diagonal are masked
# because the matrix is symmetric), with a diverging palette centred on 0.
# The axes are created by plt.subplots alone and passed to seaborn, so no
# empty extra figure is produced.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(20, 20))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True, fmt=".2f",
            square=True, linewidths=.5, cbar_kws={"shrink": .9}, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# sns.pairplot(MiceImputes, hue="RainTomorrow", diag_kind="kde", palette="husl")

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns only (zero mean, unit variance).
# The target "RainTomorrow" is left untouched: standardizing it would turn
# the 0/1 class labels into arbitrary floats that no longer read as classes.
feature_cols = MiceImputes.columns.drop("RainTomorrow")
r_scaler = StandardScaler()

# Same index and column order as MiceImputes; the copy keeps MiceImputes
# itself unchanged.
modified_data = MiceImputes.copy()
modified_data[feature_cols] = r_scaler.fit_transform(MiceImputes[feature_cols])
modified_data.head()

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Split into feature matrix and target (y stays a one-column DataFrame).
x = modified_data.loc[:, modified_data.columns != "RainTomorrow"]
y = modified_data.loc[:, modified_data.columns == "RainTomorrow"]

# chi2 only accepts non-negative feature values, and standardized features
# are centred on 0, so the scores are computed on a [0, 1]-rescaled copy.
x_nonneg = pd.DataFrame(MinMaxScaler().fit_transform(x), index=x.index, columns=x.columns)

# chi2 needs 1-D discrete class labels: recode each distinct target value to
# an integer code (0, 1, ...), whatever scale the target column is on.
_, class_codes = np.unique(y.to_numpy().ravel(), return_inverse=True)

# Rank features on the rescaled copy, then pick the 10 winning columns from
# the original (standardized) feature matrix.
selector = SelectKBest(chi2, k=10)
selector.fit(x_nonneg, class_codes)
x_new = selector.transform(x)