In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
%matplotlib inline
import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings, which can mask real problems in the cells that follow — consider
# filtering only specific categories instead.
warnings.filterwarnings('ignore')


### Load Data

In [None]:
# Read the daily weather CSV into `df`, report its dimensions and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where that exact file exists — consider a path relative to a configurable
# data directory.
file_path = r'C:\Users\krish\Downloads\daily_weather (1).csv'
df = pd.read_csv(file_path)
# Single braces so the f-string interpolates the shape; doubled braces are an
# escape and printed the literal text "{df.shape}".
print(f'Shape: {df.shape}')
df.head()


### Handle Missing Values

In [None]:
# Impute missing 9am readings column by column: the first group is filled with
# the column mean, the second with the column median.
#
# The result is assigned back to df[col] rather than calling
# fillna(..., inplace=True) on the df[col] selection. The in-place form acts on
# an intermediate Series; it is deprecated in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is active.
mean_imputed_cols = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'max_wind_direction_9am',
]
median_imputed_cols = [
    'rain_accumulation_9am',
    'rain_duration_9am',
    'avg_wind_speed_9am',
    'max_wind_speed_9am',
]

for col in mean_imputed_cols:
    df[col] = df[col].fillna(df[col].mean())

for col in median_imputed_cols:
    df[col] = df[col].fillna(df[col].median())


### Handle Outliers

In [None]:
def _iqr_bounds(series, factor=1.5):
    """Return the (lower, upper) IQR fences of `series`.

    The fences are Q1 - factor * IQR and Q3 + factor * IQR, where Q1 and Q3 are
    the 0.25 and 0.75 quantiles of the series and IQR = Q3 - Q1.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - factor * iqr, q3 + factor * iqr


# Remove Rows Outliers: rows outside the fences are dropped, one column after
# the other, so the second column's fences are computed on the already
# filtered frame.
row_filter_cols = ['rain_accumulation_9am', 'rain_duration_9am']
for col in row_filter_cols:
    lower, upper = _iqr_bounds(df[col])
    keep = (df[col] >= lower) & (df[col] <= upper)
    # .copy() gives an independent frame, so the column assignments below do not
    # write into a slice of the previous DataFrame (SettingWithCopy).
    df = df[keep].copy()

# Cap Outliers: values outside the fences are clipped to the fence instead of
# removing the row. Each column's fences depend only on that column.
capped_cols = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_speed_9am',
    'max_wind_speed_9am',
    'relative_humidity_9am',
]
for col in capped_cols:
    lower, upper = _iqr_bounds(df[col])
    df[col] = df[col].clip(lower=lower, upper=upper)


### Univariate Analysis

In [None]:
# For every numeric column draw one figure with two panels:
# left a histogram with a KDE overlay, right a box plot of the same values.
numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols:
    fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

    sns.histplot(df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Dist of {col}')

    sns.boxplot(x=df[col], ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    plt.show()


### Bivariate Analysis

In [None]:
numeric_cols = df.select_dtypes(include=np.number).columns

# Correlation Heatmap: pairwise Pearson correlation of all numeric columns,
# annotated with the coefficient rounded to two decimals.
plt.figure(figsize=(16, 14))
sns.heatmap(df[numeric_cols].corr(method='pearson'), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Pairplot: scatter matrix of all numeric columns, without a hue grouping.
# The previous hue argument was a constant expression that always evaluated to
# None, so dropping it does not change the plot.
sns.pairplot(df[numeric_cols])
plt.show()


### Handle Multicollinearity

In [None]:
# Remove max_wind_speed_9am from the feature set.
# NOTE(review): presumably dropped because it is strongly correlated with
# avg_wind_speed_9am — confirm against the correlation heatmap.
# Reassigning (instead of inplace=True) avoids mutating a frame that may be a
# slice of an earlier one, and errors='ignore' lets the cell be re-run after the
# column is already gone instead of raising KeyError.
df = df.drop(columns=['max_wind_speed_9am'], errors='ignore')


### Feature Scaling

In [None]:
# Columns rescaled with StandardScaler (fit and applied on the full frame).
standard_scaled_cols = [
    'number',
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'max_wind_direction_9am',
    'relative_humidity_3pm',
]
scaler = StandardScaler()
df[standard_scaled_cols] = scaler.fit_transform(df[standard_scaled_cols])

# Columns rescaled with MinMaxScaler. `scaler` is rebound here, so the fitted
# StandardScaler above is no longer reachable after this cell.
minmax_scaled_cols = ['relative_humidity_9am', 'avg_wind_speed_9am']
scaler = MinMaxScaler()
df[minmax_scaled_cols] = scaler.fit_transform(df[minmax_scaled_cols])
