<a href="https://colab.research.google.com/github/NandakrishnanR/Kaggle_projects/blob/master/DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jsphyg/weather-dataset-rattle-package")

print("Path to dataset files:", path)


Using Colab cache for faster access to the 'weather-dataset-rattle-package' dataset.
Path to dataset files: /kaggle/input/weather-dataset-rattle-package


In [67]:
import pandas as pd
raw_df=pd.read_csv(f"/root/.cache/kagglehub/datasets/jsphyg/weather-dataset-rattle-package/versions/2/weatherAUS.csv")

In [68]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
import os
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [69]:
print(raw_df.isnull().sum())

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64


In [70]:
#Given that 'RainTomorrow' is almost certainly your target variable for a prediction task, it's generally best to remove the rows where 'RainTomorrow' is null. Imputing a categorical target variable can lead to your model learning from artificially generated labels, making your model's evaluation and predictions unreliable. You have 3267 missing values for 'RainTomorrow' out of 145,460 entries, which is about 2.2%. Removing these rows is a reasonable approach to ensure data quality for your target variable.
raw_df.dropna(subset = ['RainTomorrow'], inplace = True)


## Preparing the Data for Training

steps to prepare the dataset for training:

1. Create a train/test/validation split
2. Identify input and target columns
3. Identify numeric and categorical columns
4. Impute (fill) missing numeric values
5. Scale numeric values to the $(0, 1)$ range
6. Encode categorical columns to one-hot vectors

In [71]:
year = pd.to_datetime(raw_df['Date']).dt.year
raw_df['year'] = year
from plotly import express as px
px.histogram(raw_df, x='year', color='RainToday', title='Rain Today by Year')

In [72]:
train_df=raw_df[raw_df['year']<2016]
val_df=raw_df[raw_df['year'] == 2016]
test_df=raw_df[raw_df['year'] > 2016]

In [73]:
input_col=list(raw_df.columns)
input_col.remove('RainTomorrow')
input_col.remove('Date')
input_col.remove('year')
target_col='RainTomorrow'
train_input=train_df[input_col].copy()
train_target=train_df[target_col].copy()
val_input=val_df[input_col].copy()
val_target=val_df[target_col].copy()
test_input=test_df[input_col].copy()
test_target=test_df[target_col].copy()

In [74]:
#ony training set needed fitting,others can transfer them
numerical_cols=train_input.select_dtypes(include=['int64', 'float64'])
categorical_cols=train_input.select_dtypes(include=['object'])
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy='mean')
train_input[numerical_cols.columns]=imputer.fit_transform(train_input[numerical_cols.columns])
val_input[numerical_cols.columns]=imputer.transform(val_input[numerical_cols.columns])
test_input[numerical_cols.columns]=imputer.transform(test_input[numerical_cols.columns])

In [75]:
train_input.isna().sum()

Unnamed: 0,0
Location,0
MinTemp,0
MaxTemp,0
Rainfall,0
Evaporation,0
Sunshine,0
WindGustDir,7870
WindGustSpeed,0
WindDir9am,8437
WindDir3pm,2550


In [76]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
train_input[numerical_cols.columns]=scaler.fit_transform(train_input[numerical_cols.columns])
val_input[numerical_cols.columns]=scaler.transform(val_input[numerical_cols.columns])
test_input[numerical_cols.columns]=scaler.transform(test_input[numerical_cols.columns])

In [81]:
from sklearn.preprocessing import OneHotEncoder

# Re-create input dataframes to ensure they contain all original columns (numerical and categorical)
# This step is crucial to ensure categorical columns are present for encoding.
input_col = list(raw_df.columns)
input_col.remove('RainTomorrow')
input_col.remove('Date')
input_col.remove('year')

train_input = train_df[input_col].copy()
val_input = val_df[input_col].copy()
test_input = test_df[input_col].copy()

# Identify numerical and categorical column names from the current state of train_input
numerical_cols_names = train_input.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_names = train_input.select_dtypes(include=['object']).columns

# Re-apply numerical imputation and scaling if these operations were part of the previous steps
# and you want to ensure the state of the numerical columns is consistent.
from sklearn.impute import SimpleImputer
numerical_imputer=SimpleImputer(strategy='mean')
numerical_imputer.fit(train_input[numerical_cols_names])

train_input.loc[:, numerical_cols_names] = numerical_imputer.transform(train_input[numerical_cols_names])
val_input.loc[:, numerical_cols_names] = numerical_imputer.transform(val_input[numerical_cols_names])
test_input.loc[:, numerical_cols_names] = numerical_imputer.transform(test_input[numerical_cols_names])

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(train_input[numerical_cols_names])

train_input.loc[:, numerical_cols_names] = scaler.transform(train_input[numerical_cols_names])
val_input.loc[:, numerical_cols_names] = scaler.transform(val_input[numerical_cols_names])
test_input.loc[:, numerical_cols_names] = scaler.transform(test_input[numerical_cols_names])


# One-Hot Encoding Process (Corrected and Robust Implementation)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the categorical columns of the training input data
encoder.fit(train_input[categorical_cols_names])

# Transform categorical columns in train, validation, and test sets
train_categorical_encoded = encoder.transform(train_input[categorical_cols_names])
val_categorical_encoded = encoder.transform(val_input[categorical_cols_names])
test_categorical_encoded = encoder.transform(test_input[categorical_cols_names])

# Get new column names from the encoder (ensuring robust type handling by casting to list)
encoded_feature_names = encoder.get_feature_names_out(list(categorical_cols_names))

# Create DataFrames from the encoded arrays
train_encoded_df = pd.DataFrame(train_categorical_encoded, columns=encoded_feature_names, index=train_input.index)
val_encoded_df = pd.DataFrame(val_categorical_encoded, columns=encoded_feature_names, index=val_input.index)
test_encoded_df = pd.DataFrame(test_categorical_encoded, columns=encoded_feature_names, index=test_input.index)

# Drop original categorical columns from input dataframes
train_input = train_input.drop(columns=categorical_cols_names)
val_input = val_input.drop(columns=categorical_cols_names)
test_input = test_input.drop(columns=categorical_cols_names)

# Concatenate numerical and one-hot encoded categorical columns
train_input = pd.concat([train_input, train_encoded_df], axis=1)
val_input = pd.concat([val_input, val_encoded_df], axis=1)
test_input = pd.concat([test_input, test_encoded_df], axis=1)

ValueError: input_features is not equal to feature_names_in_