In [1]:
#installing featuretools
!pip install featuretools



In [2]:
#importing our packages 
import os

import featuretools as ft
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [3]:
# Load the working DataFrame (presumably the output of an earlier EDA step, judging by the file name).
# The location can be overridden through the DF_FROM_EDA_CSV environment variable so the
# notebook also runs on other machines; when it is unset the original path is used.
csv_path = os.environ.get("DF_FROM_EDA_CSV", "C:/Users/moe/Desktop/df_from_EDA.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow
0,87.524795,75.655455,28.379506,0.0,69.617966,1026.030278,0
1,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0
2,80.94305,64.740043,14.184831,0.916884,77.364763,980.796739,1
3,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0
4,37.059963,34.766784,3.689661,1.361272,85.584,1031.790859,0


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple
len(df.index), len(df.columns)

(73100, 7)

In [5]:
# Largest observed precipitation value.
# Series.max() is vectorised and skips NaN, whereas the builtin max() iterates the
# column element by element and gives order-dependent results if a NaN is present.
df['Precipitation'].max()

3.078090164668743

In [6]:
# Feature 1: Humidity-Temperature interaction (element-wise product of the two columns)
df['Humidity_Temp_Interaction'] = df['Humidity'] * df['Temperature']

# Feature 2: split Precipitation at its median (q=2 quantile bins) into 'Low' / 'High'
df['Precipitation_Bin'] = pd.qcut(df['Precipitation'], q=2, labels=['Low', 'High'])

# Feature 3: bucket Pressure into four fixed-width categories.
# pd.cut intervals are right-closed, so without include_lowest=True a reading exactly on
# the first edge (950) would silently become NaN instead of 'Low'.
# Values outside [950, 1050] are still left as NaN by pd.cut.
df['Pressure_Category'] = pd.cut(
    df['Pressure'],
    bins=[950, 975, 1000, 1025, 1050],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)

df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow,Humidity_Temp_Interaction,Precipitation_Bin,Pressure_Category
0,87.524795,75.655455,28.379506,0.0,69.617966,1026.030278,0,6621.728203,Low,Very High
1,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0,2390.593142,High,Medium
2,80.94305,64.740043,14.184831,0.916884,77.364763,980.796739,1,5240.256547,High,Medium
3,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0,4665.468416,Low,Medium
4,37.059963,34.766784,3.689661,1.361272,85.584,1031.790859,0,1288.455717,High,Very High


In [12]:
# 1. Temperature Category
# include_lowest=True keeps a reading exactly on the first edge (0) in 'Cold';
# pd.cut's default right-closed intervals would otherwise turn it into NaN.
df['Temperature_Category'] = pd.cut(
    df['Temperature'],
    bins=[0, 60, 75, 85, 100],
    labels=['Cold', 'Mild', 'Warm', 'Hot'],
    include_lowest=True,
)

# 2. Wind Speed Category
# Same boundary handling: a wind speed of exactly 0 is labelled 'Calm' rather than NaN.
df['Wind_Speed_Category'] = pd.cut(
    df['Wind Speed'],
    bins=[0, 5, 15, 25, 35],
    labels=['Calm', 'Breezy', 'Windy', 'Very Windy'],
    include_lowest=True,
)

# 3. Precipitation Indicator: 1 when any precipitation was recorded, else 0
df['Rain_Indicator'] = (df['Precipitation'] > 0).astype(int)

# 4. High Humidity Indicator: 1 when humidity is above 70, else 0
df['High_Humidity_Indicator'] = (df['Humidity'] > 70).astype(int)

# Display the updated DataFrame with new features
print("Updated DataFrame with New Features:")
df.head()

Updated DataFrame with New Features:


Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow,Humidity_Temp_Interaction,Precipitation_Bin,Pressure_Category,Temperature_Category,Wind_Speed_Category,Rain_Indicator,High_Humidity_Indicator
0,87.524795,75.655455,28.379506,0.0,69.617966,1026.030278,0,6621.728203,Low,Very High,Hot,Very Windy,0,1
1,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0,2390.593142,High,Medium,Warm,Breezy,1,0
2,80.94305,64.740043,14.184831,0.916884,77.364763,980.796739,1,5240.256547,High,Medium,Warm,Breezy,1,0
3,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0,4665.468416,Low,Medium,Warm,Windy,1,0
4,37.059963,34.766784,3.689661,1.361272,85.584,1031.790859,0,1288.455717,High,Very High,Cold,Calm,1,0


In [None]:
#we won't transform or standardize our data, since the models we want to use are decision tree and random forest;
#these models don't require such steps, so we skip them and keep better interpretability.

In [None]:
#now that we have created some features, let's split our dependent and independent variables and make the training and testing
#data

In [13]:
# Separate the prediction target from the predictors
y = df['Rain Tomorrow']        # target vector: the column we want to predict
X = df.drop(columns=[y.name])  # feature matrix: every column except the target
X.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Humidity_Temp_Interaction,Precipitation_Bin,Pressure_Category,Temperature_Category,Wind_Speed_Category,Rain_Indicator,High_Humidity_Indicator
0,87.524795,75.655455,28.379506,0.0,69.617966,1026.030278,6621.728203,Low,Very High,Hot,Very Windy,0,1
1,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,2390.593142,High,Medium,Warm,Breezy,1,0
2,80.94305,64.740043,14.184831,0.916884,77.364763,980.796739,5240.256547,High,Medium,Warm,Breezy,1,0
3,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,4665.468416,Low,Medium,Warm,Windy,1,0
4,37.059963,34.766784,3.689661,1.361272,85.584,1031.790859,1288.455717,High,Very High,Cold,Calm,1,0


In [14]:
# Hold out 20% of the rows for testing; stratify=y is passed so the split is stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("Training set size:", *(part.shape for part in (X_train, y_train)))
print("Testing set size:", *(part.shape for part in (X_test, y_test)))

Training set size: (58480, 13) (58480,)
Testing set size: (14620, 13) (14620,)


In [16]:
# Persist the train/test split and the full feature frame with IPython's %store magic
# so they can be restored later (e.g. from another notebook) with `%store -r`.
%store X_train
%store y_train
%store X_test
%store y_test
%store df

Stored 'X_train' (DataFrame)
Stored 'y_train' (Series)
Stored 'X_test' (DataFrame)
Stored 'y_test' (Series)
Stored 'df' (DataFrame)
