$$Logic Regression


In [1]:
!pip install pandas --quiet
!pip install scikit-learn --quiet
!pip install numpy --quiet
import pandas as pd
import numpy as np


In [2]:
raw_df = pd.read_csv('weatherAUS.csv')
raw_df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
# remove null values rows
raw_df.dropna(subset=['RainTomorrow','RainTomorrow'], inplace=True)
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142193 entries, 0 to 145458
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null   fl

In [5]:
try:  import plotly.express as px
except ModuleNotFoundError:
    %pip install plotly
    import plotly.express as px

px.histogram(raw_df, x='Location' , title="Location vs. RainyDays",color = 'RainToday')

In [6]:
px.histogram(raw_df,x='Temp3pm',title='Temperature at 3pm', color='RainTomorrow')

Set sample


In [7]:
use_sample = False 
sample_fraction = 0.1
if use_sample:
    raw_df = raw_df.sample(frac=sample_fraction.copy())

In [8]:
from sklearn.model_selection import train_test_split


In [9]:
train_val_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2


In [10]:
print(f"Train set: {train_df.shape}")
print(f"Validation set: {val_df.shape}")
print(f"Test set: {test_df.shape}")

Train set: (85315, 23)
Validation set: (28439, 23)
Test set: (28439, 23)


In [11]:
# to make the accuracy in predication use to the technique where we use data till 2017 as the training data and after 2015 as the testing data
year  = pd.to_datetime(raw_df.Date).dt.year
train_df = raw_df[year <= 2015]
val_df = raw_df[year == 2015]
test_df = raw_df[year > 2015]

#Identifying Input & target Columns


In [12]:
input_cols = list(train_df.columns)[1:-1] #ignore date col
target_col = 'RainTomorrow' #output col




In [13]:
numeric_cols = train_df[input_cols].select_dtypes(include=np.number).columns.tolist()  # Exclude target column if numeric
categorical_cols = train_df[input_cols].select_dtypes(include='object').columns.tolist()

In [14]:
train_inputs = train_df[numeric_cols].describe()
train_inputs

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,115742.0,115990.0,115077.0,70646.0,66358.0,108409.0,115004.0,114709.0,114733.0,114365.0,104885.0,104923.0,73862.0,72434.0,115405.0,115078.0
mean,12.03241,23.07737,2.337669,5.367306,7.619633,40.131853,14.06112,18.721983,68.579118,51.350964,1017.750217,1015.358392,4.334001,4.427658,16.868786,21.579581
std,6.375807,7.065463,8.483712,4.048918,3.781878,13.607206,8.948117,8.857876,19.130313,20.801623,7.10381,7.036906,2.876184,2.705274,6.463913,6.898924
min,-8.5,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,979.0,0.0,0.0,-6.2,-5.1
25%,7.5,17.9,0.0,2.6,4.9,31.0,7.0,13.0,57.0,37.0,1013.0,1010.6,1.0,2.0,12.2,16.6
50%,11.8,22.5,0.0,4.6,8.5,39.0,13.0,19.0,70.0,52.0,1017.7,1015.3,5.0,5.0,16.6,21.0
75%,16.7,28.0,0.6,7.2,10.6,48.0,19.0,24.0,83.0,65.0,1022.5,1020.1,7.0,7.0,21.4,26.3
max,33.9,48.1,371.0,82.4,14.5,135.0,87.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.1


In [15]:
train_df[categorical_cols].nunique()

Location       49
WindGustDir    16
WindDir9am     16
WindDir3pm     16
RainToday       2
dtype: int64

$Inputing 
The process of filling missing values is called imputation 

In [16]:
# fill average value
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

before imputation lets check the no. of missing values in each numeric column

In [17]:
raw_df[numeric_cols].isna().sum()

MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustSpeed     9270
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
dtype: int64

In [18]:
train_df[numeric_cols] = imputer.fit_transform(train_df[numeric_cols])
val_df[numeric_cols] = imputer.transform(val_df[numeric_cols])
test_df[numeric_cols] = imputer.transform(test_df[numeric_cols]) 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



$$Scaler


In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(raw_df[numeric_cols])

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [20]:
scaler.fit(raw_df[numeric_cols])

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [21]:
train_df[numeric_cols] = scaler.transform(train_df[numeric_cols])
val_df[numeric_cols] = scaler.transform(val_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



$$Encoding

In [22]:
raw_df[categorical_cols].nunique()

Location       49
WindGustDir    16
WindDir9am     16
WindDir3pm     16
RainToday       2
dtype: int64

In [23]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(raw_df[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
train_df[encoded_cols] = encoder.transform(train_df[categorical_cols])
val_df[encoded_cols] = encoder.transform(val_df[categorical_cols])
test_df[encoded_cols] = encoder.transform(test_df[categorical_cols])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

$$Pyarrow_Format

In [24]:
!pip install pyarrow --quiet

In [25]:

train_df.to_parquet('train_inputs.parquet')
val_df.to_parquet('val_inputs.parquet')
test_df.to_parquet('test_inputs.parquet')

In [26]:
pd.DataFrame(train_df).to_parquet('train_inputs.parquet')
pd.DataFrame(val_df).to_parquet('val_inputs.parquet')   
pd.DataFrame(test_df).to_parquet('test_inputs.parquet') 