In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Load the raw weather observations from the CSV file (path relative to the notebook).
df = pd.read_csv("weatherAUS.csv")


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape


In [None]:
# Keep the column index around and display it together with the column count.
column_names = df.columns
column_count = len(column_names)
(column_names, f"Number of columns is ==> {column_count}")

In [None]:
# Summary statistics of the columns that describe() includes by default.
df.describe()

## Drop RISK_MM Variable

In [None]:
# RISK_MM is already dropped from this version of the dataset, so nothing is executed here.
# For older versions that still contain it:
# df.drop(['RISK_MM'], axis=1, inplace=True)

In [None]:
# Column dtypes and non-null counts.
df.info()
# RainTomorrow is the target variable (output y) that we need to predict.

## Types of Variables
* Categorical variables have dtype `'O'` (object).
* Numerical variables have a numeric dtype (for example `float64`).

In [None]:
# Collect the names of all columns stored with the generic object dtype ('O').
Categorical = [name for name, kind in df.dtypes.items() if kind == 'O']

print(f"there are {len(Categorical)} Categorical variable")
print(f"the Categorical variables are {Categorical}")

In [None]:
# Preview only the categorical (object-dtype) columns.
df[Categorical].head()

## Summary of categorical variables
* There is a date variable, stored in the `Date` column.
* There are 6 other categorical variables: Location, WindGustDir, WindDir9am, WindDir3pm, RainToday and RainTomorrow.
* Two of them are binary categorical variables: RainToday and RainTomorrow.
* RainTomorrow is the target variable.

## Explore problems within categorical variables
First, I will explore the categorical variables.

### Missing values in categorical variables

In [None]:
# Check missing values in the categorical variables:
# for each categorical column, count how many entries are null.
df[Categorical].isnull().sum()

In [None]:
# Keep only the categorical columns that have at least one missing value,
# then print the missing-value count of each.

cat1 = [col for col in Categorical if df[col].isnull().any()]

print(df[cat1].isnull().sum())


We can see that only 4 categorical variables in the dataset contain missing values: WindGustDir, WindDir9am, WindDir3pm and RainToday.

### Frequency counts of categorical variables
Now, I will check the frequency counts of the categorical variables.

In [None]:
# Print the absolute frequency of every label, one categorical column at a time.

for var in Categorical:
    label_counts = df[var].value_counts()
    print(label_counts)

In [None]:
# Print, per categorical column, each label's count divided by the total
# number of rows in df (the denominator is len(df), not the non-null count).

total_rows = np.float64(len(df))
for var in Categorical:
    print(df[var].value_counts() / total_rows)

### Number of labels: cardinality
The number of labels within a categorical variable is known as its cardinality. A high number of labels within a variable is known as high cardinality. High cardinality may pose serious problems for a machine learning model, so I will check for it.

In [None]:
# Report the number of distinct values per categorical column.
# dropna=False keeps a missing value in the count, as len(unique()) does.

for var in Categorical:
    distinct_count = df[var].nunique(dropna=False)
    print(f"{var} contains {distinct_count}")

## Feature engineering of feature Date

In [None]:
# Inspect the dtype that the Date column currently has.
df['Date'].dtypes

We can see that the data type of the `Date` variable is object. I will parse it (currently coded as an object) into datetime format so that I can separate it into year, month and day.

In [None]:
# Parse the Date variable: convert it with pd.to_datetime so the .dt accessors can be used.
df["Date"] = pd.to_datetime(df["Date"])


In [None]:
# Extract the year from Date into its own column and preview it.
df["year"] = df["Date"].dt.year
df['year'].head()

In [None]:
# Extract the month from Date into its own column and preview it.
df["month"] = df["Date"].dt.month
df["month"].head()

In [None]:
# Extract the day of the month from Date into its own column and preview it.
df["day"] = df["Date"].dt.day
df["day"].head()

In [None]:
# Three columns (year, month, day) have been added to decompose the Date variable.
df.info()

In [None]:
# Remove the Date variable, since it has been replaced by year/month/day.
# Rebinding df (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after Date is already gone no longer raises KeyError.
df = df.drop(columns=["Date"], errors="ignore")

In [None]:
# The Date variable is no longer present in the frame.
df.head()

In [None]:
# The year/month/day columns are now visible in the frame.
df.tail()

In [None]:
# Re-check column dtypes and non-null counts after the Date decomposition.
df.info()

## Explore Location variable


In [None]:
# Report how many distinct values Location holds (a missing value would count as one).
location_label_count = df["Location"].nunique(dropna=False)
print(f"Location contains {location_label_count} lables")

In [None]:
# List the distinct labels of the Location variable.

df.Location.unique()

In [None]:
# Frequency of each label in the Location variable.
df.Location.value_counts()

## To convert a categorical variable into a numerical one we can use one of the following:
* Label Encoding
* One-Hot Encoding
* Dummy variables

In [None]:
# One-hot encode the Location variable and preview the result with head().
# drop_first=True returns k-1 dummy columns for k labels: the dropped first label
# is implied when all remaining dummies are 0, so no information is lost and the
# redundant column is avoided.
# NOTE(review): the original note stated 49 labels -> 48 dummies; confirm against
# the label count printed for Location.
# The encoded frame is only displayed here; it is not assigned, so df is unchanged.


pd.get_dummies(df.Location, drop_first=True).head()

In [None]:
# Review the dataset again.
df.head()

In [None]:
# Names of the columns that are still stored with the object dtype.
remaining_categorical = [name for name, kind in df.dtypes.items() if kind == 'O']
print(remaining_categorical)


## Explore WindGustDir variable


In [None]:
# Report how many distinct values WindGustDir holds (a missing value counts as one).
wind_gust_label_count = df["WindGustDir"].nunique(dropna=False)
print(f"WindGustDir contains {wind_gust_label_count} lables")

In [None]:
# List the distinct labels of the WindGustDir variable.
df.WindGustDir.unique()

In [None]:
# Frequency of each label in the WindGustDir variable.
df.WindGustDir.value_counts()

In [None]:
# One-hot encode WindGustDir:
#   drop_first=True -> k-1 dummy columns (the first label is dropped)
#   dummy_na=True   -> an extra indicator column for missing values
# The full encoded frame is kept in `dummies`; only a preview is displayed.

wind_gust_dir = df["WindGustDir"]
dummies = pd.get_dummies(wind_gust_dir, dummy_na=True, drop_first=True)
dummies.head()

In [None]:
# CASE 1: axis=0 (summing DOWN each column)
# Counts how many rows carry each remaining label (and NaN). This is similar to
# df.WindGustDir.value_counts(dropna=False), minus the label removed by drop_first.
col_sums = dummies.sum(axis=0)
print("--- sum(axis=0) : Column Counts ---")
print(col_sums)

# CASE 2: axis=1 (summing ACROSS each row)
# Each row has at most one dummy set. Because the dummies were built with
# drop_first=True, rows whose label is the dropped first category sum to 0;
# every other row (missing values included, via dummy_na=True) sums to 1.
row_sums = dummies.sum(axis=1)
print("\n--- sum(axis=1) : Row Totals ---")
print(row_sums)




The NaN column sums to 10326: these are missing values that still need to be handled.

In [None]:
# Print the names of the columns that still have the object dtype.
print(f"{[var for var in df.columns if df[var].dtype == 'O']}")

## Explore WindDir9am variable


In [None]:
# Print the number of distinct values in the WindDir9am variable
# (a missing value counts as one). The message now names the column that is
# actually inspected; it previously said "WindGustDir" by copy-paste mistake.
print(f"WindDir9am contains {len(df.WindDir9am.unique())} lables")

In [None]:
# List the distinct labels of the WindDir9am variable.
df.WindDir9am.unique()

In [None]:
# Frequency distribution of the labels in the WindDir9am feature.
df.WindDir9am.value_counts()

In [None]:
# One-hot encode the WindDir9am variable:
#   drop_first=True -> k-1 dummy columns (the first label is dropped)
#   dummy_na=True   -> an extra indicator column for missing values
# Keep the FULL encoded frame in WindDir9am_dummies so that later aggregations
# (e.g. column sums) cover every row. Previously .head() was applied before the
# assignment, so the variable held only the first 5 rows.


WindDir9am_dummies = pd.get_dummies(df.WindDir9am, drop_first=True, dummy_na=True)
# Preview only; the stored frame is not truncated.
WindDir9am_dummies.head()


In [None]:
# Column-wise count of set entries, comparable to value_counts() for the rows
# that WindDir9am_dummies holds.
WindDir9am_dummies.sum(axis=0) 

## Explore RainToday variable
RainToday is a binary categorical feature; the target (output label) is RainTomorrow.

In [None]:
# Print the number of distinct values in the RainToday variable
# (a missing value counts as one). The message now names the column that is
# actually inspected; it previously said "WindGustDir" by copy-paste mistake.
print(f"RainToday contains {len(df.RainToday.unique())} lables")

In [None]:
# List the distinct labels of the RainToday variable.
df.RainToday.unique()

In [None]:
# Frequency distribution of the labels in the RainToday feature.
df.RainToday.value_counts()

In [None]:
# For reference, encoding WITHOUT drop_first:
#   RainToday_dummies = pd.get_dummies(df.RainToday, dummy_na=True)
# produced the columns No / Yes / NaN (first rows as recorded earlier):
# 	No	Yes	NaN
# 0	True	False	False
# 1	True	False	False
# 2	True	False	False
# 3	True	False	False
# 4	True	False	False
# With drop_first=True the first of those columns is dropped from the result.
RainToday_dummies = pd.get_dummies(df.RainToday,drop_first=True, dummy_na=True)
RainToday_dummies.head()

In [None]:
# Column-wise count of set entries in the RainToday dummy frame.
RainToday_dummies.sum(axis=0)

## Exploring the Numerical Values

In [None]:
# Collect the names of all columns whose dtype is not the object dtype ('O').
numerical = [name for name, kind in df.dtypes.items() if kind != 'O']

print(f"Number of numerical features is {len(numerical)}")
print(f"Numerical features is {numerical}")

In [None]:
# Preview only the numerical columns.
df[numerical].head()

In [None]:
# Count the missing values in each numerical column.
df[numerical].isnull().sum()

In [None]:
# View summary statistics of the numerical variables,
# rounded to 2 decimals for a more readable table.
numerical_summary = df[numerical].describe()
numerical_summary.round(2)