In [27]:
import pandas as pd
import numpy as np

In [None]:
# Install UC Irvine Machine Learning Repository
!pip install ucimlrepo

In [None]:
# Import the fetch helper from ucimlrepo
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 849
tetouan = fetch_ucirepo(id=849)

# Data (as pandas dataframes).
# X is copied so that columns added to it do not modify the frame
# still held by `tetouan.data.features`.
X = tetouan.data.features.copy()
y = tetouan.data.targets

# Metadata
print(tetouan.metadata)

# Variable information
print(tetouan.variables)


In [None]:
# Preview the first rows of the fetched feature table
tetouan.data.features.head()

In [None]:
# Preview the first rows of the target table
y.head()

In [None]:
# Check for duplicates and nulls

In [None]:
# Number of feature rows that repeat an earlier row
X.duplicated().sum()

In [None]:
# Number of target rows that repeat an earlier row
y.duplicated().sum()

In [None]:
# Null count per feature column
X.isnull().sum()

In [None]:
# Null count per target column
y.isnull().sum()

In [None]:
def explore_df(df, max_unique=20):
    """
    Print an exploratory summary of a dataframe: its dimensions, dtypes,
    per-column null counts, value counts of its categorical columns, and
    the output of ``describe(include='all')``.

    A column is treated as categorical when it has object dtype, or when it
    is numeric with fewer than ``max_unique`` distinct values.

    Arguments:
        df (pd.DataFrame) - Dataframe to explore
        max_unique (int) - Numeric columns with fewer distinct values than
            this are reported as categorical (default 20)
    Returns: None
    """
    sep_lines = '\n' + '-'*50 + '\n'
    end_lines = '\n' + '='*50 + '\n'
    print("Dataframe shape: ")
    print(f"{df.shape[0]} rows X {df.shape[1]} columns", end=end_lines)

    print("Dataframe data types")
    print(df.dtypes, end=end_lines)

    print("Null Count:")
    null_mask = df.isnull()  # computed once, reused for count and fraction
    null_df = pd.concat([null_mask.sum(), null_mask.mean()], axis=1)
    null_df.columns = ['count', 'normalize_count']
    print(null_df, end=end_lines)

    print(f"{df.columns}", end=end_lines)
    print("Value counts for each  categorical column:")
    numeric_df = df.select_dtypes("number")
    cat_from_num = numeric_df.loc[:, numeric_df.nunique() < max_unique]
    df_categorical = pd.concat([df.select_dtypes("object"), cat_from_num], axis=1)
    # Only the categorical subset is counted; continuous columns would
    # otherwise produce one line per distinct value.
    for _, column in df_categorical.items():
        print(column.value_counts(dropna=False), end=sep_lines)

    print(end_lines.strip('\n'))
    print(df.describe(include='all'))

In [None]:
# Print the exploratory summary for the feature table
explore_df(X)

In [None]:
# Print the exploratory summary for the target table
explore_df(y)

In [None]:
# Run the 'DateTime' column through pd.to_datetime once and reuse the result
parsed_timestamps = pd.to_datetime(X['DateTime'])
X.loc[:, 'DateTime_cleaned'] = parsed_timestamps
# Weekday name of each timestamp, stored as a new 'DayOfWeek' column
X.loc[:, 'DayOfWeek'] = parsed_timestamps.dt.day_name()
def categorize_time_of_day(hour):
    """
    Map an hour value to a named part of the day.

    Half-open bands: [5, 12) -> 'Morning', [12, 17) -> 'Afternoon',
    [17, 21) -> 'Evening'. Any value outside those bands yields 'Night'.
    """
    day_bands = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, stop, label in day_bands:
        if start <= hour < stop:
            return label
    return 'Night'

# Bucket the hour of each timestamp into the 'TimeOfDay' column
hour_of_day = X['DateTime_cleaned'].dt.hour
X.loc[:, 'TimeOfDay'] = hour_of_day.apply(categorize_time_of_day)

In [None]:
# Join features and targets side by side (column-wise) into a single frame
tetouan_df = pd.concat([X,y],axis=1)

In [None]:
# One-hot encode the derived categorical columns
tetouan_df_cleaned = pd.get_dummies(tetouan_df, columns=['DayOfWeek','TimeOfDay'])

# Columns to keep, ordered so the three power-consumption columns come last.
# Selecting by label below raises KeyError if any of them is missing.
expected_columns = [
    'DateTime_cleaned', 'Temperature', 'Humidity', 'Wind Speed',
    'DayOfWeek_Monday', 'DayOfWeek_Tuesday', 'DayOfWeek_Wednesday',
    'DayOfWeek_Thursday', 'DayOfWeek_Friday', 'DayOfWeek_Saturday',
    'DayOfWeek_Sunday', 'TimeOfDay_Morning', 'TimeOfDay_Afternoon',
    'TimeOfDay_Evening', 'TimeOfDay_Night',
    'Zone 1 Power Consumption', 'Zone 2  Power Consumption', 'Zone 3  Power Consumption'
]

# Select only the expected columns. The explicit .copy() makes the result an
# independent frame, so assigning to its columns afterwards does not raise
# SettingWithCopyWarning.
tetouan_df_cleaned = tetouan_df_cleaned[expected_columns].copy()

# Display the first few rows of the cleaned DataFrame
tetouan_df_cleaned.head()


In [None]:
# List the column labels of the cleaned frame
tetouan_df_cleaned.columns

In [31]:
# Shorter, space-free column names. Labels not present in the frame are
# ignored by rename, so re-running this cell is harmless.
column_aliases = {
    'DateTime_cleaned': 'DateTime',
    'Temperature': 'Temp',
    'Wind Speed': 'Wind_Speed',
    'DayOfWeek_Monday': 'Monday',
    'DayOfWeek_Tuesday': 'Tuesday',
    'DayOfWeek_Wednesday': 'Wednesday',
    'DayOfWeek_Thursday': 'Thursday',
    'DayOfWeek_Friday': 'Friday',
    'DayOfWeek_Saturday': 'Saturday',
    'DayOfWeek_Sunday': 'Sunday',
    'TimeOfDay_Morning': 'Morning',
    'TimeOfDay_Afternoon': 'Afternoon',
    'TimeOfDay_Evening': 'Evening',
    'TimeOfDay_Night': 'Night',
    'Zone 1 Power Consumption': 'Zone_1_PC',
    'Zone 2  Power Consumption': 'Zone_2_PC',
    'Zone 3  Power Consumption': 'Zone_3_PC',
}

# Rebind to the renamed frame instead of mutating with inplace=True
tetouan_df_cleaned = tetouan_df_cleaned.rename(columns=column_aliases)

In [34]:
# Converting the one-hot indicator columns from boolean to int
columns_to_convert = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Morning', 'Afternoon', 'Evening', 'Night']
# A single astype call with a per-column mapping replaces the column-by-column loop
tetouan_df_cleaned = tetouan_df_cleaned.astype({column: int for column in columns_to_convert})

# Temperature range (minimum, maximum)
print(tetouan_df_cleaned['Temp'].min(), tetouan_df_cleaned['Temp'].max())

Unnamed: 0,DateTime,Temp,Humidity,Wind_Speed,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Morning,Afternoon,Evening,Night,Zone_1_PC,Zone_2_PC,Zone_3_PC
0,2017-01-01 00:00:00,6.559,73.8,0.083,0,0,0,0,0,0,1,0,0,0,1,34055.6962,16128.87538,20240.96386
1,2017-01-01 00:10:00,6.414,74.5,0.083,0,0,0,0,0,0,1,0,0,0,1,29814.68354,19375.07599,20131.08434
2,2017-01-01 00:20:00,6.313,74.5,0.08,0,0,0,0,0,0,1,0,0,0,1,29128.10127,19006.68693,19668.43373
3,2017-01-01 00:30:00,6.121,75.0,0.083,0,0,0,0,0,0,1,0,0,0,1,28228.86076,18361.09422,18899.27711
4,2017-01-01 00:40:00,5.921,75.7,0.081,0,0,0,0,0,0,1,0,0,0,1,27335.6962,17872.34043,18442.40964
5,2017-01-01 00:50:00,5.853,76.9,0.081,0,0,0,0,0,0,1,0,0,0,1,26624.81013,17416.41337,18130.12048
6,2017-01-01 01:00:00,5.641,77.7,0.08,0,0,0,0,0,0,1,0,0,0,1,25998.98734,16993.31307,17945.06024
7,2017-01-01 01:10:00,5.496,78.2,0.085,0,0,0,0,0,0,1,0,0,0,1,25446.07595,16661.39818,17459.27711
8,2017-01-01 01:20:00,5.678,78.1,0.081,0,0,0,0,0,0,1,0,0,0,1,24777.72152,16227.35562,17025.54217
9,2017-01-01 01:30:00,5.491,77.3,0.082,0,0,0,0,0,0,1,0,0,0,1,24279.49367,15939.20973,16794.21687


In [None]:
# The three zone power-consumption columns are the prediction targets
target_columns = ["Zone_1_PC", "Zone_2_PC", "Zone_3_PC"]
features = tetouan_df_cleaned.drop(columns=target_columns)
# Selecting several columns requires a list; a bare comma-separated key is a
# tuple and is looked up as one (non-existent) column label.
target = tetouan_df_cleaned[target_columns]
# NOTE(review): train_test_split is not imported in any visible cell —
# presumably sklearn.model_selection.train_test_split; confirm the import exists.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

In [None]:
# Open questions:
# Diffuse flows vs general diffuse flows ?
# amount of diffuse solar radiation received ? compare to the general diffuse radiation
import seaborn as sns
import matplotlib.pyplot as plt

# Absolute pairwise correlations between the columns of the cleaned frame
corr = np.abs(tetouan_df_cleaned.corr())

# Mask the upper triangle (diagonal included) so each pair is drawn only once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 15))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap on the prepared axes, using the mask and the colormap,
# with square cells and the correlation value annotated in each cell
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, annot=True, ax=ax)

plt.show()