# Data Consumer B - (נתוני צריכה צרכן ב)

In [1]:
# Imports
import pandas as pd
import plotly.graph_objects as go

# Mount Google Drive so the notebook can read the raw data stored there
# (Colab-only; force_remount=True requests a fresh mount on every run).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Absolute path, under the Drive mount point, of the raw Excel workbook
# with consumer B's consumption data.
FILE_PATH = "/content/drive/MyDrive/Doral_Energy/data/raw_data/נתוני צריכה צרכן ב.xlsx"

  

Mounted at /content/drive


# Load the data

In [2]:
# Read the raw Excel workbook at FILE_PATH into a DataFrame and display it.
df = pd.read_excel(FILE_PATH)
df

Unnamed: 0,תאריך,Unnamed: 1,"קוט""ש"
0,2021-01-01,00:00:00,309.20
1,2021-01-01,00:30:00,283.26
2,2021-01-01,01:00:00,280.26
3,2021-01-01,01:30:00,263.42
4,2021-01-01,02:00:00,247.60
...,...,...,...
17513,2021-12-31,21:30:00,431.64
17514,2021-12-31,22:00:00,418.26
17515,2021-12-31,22:30:00,396.44
17516,2021-12-31,23:00:00,374.86


# Change column names from Hebrew to English 


In [3]:
# Replace the Hebrew / auto-generated column headers with English names.
# Rebinding `df` (rather than mutating in place) keeps the cell chain-friendly.
df = df.rename(
    columns={
        'תאריך': 'date',
        'Unnamed: 1': 'hour',
        'קוט"ש': 'kWh_amount',
    }
)
df.head()

Unnamed: 0,date,hour,kWh_amount
0,2021-01-01,00:00:00,309.2
1,2021-01-01,00:30:00,283.26
2,2021-01-01,01:00:00,280.26
3,2021-01-01,01:30:00,263.42
4,2021-01-01,02:00:00,247.6


# Check the types

In [4]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17518 entries, 0 to 17517
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        17518 non-null  datetime64[ns]
 1   hour        17518 non-null  object        
 2   kWh_amount  17518 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 410.7+ KB


# Convert date format

In [9]:
# Format the date column as dd/mm/YYYY strings.
# The dtype guard makes the cell safe to re-run: after the first run the column
# already holds strings, and calling `.dt` on it again would raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = df['date'].dt.strftime('%d/%m/%Y')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17518 entries, 0 to 17517
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        17518 non-null  object 
 1   hour        17518 non-null  object 
 2   kWh_amount  17518 non-null  float64
dtypes: float64(1), object(2)
memory usage: 410.7+ KB


# Get a sense of the range of the kWh values


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,kWh_amount
count,17518.0
mean,361.433602
std,101.979695
min,0.0
25%,284.265
50%,360.19
75%,440.32
max,906.32


# Check if there are missing values


In [6]:
# Count the rows that contain at least one missing value.
df.isna().any(axis=1).sum()

0

# Are there any duplicates?


In [7]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [11]:
# Group the readings by (date, hour) slot and flag every slot that carries
# more than one distinct kWh amount.
slot_groups = df.groupby([df['date'], df['hour']])
has_diff_kwh = slot_groups['kWh_amount'].nunique() > 1

# Keep only the rows that belong to a flagged slot; an empty frame means
# no slot has conflicting values.
grouped_df = slot_groups.filter(lambda slot: has_diff_kwh[slot.name])
grouped_df


Unnamed: 0,date,hour,kWh_amount


**There are no duplicates for the same date and hour with different values**

# Outlier Detection


In [12]:
# Box plot of the kWh readings; boxmean='sd' also draws the mean and the
# standard deviation on top of the quartile box.
fig = go.Figure()
fig.add_trace(go.Box(name='kWh amount', y=df.kWh_amount, boxmean='sd'))

# Blue box and points, with outlier points highlighted in translucent red.
fig.update_traces(
    marker=dict(
        color='rgb(8,81,156)',
        outliercolor='rgba(219, 64, 82, 0.6)',
        line=dict(
            outliercolor='rgba(219, 64, 82, 0.6)',
            outlierwidth=2)),
    line_color='rgb(8,81,156)'
)

# Title and axis label so the figure can be read on its own when skimming.
fig.update_layout(
    title_text='Consumer B: distribution of kWh readings',
    yaxis_title='kWh',
)
fig.show()

Get the outlier cases

In [None]:
def get_outliers(df, series, multiplier=1.5):
    """Select the rows of ``df`` whose ``series`` value lies outside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the outlier rows are taken from. It is indexed with boolean
        masks built from ``series``, so the two should share an index.
    series : pandas.Series
        Numeric values used to compute the quartiles and the fences.
    multiplier : float, default 1.5
        Number of IQRs beyond Q1 / Q3 at which the fences are placed. The
        default reproduces the factor that used to be hard-coded.

    Returns
    -------
    tuple
        ``(botrange, toprange, outliers)``: the lower fence, the upper fence,
        and a frame holding the rows below the lower fence followed by the
        rows above the upper fence.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)

    if q1 * q3 == 0:
        # At least one quartile is exactly zero: use a band symmetric around
        # zero with half-width 2 * |q1 + q3| instead of the IQR fences
        # (``multiplier`` is not applied in this branch).
        toprange = abs(2 * (q1 + q3))
        botrange = -toprange
    else:
        iqr = q3 - q1
        toprange = q3 + iqr * multiplier
        botrange = q1 - iqr * multiplier

    outliers_bot = df[series < botrange]
    outliers_top = df[series > toprange]

    # Low-side outliers first, then high-side ones; original index labels are kept.
    outliers = pd.concat([outliers_bot, outliers_top], axis=0)

    return (botrange, toprange, outliers)

In [None]:
# Compute the fences for the kWh readings and collect the rows outside them.
botrange, toprange, outliers = get_outliers(df, df['kWh_amount'])

# Upper fence first, then lower fence, one value per line.
print(toprange, botrange, sep='\n')

outliers

674.4025
50.182499999999976


Unnamed: 0,date,hour,kWh_amount
4676,08/04/2021,11:00:00,25.88
4677,08/04/2021,11:30:00,7.06
4873,12/04/2021,13:30:00,40.4
5298,21/04/2021,10:00:00,46.34
13936,18/10/2021,09:00:00,6.2
13937,18/10/2021,09:30:00,0.0
13938,18/10/2021,10:00:00,0.0
13939,18/10/2021,10:30:00,0.0
1044,22/01/2021,18:00:00,678.96
14545,31/10/2021,01:30:00,906.32


It is difficult to determine the reason behind these extreme values without additional context. However, there are several potential explanations for such outliers:
1. Meter malfunction: The meter used to measure the electricity consumption may be faulty, leading to inaccurate readings.
2. Data entry errors: The outliers could result from errors during data entry, such as typos or incorrect units.
3. Extreme weather conditions: Electricity consumption can vary widely with the weather. Extremely hot or cold weather can lead to increased use of air conditioning or heating, respectively.
4. Human error: The user could have made a mistake, such as leaving an appliance running for an extended period of time or forgetting to turn off lights.
5. Daylight saving time (to be verified): The 906.32 kWh reading on 31/10/2021 at 01:30 falls within the hour that repeats when Israel's clocks move back, so it may combine two half-hour intervals rather than reflect a real spike.


# Save the changes to a CSV file


In [13]:
# Write the frame to consumer_b.csv (relative to the current working directory),
# omitting the index column.
df.to_csv('consumer_b.csv', index=False)