# Consumer A – Consumption Data (נתוני צריכה צרכן א)

In [17]:
# Imports
import pandas as pd
import plotly.graph_objects as go
# Permission to access my Google Drive
from google.colab import drive
# Mount Google Drive inside the Colab filesystem, forcing a remount if it is already mounted
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

# Location of the raw consumption workbook on the mounted Drive
FILE_PATH = "/content/drive/MyDrive/Doral_Energy/data/raw_data/נתוני צריכה צרכן א.xlsx"

  

Mounted at /content/drive


# Load the data

In [18]:
# Read the Excel workbook at FILE_PATH into a DataFrame
df = pd.read_excel(FILE_PATH)
# A bare expression on the last line makes the notebook display the frame
df

Unnamed: 0,מתאריך,שעה,כמות kWh
0,2021-12-01,00:00:00,1287
1,2021-12-01,00:30:00,1217
2,2021-12-01,01:00:00,1216
3,2021-12-01,01:30:00,1212
4,2021-12-01,02:00:00,1206
...,...,...,...
17515,2022-11-30,21:30:00,1578
17516,2022-11-30,22:00:00,1590
17517,2022-11-30,22:30:00,1586
17518,2022-11-30,23:00:00,1583


# Change column names from Hebrew to English 


In [19]:
# Map the Hebrew column headers of the workbook to English names
HEBREW_TO_ENGLISH = {
    'מתאריך': 'date',
    'שעה': 'hour',
    'כמות kWh': 'kWh_amount',
}
df = df.rename(columns=HEBREW_TO_ENGLISH)
df.head()

Unnamed: 0,date,hour,kWh_amount
0,2021-12-01,00:00:00,1287
1,2021-12-01,00:30:00,1217
2,2021-12-01,01:00:00,1216
3,2021-12-01,01:30:00,1212
4,2021-12-01,02:00:00,1206


# Check the types

In [20]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        17520 non-null  datetime64[ns]
 1   hour        17520 non-null  object        
 2   kWh_amount  17520 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 410.8+ KB


# Convert the date to dd/mm/YYYY text and convert kWh_amount from int to float

In [25]:
# Re-express both columns in one step:
#   date       -> 'dd/mm/YYYY' strings (the column becomes object dtype, no longer datetime64)
#   kWh_amount -> float64
df = df.assign(
    date=df['date'].dt.strftime('%d/%m/%Y'),
    kWh_amount=df['kWh_amount'].astype(float),
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        17520 non-null  object 
 1   hour        17520 non-null  object 
 2   kWh_amount  17520 non-null  float64
dtypes: float64(1), object(2)
memory usage: 410.8+ KB


# Get a sense of the range of the kWh values


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,kWh_amount
count,17520.0
mean,580.85976
std,592.522665
min,0.0
25%,47.0
50%,103.0
75%,1208.0
max,1627.0


# Check if there are missing values


In [27]:
# Flag every row that has at least one missing value, then count the flagged rows
rows_with_missing = df.isnull().any(axis='columns')
rows_with_missing.sum()

0

# Are there any duplicates?


In [28]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
is_repeat = df.duplicated(keep='first')
is_repeat.sum()

1

In [None]:
# Show the rows that exactly repeat an earlier row
df.loc[df.duplicated(keep='first')]

Unnamed: 0,date,hour,kWh_amount
15987,30/10/2022,01:30:00,1329.0


To double-check, I looked at the Excel file and confirmed that the two rows are identical.
Therefore, I will drop one of them.

image.png
image.png

In [None]:
# Remove exact repeats (keeping the first occurrence) and renumber the rows from 0
df = df.drop_duplicates().reset_index(drop=True)


In [None]:
# Display the frame after the exact-duplicate row was removed
df

Unnamed: 0,date,hour,kWh_amount
0,01/12/2021,00:00:00,1287.0
1,01/12/2021,00:30:00,1217.0
2,01/12/2021,01:00:00,1216.0
3,01/12/2021,01:30:00,1212.0
4,01/12/2021,02:00:00,1206.0
...,...,...,...
17514,30/11/2022,21:30:00,1578.0
17515,30/11/2022,22:00:00,1590.0
17516,30/11/2022,22:30:00,1586.0
17517,30/11/2022,23:00:00,1583.0


In [None]:
# Find (date, hour) slots that carry more than one distinct kWh reading.
# transform('nunique') broadcasts each slot's distinct-value count back onto its own rows,
# so one boolean mask selects every conflicting row in a single grouping pass
# (no second groupby and no per-group Python lambda). Row order and the original
# index labels are preserved.
distinct_readings = df.groupby(['date', 'hour'])['kWh_amount'].transform('nunique')
grouped_df = df[distinct_readings > 1]
grouped_df


Unnamed: 0,date,hour,kWh_amount
15984,30/10/2022,01:00:00,1509.0
15986,30/10/2022,01:00:00,1329.0


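It seems that a mistake in the data collection process caused two different values to be recorded for the same date and hour. Note, however, that if the timestamps are Israeli local time, 30/10/2022 is the night daylight saving time ended, when the 01:00–02:00 hour occurs twice on the clock. The repeated 01:00 and 01:30 slots may therefore be genuine readings from the repeated hour rather than errors, which is worth verifying with the data provider.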

In [None]:
# Collapse every (date, hour) slot to a single row holding the mean kWh of its readings.
# Slots with a single reading are unchanged (the mean of one value is that value).
#
# Why not edit rows by label: hard-coded labels (15984 / 15986) are only valid on the
# first run. After reset_index, the same labels point at different rows, so re-running
# the cell would silently drop a valid reading. Grouping is label-free and idempotent,
# and it averages each conflicting slot separately if more than one ever appears.
#
# sort=False keeps slots in order of first appearance. This matters because `date` is
# text here, so a sorted groupby would order rows lexically, not chronologically.
# as_index=False returns date/hour as ordinary columns with a fresh 0..n-1 index.
#
# NOTE(review): the slot merged here falls on 30/10/2022. If that is a daylight-saving
# repeated hour, averaging discards a genuine reading. Confirm before relying on it.
df = df.groupby(['date', 'hour'], sort=False, as_index=False)['kWh_amount'].mean()
df

Unnamed: 0,date,hour,kWh_amount
0,01/12/2021,00:00:00,1287.0
1,01/12/2021,00:30:00,1217.0
2,01/12/2021,01:00:00,1216.0
3,01/12/2021,01:30:00,1212.0
4,01/12/2021,02:00:00,1206.0
...,...,...,...
17513,30/11/2022,21:30:00,1578.0
17514,30/11/2022,22:00:00,1590.0
17515,30/11/2022,22:30:00,1586.0
17516,30/11/2022,23:00:00,1583.0


# Outlier Detection


In [29]:
# Box plot of the kWh readings; boxmean='sd' also draws the mean and standard deviation.
# The trace is styled at construction time instead of being patched afterwards.
BOX_COLOR = 'rgb(8,81,156)'
OUTLIER_COLOR = 'rgba(219, 64, 82, 0.6)'

kwh_box = go.Box(
    name='kWh amount',
    y=df.kWh_amount,
    boxmean='sd',
    marker=dict(
        color=BOX_COLOR,
        outliercolor=OUTLIER_COLOR,
        line=dict(
            outliercolor=OUTLIER_COLOR,
            outlierwidth=2,
        ),
    ),
    line_color=BOX_COLOR,
)

fig = go.Figure(data=[kwh_box])
fig.show()

**In this boxplot there are no unusual observations**

# Save the cleaned data to a CSV file


In [30]:
# Write the frame to consumer_a.csv (relative path, so it lands in the current working
# directory); index=False leaves the row index out of the file
df.to_csv('consumer_a.csv', index=False)