In [1]:
# Import Data Manipulation Libraries
import numpy as np
import pandas as pd

# Import Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import the warnings module and silence warnings for this session
# NOTE(review): the 'ignore' action is installed with no category or module
# restriction, so it applies to every warning raised after this line.
import warnings
warnings.filterwarnings('ignore')

# Import Logging and configure the root logger to write to a log file
# - level INFO: records below INFO (i.e. DEBUG) are dropped
# - filemode 'w': the log file is opened for writing, so it is truncated
#   each time this cell runs
# - force True: handlers already attached to the root logger are removed
#   first, which makes re-running this cell take effect
import logging
logging.basicConfig(level = logging.INFO,
                    filename = 'energyconsumption.log',
                    filemode = 'w',
                    format = '%(asctime)s - %(message)s - %(levelname)s',
                    force = True)

# Import the OrderedDict class (insertion-ordered dictionary)
from collections import OrderedDict

In [2]:
# Step1: Data Ingestion

def data_ingestion(file_path = r'C:\energycosumption_model\data\raw\Energy_consumption.csv'):
  """Load the energy consumption dataset from a CSV file.

  Parameters
  ----------
  file_path : str or path-like, optional
      Location of the CSV file. Defaults to the original hard-coded
      location so existing zero-argument calls keep working.

  Returns
  -------
  pandas.DataFrame
      The parsed contents of the CSV file.

  Raises
  ------
  Exception
      Whatever ``pd.read_csv`` raised (e.g. ``FileNotFoundError`` for a
      missing file). The error is logged with its traceback and re-raised
      so the caller sees the real cause instead of an ``UnboundLocalError``
      from returning a variable that was never assigned.
  """
  try:
    df = pd.read_csv(file_path)
  except Exception:
    # Log at ERROR level (with traceback) and propagate: there is no
    # DataFrame to return, so continuing would only hide the failure.
    logging.exception("Check the location of file: %s", file_path)
    raise

  logging.info("Dataset successfully uploaded")
  return df

# Step2: Data Exploration
def data_exploration(df):
  """Build descriptive-statistics reports for the columns of ``df``.

  Columns are split on dtype: every non-``object`` column is treated as
  numerical and every ``object`` column as categorical.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataset to summarise.

  Returns
  -------
  tuple of (pandas.DataFrame, pandas.DataFrame)
      ``numerical_stats_report`` has one row per numerical column with
      count, extremes, mean, median, quartiles, IQR, whiskers
      (Q1 - 1.5*IQR, Q3 + 1.5*IQR), outlier count and percentage,
      skewness, kurtosis and standard deviation. Every cell holds the
      scalar for that row's feature.
      ``categorical_stats_report`` has one row per categorical column with
      count, number of unique values, the mode (first one if several
      values tie; ``None`` if the column has no non-null values) and the
      value counts as a ``{value: count}`` dict.
      A report with no matching columns is an empty DataFrame that still
      carries the expected column headers.
  """
  # Segregate Numerical and Categorical Columns
  # NOTE(review): the split is purely on the 'object' dtype, so any other
  # non-numeric dtype (e.g. datetime, category) lands in the numerical
  # group -- confirm this matches the dataset.
  numerical_col = df.select_dtypes(exclude  = 'object').columns
  categorical_col = df.select_dtypes(include = 'object').columns

  numerical_fields = ["Feature", "Count", "Maximum", "Minimum", "Mean",
                      "Median", "Q1", "Q3", "IQR", "Lower_Whisker",
                      "Upper_Whisker", "Outlier_Count", "Outlier_Percentage",
                      "Skewness", "Kurtosis", "Standard Deviation"]
  categorical_fields = ["Feature", "Count", "Unique_Count", "Mode",
                        "Value_Counts"]

  total_rows = len(df)

  # Numerical Descriptive Stats
  numerical_stats = []

  for i in numerical_col:
    column = df[i]

    # Quartiles and whiskers are computed per column so each report row
    # stores scalars for its own feature rather than a Series covering
    # every numerical column.
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - 1.5*iqr
    upper_whisker = q3 + 1.5*iqr

    outlier_count = int(((column < lower_whisker) | (column > upper_whisker)).sum())
    # Guard against a zero-row frame to avoid dividing by zero.
    outlier_percentage = (outlier_count/total_rows)*100 if total_rows else 0.0

    numerical_stats.append(OrderedDict({
        "Feature":i,
        "Count":column.count(),
        "Maximum":column.max(),
        "Minimum":column.min(),
        "Mean":column.mean(),
        "Median":column.median(),
        "Q1":q1,
        "Q3":q3,
        "IQR":iqr,
        "Lower_Whisker":lower_whisker,
        "Upper_Whisker":upper_whisker,
        "Outlier_Count":outlier_count,
        "Outlier_Percentage":outlier_percentage,
        "Skewness":column.skew(),
        "Kurtosis":column.kurtosis(),
        "Standard Deviation":column.std()
    }))

  # Build the report once, after the loop; passing the column list keeps
  # the headers even when there are no numerical columns.
  numerical_stats_report = pd.DataFrame(numerical_stats, columns = numerical_fields)

  # Categorical Descriptive Stats
  categorical_stats = []

  for i in categorical_col:
    column = df[i]
    modes = column.mode()

    categorical_stats.append(OrderedDict({
        "Feature":i,
        "Count":column.count(),
        "Unique_Count":column.nunique(),
        # mode() returns a Series (possibly empty); keep a single scalar.
        "Mode":modes.iloc[0] if not modes.empty else None,
        "Value_Counts":column.value_counts().to_dict()
    }))

  categorical_stats_report = pd.DataFrame(categorical_stats, columns = categorical_fields)

  return numerical_stats_report, categorical_stats_report

# Step3: Dataset Information
def dataset_info(df):
  """Print a concise summary of ``df`` (index, columns, dtypes, memory usage).

  Parameters
  ----------
  df : pandas.DataFrame
      Dataset to describe.

  Returns
  -------
  None
      The summary is written to standard output by ``DataFrame.info``.
  """
  # info() prints the summary itself and returns None, so wrapping it in
  # print() would emit an extra "None" line after the summary.
  df.info()