In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import logging

In [2]:
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')

In [3]:
def load_data(filepath: str):
    try:
        df = pd.read_csv('data/E-commerce.csv') # load csv into the env as a pandas dataFrame
        #prints out the number of rows and columns (5630 rows, 20 columns)
        logging.info(f'Data successfullt loaded with {df.shape[0]} rows and {df.shape[1]} columns')
    except FileNotFoundError:
        logging.info('File Not Found! Please check filepath and try again')
        raise

In [4]:
# ----dataset overview--------
def dataset_overview(df: pd.DataFrame):
    logging.info(f'Number of observations : {df.shape[0]}')
    logging.info(f'Number of features : {df.shape[1]}')
    return df.describe(include='all').T

In [5]:
def numeric_columns(df: pd.DataFrame):
    # -------numeric columns-----------
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    cols = [col for col in numeric_cols[:1]]
    logging.info(f'Number of Numeric columns : {len(cols)} | Examples : {cols[:3]}')

    for i,cols in enumerate(numeric_cols,1):
        logging.info(f'{i}. {cols} - Min: {df[cols].min()} - Max: {df[cols].max()}')
    return numeric_cols

In [6]:
def categorical_columns(df: pd.DataFrame):
    # ------------categorical columns---------
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns

    cols = [col for col in categorical_cols]
    logging.info(f'Number of Categorical columns : {len(cols)} | Examples : {cols[:3]}')

    for i, cols in enumerate(categorical_cols,1):
        uniques = df[cols].unique()
        logging.info(f'{i}. {cols} - Unique: {df[cols].nuqiue()} | Examples : {uniques[:3]}')
    return categorical_cols

In [7]:
# Numerical Columns Description
# Churn : Target variable (0 = customer stayed, 1 = customer churned/left)
# Tenure : How long the customer has stayed with the company
# CityTier : Classification of the city where the customer lives
# WarehouseToHome : Distance between the warehouse and the customer's home
# HoursSpendOnApp : Average hours the customer spends on the app per day/week
# NumberOfDeviceRegistered : Number of devices registered to a cutomer's account
# SatisfactionScore: A customer's satisfaction rating 
# NumberOfAddress : How many addresses the customer has saved
# Complain : Whether the customer has filed a complaint or not
# OrderAmountHikeFromlastYear : Percentage increase in order compared to last year
# CouponUsed : Number of coupons used by the customer
# OrderCount : Number of orders placed by the customer
# DaysSinceLastOrder : Number of days since the customer's last order
# CashbackAmount : Total cashback the customer has received

In [8]:
# Categorical columns description
# PreferredLoginDevice - The device most often used to log into the app/site
# PreferredPaymentMode - Payment method most often used
# Gender - Sex of the customer (male/female)
# PreferredOrderCat - Most frequent product category ordered
# MaritalStatus - Marital status of the customer

In [9]:
#missing data
def missing_data(df: pd.DataFrame):
    missing = df.isnull().sum()
    missing = missing[missing>0].sort_values(ascending=False)
    missing_pct = missing / len(df)
    logging.info('Missing Data \n')
    missing_df = pd.DataFrame({
        'missing value' : missing,
        'missing pct' : missing_pct
    })
    return missing_df

In [10]:
#------duplicated rows--------
def duplicate(df: pd.DataFrame):
    duplicates  =  df[df.duplicated()]
    logging.info(f'Number of duplicates : {len(duplicates)}')
    if len(duplicates) == 0:
        logging.info('No duplicates found')
    else:
        return duplicates

In [11]:
# ---------outlier detection using IQR--------
def check_outlier(df: pd.DataFrame, col: str):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)

    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return outliers, lower_bound, upper_bound

In [12]:
def outlier_summary(df: pd.DataFrame, numeric_cols: list[str]):
    result = []
    logging.info('Outlier Summary\n')
    for i,col in enumerate(numeric_cols,1):
        outlier, lower, upper = check_outlier(df, numeric_cols)
        result = {
            'index': i,
            'columns' : col,
            'outlier' : len(outlier),
            'Lower Range' : lower,
            'Upper Range' : upper
        }
    summary_df = pd.DataFrame(result)
    return summary_df

In [13]:
import os
def save_summary(df: pd.DataFrame, name: str):
    os.makedirs('eda_reports',exist_ok=True)
    path = f'eda_reports{name}.csv'
    df.to_csv(path, index=False)
    logging.info(f'Saved report: {path}')

In [14]:
def run_eda(filepath):
    df = load_data(filepath)
    overview = dataset_overview(df)
    num_cols = numeric_columns(df)
    cat_cols = categorical_columns(df)
    missing = missing_data(df)
    duplicates = duplicate(df)
    outliers = outlier_summary(df, num_cols)

    save_summary(missing, 'missing_data')
    save_summary(outliers, 'outlier_summary')
    
    return {
        'data' : df,
        'overview' : overview,
        'num_cols' :num_cols,
        'cat_cols' : cat_cols,
        'missing' : missing,
        'duplicates' : duplicates,
        'outliers' : outliers
    }

In [16]:
if __name__ == '__main__':
    run_eda('E-commerce.csv')

INFO : File Not Found! Please check filepath and try again


FileNotFoundError: [Errno 2] No such file or directory: 'data/E-commerce.csv'