# 📦 Exploratory Data Analysis - Stori Challenge

In [19]:
# === 1. Imports & Config ===
import pandas as pd
import numpy as np

# Raise pandas' display limits to 100 columns / 100 rows so displayed frames
# are not truncated at the library defaults.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# === 2. Load Data ===
# The paths are relative, so they resolve against the kernel's current
# working directory.
users = pd.read_csv('../data/raw/user_info.csv')
transactions = pd.read_csv('../data/raw/transaction_info.csv')
deliveries = pd.read_csv('../data/raw/package_delivery_info.csv')

# === 3. Preview Datasets ===
def preview(df, name):
    """Print a labelled shape banner for ``df`` and render its first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.
    name : str
        Label for the banner; it is upper-cased before printing.
    """
    label = name.upper()
    dims = df.shape
    print(f"\n📄 {label} — Shape: {dims}")

    # `display` is the rich-output helper that notebook kernels inject.
    first_rows = df.head()
    display(first_rows)

# Preview each loaded dataset in turn.
for _frame, _label in (
    (users, "Users"),
    (transactions, "Transactions"),
    (deliveries, "Deliveries"),
):
    preview(_frame, _label)


📄 USERS — Shape: (10002, 4)


Unnamed: 0,user_id,name,email,join_date
0,1d10b266-b7b2-4258-88b3-3957de0461ea,Angela Smith,nfischer@yahoo.com,02/06/2023
1,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143,Mr. John Powell,rebeccabarnes@yahoo.com,2022-05-04
2,616735b6-b8db-4517-bb3f-8171ec2a6166,Susan Chase,riverschristopher@davis.net,2022-01-01
3,a6266c94-16ee-4914-a587-a3d43a81f913,Sally Smith,mswanson@mitchell-cooper.com,07/20/2021
4,ccac9e39-e865-4a9f-acc3-1cf90c862ede,Albert Cordova,uhicks@macias-patterson.com,2023-09-26



📄 TRANSACTIONS — Shape: (10002, 5)


Unnamed: 0,transaction_id,user_id,amount,timestamp,transaction_type
0,8b266b46-9ec7-4010-83e4-004b3bef400f,1d10b266-b7b2-4258-88b3-3957de0461ea,865.48,2023-06-11 06:12:30,In-Store
1,98ae73d5-6918-4790-9d63-eedf8ad43a0b,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143,416.5,2023.08.30 09:26,Subscription
2,483b1462-9efe-4e9e-a659-da11640c3201,616735b6-b8db-4517-bb3f-8171ec2a6166,473.42,11/22/2023 01:09 AM,Online
3,9954f0e8-487a-4bb8-9596-41d2dbfec7b4,a6266c94-16ee-4914-a587-a3d43a81f913,307.0,05/20/2023 04:46 AM,In-Store
4,ee42937c-7f14-4782-82b9-dfa0f14773e5,ccac9e39-e865-4a9f-acc3-1cf90c862ede,183.0,2023.11.05 23:21,Subscription



📄 DELIVERIES — Shape: (10002, 5)


Unnamed: 0,package_id,courier,delivery_date,delivery_status,user_id
0,80c09506-8a61-454a-a226-22e639d8795b,DHL,09-12-2022,Delivered,1d10b266-b7b2-4258-88b3-3957de0461ea
1,2e321909-afe9-4ed2-9065-05425bce5394,ups,05/16/2021,In Transit,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143
2,89b8b5d6-b90e-4a29-9d1e-a9a9db4d1e03,UPS,2022-08-30,Pending,616735b6-b8db-4517-bb3f-8171ec2a6166
3,f13aa104-f973-4a2e-bac0-6a04ac86b9ae,DHL,12/08/2021,In Transit,a6266c94-16ee-4914-a587-a3d43a81f913
4,51f15194-b5ad-4062-9bd2-16738f9667f0,UPS,2022.12.05,Lost,ccac9e39-e865-4a9f-acc3-1cf90c862ede


In [20]:
# === 4. Missing Values & Duplicates ===
def nulls_and_duplicates(df, name):
    """Print per-column null counts and the number of duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Label for the report header; it is upper-cased before printing.

    Notes
    -----
    ``DataFrame.duplicated()`` flags every repeat of an earlier row, so the
    reported count excludes the first occurrence of each duplicated row.
    """
    null_counts = df.isnull().sum()
    duplicate_count = df.duplicated().sum()

    print(f"\n🔎 {name.upper()} — Nulls and Duplicates:")
    # Use a newline separator: print's default " " separator placed a stray
    # space in front of the first table row and misaligned it.
    print("Nulls:", null_counts, sep="\n")
    print(f"Duplicate rows: {duplicate_count}")
    print("-" * 50)

# Run the null / duplicate report for each dataset.
for _frame, _label in (
    (users, "Users"),
    (transactions, "Transactions"),
    (deliveries, "Deliveries"),
):
    nulls_and_duplicates(_frame, _label)


🔎 USERS — Nulls and Duplicates:
Nulls:
 user_id      0
name         0
email        0
join_date    0
dtype: int64
Duplicate rows: 2
--------------------------------------------------

🔎 TRANSACTIONS — Nulls and Duplicates:
Nulls:
 transaction_id      0
user_id             0
amount              0
timestamp           0
transaction_type    0
dtype: int64
Duplicate rows: 2
--------------------------------------------------

🔎 DELIVERIES — Nulls and Duplicates:
Nulls:
 package_id         0
courier            0
delivery_date      0
delivery_status    0
user_id            0
dtype: int64
Duplicate rows: 2
--------------------------------------------------


In [11]:
# === 5. Descriptive Statistics ===
def describe_df(df, name):
    """Print a heading and render summary statistics for all columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Label shown in the heading (printed as given).
    """
    heading = f"\n📊 Descriptive Stats — {name}"
    print(heading)

    # include='all' summarises non-numeric columns too, not only numeric ones.
    stats = df.describe(include='all')
    display(stats)

# Show descriptive statistics for each dataset.
for _frame, _label in (
    (users, "Users"),
    (transactions, "Transactions"),
    (deliveries, "Deliveries"),
):
    describe_df(_frame, _label)




Data dictionary for: Users
Column                    Type            Nulls      Example                       
--------------------------------------------------------------------------------
user_id                   object          0          1d10b266-b7b2-4258-88b3-3957de
name                      object          0          Angela Smith                  
email                     object          0          nfischer@yahoo.com            
join_date                 object          0          02/06/2023                    

Data dictionary for: Transactions
Column                    Type            Nulls      Example                       
--------------------------------------------------------------------------------
transaction_id            object          0          8b266b46-9ec7-4010-83e4-004b3b
user_id                   object          0          1d10b266-b7b2-4258-88b3-3957de
amount                    float64         0          865.48                        
timestamp          

In [21]:
# Define a function to check duplicates
def check_duplicates(df, df_name):
    """Report the row count and the number of fully duplicated rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    df_name : str
        Label shown in the report (printed as given).

    Notes
    -----
    A later cell in this notebook defines another ``check_duplicates``; once
    that cell has run, it shadows this definition.
    """
    dup_mask = df.duplicated()
    repeated = df.loc[dup_mask]
    dup_count = len(repeated)

    print(f"🔍 {df_name} — Total rows: {len(df)}")
    print(f"⚠️ {dup_count} duplicate rows found.")

    if dup_count:
        # Render only the first few repeated rows.
        display(repeated.head())
    print('-' * 50)

# Example usage: run the duplicate check on each dataset.
for _frame, _label in (
    (users, "Users"),
    (transactions, "Transactions"),
    (deliveries, "Deliveries"),
):
    check_duplicates(_frame, _label)


🔍 Users — Total rows: 10002
⚠️ 2 duplicate rows found.


Unnamed: 0,user_id,name,email,join_date
10000,1d10b266-b7b2-4258-88b3-3957de0461ea,Angela Smith,nfischer@yahoo.com,02/06/2023
10001,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143,Mr. John Powell,rebeccabarnes@yahoo.com,2022-05-04


--------------------------------------------------
🔍 Transactions — Total rows: 10002
⚠️ 2 duplicate rows found.


Unnamed: 0,transaction_id,user_id,amount,timestamp,transaction_type
10000,8b266b46-9ec7-4010-83e4-004b3bef400f,1d10b266-b7b2-4258-88b3-3957de0461ea,865.48,2023-06-11 06:12:30,In-Store
10001,98ae73d5-6918-4790-9d63-eedf8ad43a0b,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143,416.5,2023.08.30 09:26,Subscription


--------------------------------------------------
🔍 Deliveries — Total rows: 10002
⚠️ 2 duplicate rows found.


Unnamed: 0,package_id,courier,delivery_date,delivery_status,user_id
10000,80c09506-8a61-454a-a226-22e639d8795b,DHL,09-12-2022,Delivered,1d10b266-b7b2-4258-88b3-3957de0461ea
10001,2e321909-afe9-4ed2-9065-05425bce5394,ups,05/16/2021,In Transit,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143


--------------------------------------------------


In [22]:
# === 6. Unique Values Per Column ===
def unique_summary(df, name, max_display=10):
    """Print the number of distinct non-null values in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Label shown in the heading (printed as given).
    max_display : int, default 10
        Columns with at most this many distinct values also have the values
        themselves printed. The default matches the previously hard-coded
        threshold, so existing two-argument calls behave as before.
    """
    print(f"\n🔢 Unique Values in {name}:")
    for col in df.columns:
        # Nulls are dropped first so they do not count as a distinct value.
        distinct = df[col].dropna().unique()
        n_distinct = len(distinct)
        print(f"{col}: {n_distinct} unique values")
        if n_distinct <= max_display:
            print(f" - {distinct}")

# Summarise distinct values for each dataset.
for _frame, _label in (
    (users, "Users"),
    (transactions, "Transactions"),
    (deliveries, "Deliveries"),
):
    unique_summary(_frame, _label)



🔢 Unique Values in Users:
user_id: 10000 unique values
name: 9362 unique values
email: 9934 unique values
join_date: 3955 unique values

🔢 Unique Values in Transactions:
transaction_id: 10000 unique values
user_id: 10000 unique values
amount: 8700 unique values
timestamp: 9983 unique values
transaction_type: 3 unique values
 - ['In-Store' 'Subscription' 'Online']

🔢 Unique Values in Deliveries:
package_id: 10000 unique values
courier: 5 unique values
 - ['DHL' 'ups' 'UPS' 'FEDEX' 'DHL   ']
delivery_date: 3921 unique values
delivery_status: 4 unique values
 - ['Delivered' 'In Transit' 'Pending' 'Lost']
user_id: 10000 unique values


In [23]:
# === 7. Generate Data Dictionary Preview ===
def generate_dictionary(df, df_name):
    """Print a fixed-width data-dictionary table for ``df``.

    One row is printed per column: the column label, its dtype, the number of
    null entries, and the first non-null value (truncated to 30 characters),
    or ``"N/A"`` when the column has no non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    df_name : str
        Label shown in the heading (printed as given).
    """
    print(f"\n📘 Data Dictionary Preview — {df_name}")
    print(f"{'Column':<25} {'Type':<15} {'Nulls':<10} {'Example':<30}")
    print("-" * 80)
    for col in df.columns:
        series = df[col]
        # Drop nulls once and reuse the result for both the count and the example.
        non_null = series.dropna()
        nulls = len(series) - len(non_null)
        if len(non_null):
            # unique() on a one-row slice gives the same element that
            # unique()[0] on the whole column would, without the full scan.
            example = non_null.iloc[:1].unique()[0]
        else:
            example = "N/A"
        # str(col) keeps non-string labels (e.g. tuples) from raising on the
        # alignment format spec.
        print(f"{str(col):<25} {str(series.dtype):<15} {nulls:<10} {str(example)[:30]:<30}")

# Print the data-dictionary preview for each dataset.
for _frame, _label in (
    (users, "Users"),
    (transactions, "Transactions"),
    (deliveries, "Deliveries"),
):
    generate_dictionary(_frame, _label)



📘 Data Dictionary Preview — Users
Column                    Type            Nulls      Example                       
--------------------------------------------------------------------------------
user_id                   object          0          1d10b266-b7b2-4258-88b3-3957de
name                      object          0          Angela Smith                  
email                     object          0          nfischer@yahoo.com            
join_date                 object          0          02/06/2023                    

📘 Data Dictionary Preview — Transactions
Column                    Type            Nulls      Example                       
--------------------------------------------------------------------------------
transaction_id            object          0          8b266b46-9ec7-4010-83e4-004b3b
user_id                   object          0          1d10b266-b7b2-4258-88b3-3957de
amount                    float64         0          865.48                        
times

In [24]:
# === 8. Check Exact Duplicate Rows ===
def check_duplicates(df, name):
    """Report how many rows of ``df`` exactly repeat an earlier row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Label shown in the report (printed as given).

    Notes
    -----
    An earlier cell in this notebook also defines ``check_duplicates``; this
    definition replaces it once this cell has run.
    """
    repeated = df.loc[df.duplicated()]
    has_repeats = not repeated.empty

    print(f"\n📌 {name} — {len(repeated)} exact duplicate rows found.")
    if has_repeats:
        # Render only the first few repeated rows.
        display(repeated.head())
    print("-" * 50)

# Run the exact-duplicate check on each dataset.
for _frame, _label in (
    (users, "Users"),
    (transactions, "Transactions"),
    (deliveries, "Deliveries"),
):
    check_duplicates(_frame, _label)



📌 Users — 2 exact duplicate rows found.


Unnamed: 0,user_id,name,email,join_date
10000,1d10b266-b7b2-4258-88b3-3957de0461ea,Angela Smith,nfischer@yahoo.com,02/06/2023
10001,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143,Mr. John Powell,rebeccabarnes@yahoo.com,2022-05-04


--------------------------------------------------

📌 Transactions — 2 exact duplicate rows found.


Unnamed: 0,transaction_id,user_id,amount,timestamp,transaction_type
10000,8b266b46-9ec7-4010-83e4-004b3bef400f,1d10b266-b7b2-4258-88b3-3957de0461ea,865.48,2023-06-11 06:12:30,In-Store
10001,98ae73d5-6918-4790-9d63-eedf8ad43a0b,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143,416.5,2023.08.30 09:26,Subscription


--------------------------------------------------

📌 Deliveries — 2 exact duplicate rows found.


Unnamed: 0,package_id,courier,delivery_date,delivery_status,user_id
10000,80c09506-8a61-454a-a226-22e639d8795b,DHL,09-12-2022,Delivered,1d10b266-b7b2-4258-88b3-3957de0461ea
10001,2e321909-afe9-4ed2-9065-05425bce5394,ups,05/16/2021,In Transit,bf04c0b9-8a09-49e4-b4a5-0b2915bc5143


--------------------------------------------------
