# Exploratory Data Analysis (EDA)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
#quick EDA
def data_loading(data_dir='data'):
    """Load the payments and clients tables from CSV files.

    Parameters
    ----------
    data_dir : str or path-like, default 'data'
        Directory that contains ``payments.csv`` and ``clients.csv``. The
        default keeps the original behaviour of reading from ``data/``
        relative to the current working directory.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(payments, clients)`` as parsed by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        Raised by ``pd.read_csv`` when either file is missing.
    """
    # Local import: the notebook's import cell does not bring in pathlib.
    from pathlib import Path

    data_dir = Path(data_dir)
    payments = pd.read_csv(data_dir / 'payments.csv')
    clients = pd.read_csv(data_dir / 'clients.csv')

    return payments, clients

def quick_EDA_check(payments=None, clients=None):
    """Print a first-look summary (shape, head, dtypes) of both tables.

    Parameters
    ----------
    payments : pandas.DataFrame, optional
        Payments table to inspect.
    clients : pandas.DataFrame, optional
        Clients table to inspect.

    If either frame is omitted, the missing one is obtained from
    ``data_loading()``, so calling the function with no arguments still loads
    both tables as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The ``(payments, clients)`` frames that were inspected, unmodified.
    """
    # Reuse the shared loader instead of repeating the read_csv calls here.
    if payments is None or clients is None:
        loaded_payments, loaded_clients = data_loading()
        if payments is None:
            payments = loaded_payments
        if clients is None:
            clients = loaded_clients

    # quick sanity checks
    print("=================== BASIC DATA INFO ==================")
    print(f"Payments Data Shape: {payments.shape}")
    print(f"Clients Data Shape: {clients.shape}")
    print("\n==================== PAYMENTS DATA HEAD =================")
    print(payments.head())
    print("\n==================== CLIENTS DATA HEAD =================")
    print(clients.head())

    # check data types
    print("\n==================== DATA TYPES =================")
    print("Payments Data Types:")
    print(payments.dtypes)
    print("\nClients Data Types:")
    print(clients.dtypes)

    return payments, clients

# Data cleaning function
def data_cleaninig(payments, clients):
    """Report data-quality checks and convert ``transaction_date`` to datetime.

    Prints, in order: missing-value counts for both frames, the converted
    date range, the number of distinct payment codes, the rows with a
    negative ``payment_amt`` (if any), and the number of rows whose
    ``payment_code`` equals ``'DEFAULT'``.

    Parameters
    ----------
    payments : pandas.DataFrame
        Must contain ``transaction_date`` (parsed as seconds since the Unix
        epoch), ``payment_amt`` and ``payment_code``.
    clients : pandas.DataFrame
        Only inspected for missing values.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(payments, clients)`` where ``payments`` is a copy of the input with
        ``transaction_date`` converted to datetime, and ``clients`` is the
        object that was passed in.

    Raises
    ------
    KeyError
        If ``payments`` lacks one of the columns listed above.
    """
    # Work on a copy: the caller's frame keeps its raw epoch column, so this
    # function can be re-run on the same input without touching it.
    payments = payments.copy()

    # check for missing values
    print("\n==================== MISSING VALUES =================")
    print("Payments Missing Values:")
    print(payments.isnull().sum())
    print("\nClients Missing Values:")
    print(clients.isnull().sum())

    # convert date columns from Epoch to datetime
    print("\n==================== CONVERTING DATE COLUMNS =================")
    payments['transaction_date'] = pd.to_datetime(payments['transaction_date'], unit='s')
    print(f"data_range: {payments['transaction_date'].min()} to {payments['transaction_date'].max()}")

    # basic data validation
    print("\n==================== BASIC DATA VALIDATION =================")
    print(f"Unique payment codes: {payments['payment_code'].nunique()}")

    # check for negative payment amounts (mask computed once, used twice)
    is_negative = payments['payment_amt'] < 0
    print(f"Negative amounts:{is_negative.sum()}")
    negative_payments = payments[is_negative]
    if not negative_payments.empty:
        print("Negative Payments Found:")
        print(negative_payments)
    else:
        print("No Negative Payments Found")

    # check for DEFAULT payment codes
    default_count = int((payments['payment_code'] == 'DEFAULT').sum())
    print(f"Number of DEFAULT payment codes: {default_count}")

    return payments, clients

#main function to run the analysis
def main():
    """Load the raw tables and run the cleaning/validation report.

    Returns
    -------
    tuple
        The ``(payments, clients)`` pair returned by ``data_cleaninig``.
    """
    payments, clients = data_loading()
    # Hand the result back instead of dropping it, so callers can reuse it.
    return data_cleaninig(payments, clients)


if __name__ == "__main__":
    main()


Payments Missing Values:
transaction_id      0
contract_id         0
client_id           0
transaction_date    0
payment_amt         0
payment_code        0
dtype: int64

Clients Missing Values:
client_id                  0
entity_type                0
entity_year_established    0
dtype: int64

data_range: 2017-07-02 18:08:02 to 2018-07-24 18:08:10

Unique payment codes: 2
Negative amounts:2
Negative Payments Found:
       transaction_id  contract_id  client_id    transaction_date  \
5271            16270          564        259 2018-04-03 18:08:25   
20089           18574          451        937 2018-05-03 18:08:28   

       payment_amt payment_code  
5271        -55.00      PAYMENT  
20089      -136.66      PAYMENT  
Number of DEFAULT payment codes: 2219


## Investigate the meaning of negative `payment_amt` values

In [None]:
print("=== UNDERSTANDING NEGATIVE AMOUNTS ===")
# Reads the raw CSV again rather than reusing a frame from an earlier cell.
payments = pd.read_csv('data/payments.csv')

# 1. Size of the issue: count, share of all rows, and value range.
negative_payments = payments.loc[payments['payment_amt'].lt(0)]
print(
    f"Negative amount records: {len(negative_payments)} "
    f"({len(negative_payments)/len(payments)*100:.2f}%)"
)
print(
    f"Negative amount range: ${negative_payments['payment_amt'].min():.2f} "
    f"to ${negative_payments['payment_amt'].max():.2f}"
)

# 2. Which payment codes the negative amounts are recorded under.
print("\nNegative amounts by payment code:")
print(negative_payments['payment_code'].value_counts())

# 3. First five negative rows, limited to the identifying columns.
print("\nSample negative amount records:")
print(negative_payments.head()[['client_id', 'contract_id', 'payment_amt', 'payment_code']])

=== UNDERSTANDING NEGATIVE AMOUNTS ===
Negative amount records: 2 (0.01%)
Negative amount range: $-136.66 to $-55.00

Negative amounts by payment code:
payment_code
PAYMENT    2
Name: count, dtype: int64

Sample negative amount records:
       client_id  contract_id  payment_amt payment_code
5271         259          564       -55.00      PAYMENT
20089        937          451      -136.66      PAYMENT


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
from datetime import datetime
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# set up plotting style
plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial', 'Liberation Sans']
# Fixed typo: `plt.rxParams` does not exist and raised AttributeError here.
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

class PaymentDefaultEDA:
    def __init__ (self):
        self.payments = None
        self.clients = None
        self.merged_data = None
        