In [9]:
import pandas as pd
import warnings

# Suppress warnings:
warnings.filterwarnings("ignore")

# Importing functions for exploration and cleaning:
import initial_exploration as explo
import data_cleaning as cl

# Autoreload to automatically update modules:
%load_ext autoreload
%autoreload 2

# Display all columns:
pd.set_option('display.max_columns', None)

# Load datasets (paths are relative to the notebook's working directory):
cash_requests = pd.read_csv('project_dataset/extract - cash request - data analyst.csv')
fees = pd.read_csv('project_dataset/extract - fees - data analyst - .csv')

# Initial exploration on cash_requests:
explo.check(cash_requests)

# Ensure correct data types for cash_requests.
# NOTE(review): the return value is discarded, so this presumably converts the
# listed columns in place — confirm in data_cleaning.
cl.ensure_correct_data_types(cash_requests, cl.cash_request_date_columns)

# Print data types and inspect first few rows.
# Only the LAST expression of a cell is rendered automatically; a bare
# `cash_requests.head()` in the middle of the cell is evaluated and discarded,
# so the preview is requested explicitly with display() (provided by IPython).
print(cash_requests.dtypes)
display(cash_requests.head())

# Initial exploration on fees:
explo.check(fees)

# Ensure correct data types for fees (same in-place assumption as above):
cl.ensure_correct_data_types(fees, cl.fees_data_date_columns)

# Print data types and inspect first few rows of fees (explicit display() for
# the same reason as above — this is not the cell's final expression):
print(fees.dtypes)
display(fees.head())

# Merging datasets: outer merge of cash_requests with fees, passing 'id' and
# 'cash_request_id' as the key column names.
# NOTE(review): presumably 'id' is the cash_requests key and 'cash_request_id'
# the fees key — confirm against cl.merge_df.
data_df = cl.merge_df(cash_requests, fees, 'outer', 'id', 'cash_request_id')

# Inspect the merged data. display() gives the rich table rendering instead of
# the wrapped plain-text dump produced by print() on a DataFrame.
display(data_df.head())
print(data_df.dtypes)

# Remove NaN values from "id_x".
# NOTE(review): data_df is overwritten here, so the preview above shows the
# frame BEFORE this filtering step.
data_df = cl.remove_nan(data_df, "id_x")

# Check for missing values:
explo.check_null(data_df)

# Selecting data types.
# NOTE(review): assumes index 0 holds the categorical columns and index 1 the
# numerical ones — confirm against cl.selecting_data_types.
frames = cl.selecting_data_types(data_df)
num, cat = frames[1], frames[0]

# Inspect first few rows of numerical and categorical columns:
display(num.head())  # Numerical columns
display(cat.head())  # Categorical columns

# Date columns of cash_requests that are verified below (listed once and
# reused for both reports):
checked_date_columns = [
    'created_at',
    'updated_at',
    'reimbursement_date',
    'cash_request_received_date',
    'money_back_date',
]

# Apply the date-type conversion to cash_requests once more before verifying.
# NOTE(review): same call as earlier in this cell; presumably a no-op the
# second time — confirm in data_cleaning.
cl.ensure_correct_data_types(cash_requests, cl.cash_request_date_columns)

# Report the dtype of every checked column:
print("\nCheck data types of date columns:")
date_view = cash_requests[checked_date_columns]
print(date_view.dtypes)

# Count the missing entries (NaT) in every checked column:
print("\nCheck NaT (missing values) in date columns:")
print(date_view.isna().sum())


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Number of columns: 16 amd rows: 23970

Data types:
id                              int64
amount                        float64
status                         object
created_at                     object
updated_at                     object
user_id                       float64
moderated_at                   object
deleted_account_id            float64
reimbursement_date             object
cash_request_received_date     object
money_back_date                object
transfer_type                  object
send_at                        object
recovery_status                object
reco_creation                  object
reco_last_update               object
dtype: object

Unique values count:
id                            23970
amount                           41
status                            7
created_at                    23970
updated_at                    23970
user_id                       10798
m