In [1]:
# Import necessary libraries and modules
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import initial_exploration as explo
import data_cleaning as cl

# Silence every warning for the rest of the session to keep cell output uncluttered.
# NOTE(review): called with only the "ignore" action, this filter is global, so it
# also hides deprecation warnings from pandas/seaborn — consider narrowing it
# to a specific category or module.
warnings.filterwarnings("ignore")

# Lift pandas' cap on displayed columns (None = no limit) so wide frames are
# rendered with every column instead of being elided with "...".
pd.set_option('display.max_columns', None)

# IPython magics: load the autoreload extension and switch it to mode 2, which
# reloads imported modules before each cell is executed, so edits to the local
# helper modules imported above (initial_exploration, data_cleaning) are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [5]:
# Read the two raw CSV extracts into DataFrames: cash requests first, then fees.
# Paths are relative to the notebook's working directory.
cash_requests = pd.read_csv(
    filepath_or_buffer='project_dataset/extract - cash request - data analyst.csv'
)
fees = pd.read_csv(
    filepath_or_buffer='project_dataset/extract - fees - data analyst - .csv'
)

# Preview the top of cash_requests; as the last expression it becomes the cell output
cash_requests.head()


Unnamed: 0,id,amount,status,created_at,updated_at,user_id,moderated_at,deleted_account_id,reimbursement_date,cash_request_received_date,money_back_date,transfer_type,send_at,recovery_status,reco_creation,reco_last_update
0,5,100.0,rejected,2019-12-10 19:05:21.596873+00,2019-12-11 16:47:42.40783+00,804.0,2019-12-11 16:47:42.405646+00,,2020-01-09 19:05:21.596363+00,,,regular,,,,
1,70,100.0,rejected,2019-12-10 19:50:12.34778+00,2019-12-11 14:24:22.900054+00,231.0,2019-12-11 14:24:22.897988+00,,2020-01-09 19:50:12.34778+00,,,regular,,,,
2,7,100.0,rejected,2019-12-10 19:13:35.82546+00,2019-12-11 09:46:59.779773+00,191.0,2019-12-11 09:46:59.777728+00,,2020-01-09 19:13:35.825041+00,,,regular,,,,
3,10,99.0,rejected,2019-12-10 19:16:10.880172+00,2019-12-18 14:26:18.136163+00,761.0,2019-12-18 14:26:18.128407+00,,2020-01-09 19:16:10.879606+00,,,regular,,,,
4,1594,100.0,rejected,2020-05-06 09:59:38.877376+00,2020-05-07 09:21:55.34008+00,7686.0,2020-05-07 09:21:55.320193+00,,2020-06-05 22:00:00+00,,,regular,,,,


In [7]:
# First look at the raw cash_requests frame: explo.check prints the report shown
# in the cell output (column/row counts, dtypes, unique-value counts).
explo.check(cash_requests)

# Parse the columns listed in cl.cash_request_date_columns as dates.
# NOTE(review): the return value is discarded, so this presumably mutates
# cash_requests in place — confirm in data_cleaning.convert_dates.
cl.convert_dates(cash_requests, cl.cash_request_date_columns)

# Enforce the expected dtypes on cash_requests; the date-column list is passed
# again. NOTE(review): return value discarded here too — presumably in place.
cl.ensure_correct_data_types(cash_requests, cl.cash_request_date_columns)

# Same three steps for the fees frame, starting with the exploration report
explo.check(fees)

# Parse the columns listed in cl.fees_data_date_columns as dates
# (same in-place assumption as for cash_requests above).
cl.convert_dates(fees, cl.fees_data_date_columns)

# Enforce the expected dtypes on fees. This call is the cell's last expression,
# so whatever it returns is what the notebook renders as the cell result
# (apparently the converted fees frame — confirm against the helper).
cl.ensure_correct_data_types(fees, cl.fees_data_date_columns)


Number of columns: 16 amd rows: 23970

Data types:
id                              int64
amount                        float64
status                         object
created_at                     object
updated_at                     object
user_id                       float64
moderated_at                   object
deleted_account_id            float64
reimbursement_date             object
cash_request_received_date     object
money_back_date                object
transfer_type                  object
send_at                        object
recovery_status                object
reco_creation                  object
reco_last_update               object
dtype: object

Unique values count:
id                            23970
amount                           41
status                            7
created_at                    23970
updated_at                    23970
user_id                       10798
moderated_at                  16035
deleted_account_id             1141
reimbursement_dat

Unnamed: 0,id,cash_request_id,type,status,category,total_amount,reason,created_at,updated_at,paid_at,from_date,to_date,charge_moment
0,6537,14941.0,instant_payment,rejected,,5.0,Instant Payment Cash Request 14941,2020-09-07,2020-10-13,2020-12-17,NaT,NaT,after
1,6961,11714.0,incident,accepted,rejected_direct_debit,5.0,rejected direct debit,2020-09-09,2020-10-13,2020-12-08,NaT,NaT,after
2,16296,23371.0,instant_payment,accepted,,5.0,Instant Payment Cash Request 23371,2020-10-23,2020-10-23,2020-11-04,NaT,NaT,after
3,20775,26772.0,instant_payment,accepted,,5.0,Instant Payment Cash Request 26772,2020-10-31,2020-10-31,2020-11-19,NaT,NaT,after
4,11242,19350.0,instant_payment,accepted,,5.0,Instant Payment Cash Request 19350,2020-10-06,2020-10-13,2020-11-02,NaT,NaT,after
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21056,12372,20262.0,instant_payment,rejected,,5.0,Instant Payment Cash Request 20262,2020-10-10,2020-10-13,2020-11-17,NaT,NaT,after
21057,20768,26764.0,instant_payment,rejected,,5.0,Instant Payment Cash Request 26764,2020-10-31,2020-10-31,2020-12-16,NaT,NaT,after
21058,18779,25331.0,instant_payment,rejected,,5.0,Instant Payment Cash Request 25331,2020-10-27,2020-10-27,2020-11-18,NaT,NaT,after
21059,16542,23628.0,instant_payment,rejected,,5.0,Instant Payment Cash Request 23628,2020-10-23,2020-10-23,2020-12-18,NaT,NaT,after


In [9]:
# Rename cash_requests' 'id' to 'cash_request_id' so it shares a key name with fees.
# NOTE(review): this rebinds cash_requests, so re-running the cell starts from the
# already-renamed frame — verify rename_col tolerates a missing 'id' column.
cash_requests = cl.rename_col(cash_requests, 'id', 'cash_request_id')

# Outer-merge cash_requests with fees on 'cash_request_id', keeping rows from both
# sides. The null counts printed below exceed the 23970 cash-request rows, so a
# single cash request can appear on several merged rows.
data_df = cl.merge_df(cash_requests, fees, 'outer', 'cash_request_id')

# Disambiguate the column names shared by both inputs; in the output below they
# appear as CR_status/CR_created_at/CR_updated_at and
# fee_status/fee_created_at/fee_updated_at.
# (presumably renamed from the merge's _x/_y suffixes — confirm in rename_col_xy)
data_df = cl.rename_col_xy(data_df)

# Normalize the free-text 'reason' column for consistency
data_df = cl.clean_text_column(data_df, "reason")

# Rename the remaining 'id' column, which holds the fee id, to 'id_fee' for clarity
data_df = cl.rename_col(data_df, 'id', 'id_fee')

# Remove rows with a missing 'amount' (the column passed is "amount", not
# 'cash_request_id'). NOTE(review): after the outer merge these are presumably
# fee rows with no matching cash request — confirm that this is the intent.
data_df = cl.remove_nan(data_df, "amount")

# Drop columns not needed downstream: 'id_fee' and 'category'
data_df = cl.drop_col(data_df, ["id_fee", "category"])

# Print the per-column null counts of the cleaned, merged frame
explo.check_null(data_df)


# Let's check these changes: preview the first 20 rows
data_df.head(20)


4 rows were removed

Count of null values:
cash_request_id                   0
amount                            0
CR_status                         0
CR_created_at                     0
CR_updated_at                     0
user_id                        2572
moderated_at                  10335
deleted_account_id            29521
reimbursement_date                0
cash_request_received_date     7945
money_back_date                8177
transfer_type                     0
send_at                        9724
recovery_status               24894
reco_creation                 24894
reco_last_update              24894
type                          11037
fee_status                    11037
total_amount                  11037
reason                        11037
fee_created_at                11037
fee_updated_at                11037
paid_at                       16656
from_date                     24328
to_date                       24328
charge_moment                 11037
dtype: int64


Unnamed: 0,cash_request_id,amount,CR_status,CR_created_at,CR_updated_at,user_id,moderated_at,deleted_account_id,reimbursement_date,cash_request_received_date,money_back_date,transfer_type,send_at,recovery_status,reco_creation,reco_last_update,type,fee_status,total_amount,reason,fee_created_at,fee_updated_at,paid_at,from_date,to_date,charge_moment
0,3.0,1.0,canceled,2019-11-19,2020-12-14,47.0,2019-11-20,,2019-12-05,NaT,NaT,regular,NaT,,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,
1,4.0,100.0,money_back,2019-12-09,2020-11-04,,2019-12-09,1309.0,2019-12-16,NaT,NaT,regular,NaT,,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,
2,5.0,100.0,rejected,2019-12-10,2019-12-11,804.0,2019-12-11,,2020-01-09,NaT,NaT,regular,NaT,,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,
3,6.0,100.0,direct_debit_rejected,2019-12-10,2020-12-18,812.0,2019-12-11,,2020-02-05,2019-12-11,NaT,regular,NaT,pending,2020-03-20,2020-12-18,,,,,NaT,NaT,NaT,NaT,NaT,
4,7.0,100.0,rejected,2019-12-10,2019-12-11,191.0,2019-12-11,,2020-01-09,NaT,NaT,regular,NaT,,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,
5,9.0,100.0,money_back,2019-12-10,2020-11-04,430.0,2019-12-11,,2020-01-08,2019-12-11,2020-05-12,regular,NaT,completed,2020-03-20,2020-05-12,,,,,NaT,NaT,NaT,NaT,NaT,
6,10.0,99.0,rejected,2019-12-10,2019-12-18,761.0,2019-12-18,,2020-01-09,NaT,NaT,regular,NaT,,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,
7,11.0,100.0,money_back,2019-12-10,2020-11-04,735.0,2019-12-11,,2019-12-30,2019-12-12,NaT,regular,NaT,,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,
8,13.0,100.0,direct_debit_rejected,2019-12-10,2020-11-04,406.0,2019-12-11,,2020-02-05,2019-12-12,NaT,regular,NaT,pending,2020-03-20,2020-03-31,,,,,NaT,NaT,NaT,NaT,NaT,
9,28.0,100.0,money_back,2019-12-10,2020-11-04,140.0,2019-12-11,,2020-01-08,NaT,NaT,regular,NaT,,NaT,NaT,,,,,NaT,NaT,NaT,NaT,NaT,
