In [None]:
# Import necessary libraries and modules
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import initial_exploration as explo
import data_cleaning as cl

# Keep the notebook output readable: install a blanket "ignore" filter so
# that warnings of every category are silenced for the rest of the session.
warnings.simplefilter("ignore")

# Remove the cap on displayed columns so wide DataFrames are shown in full
# rather than being truncated.
pd.set_option("display.max_columns", None)

# Load IPython's autoreload extension and switch it to mode 2, which reloads
# all imported modules before each cell is executed. This lets edits to the
# local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
# Read the two raw CSV extracts (cash requests and fees) into DataFrames.
# Paths are relative to the notebook's working directory.
cash_requests = pd.read_csv(
    'project_dataset/extract - cash request - data analyst.csv'
)
fees = pd.read_csv(
    'project_dataset/extract - fees - data analyst - .csv'
)

# Preview the top rows of cash_requests as this cell's displayed output
cash_requests.head(n=5)


In [None]:
# Initial exploration of the cash_requests dataset. This runs before any type
# conversion, so the report reflects the frame exactly as loaded from CSV.
explo.check(cash_requests)

# Convert date columns to datetime format in cash_requests; the columns to
# convert are taken from cl.cash_request_date_columns.
# NOTE(review): the return value is discarded, so this presumably mutates
# cash_requests in place — confirm in data_cleaning.convert_dates.
cl.convert_dates(cash_requests, cl.cash_request_date_columns)

# Ensure correct data types in the cash_requests dataset (the same date-column
# list is passed again).
# NOTE(review): the result is discarded here too — same in-place assumption.
cl.ensure_correct_data_types(cash_requests, cl.cash_request_date_columns)

# Repeat the initial exploration for the fees dataset, again before any type
# conversion is applied to it.
explo.check(fees)

# Convert date columns in fees dataset; the columns to convert are taken from
# cl.fees_data_date_columns.
# NOTE(review): the return value is discarded, so this presumably mutates
# fees in place — confirm in data_cleaning.convert_dates.
cl.convert_dates(fees, cl.fees_data_date_columns)

# Ensure correct data types in the fees dataset (the same date-column list is
# passed again).
# NOTE(review): the result is discarded here too — same in-place assumption.
cl.ensure_correct_data_types(fees, cl.fees_data_date_columns)


In [None]:
# Rename 'id' column to 'cash_request_id' in the cash_requests dataset so that
# it carries the key name used for the merge below.
# NOTE(review): the result is not reassigned, so this relies on cl.rename_col
# modifying cash_requests in place. If it returns a new frame instead, the
# rename is lost and the merge key will be missing — confirm in data_cleaning.
cl.rename_col(cash_requests, 'id', 'cash_request_id')

# Merge cash_requests and fees data on 'cash_request_id' column.
# The 'outer' argument presumably selects an outer join (unmatched rows from
# both frames are kept) — confirm against cl.merge_df.
data_df = cl.merge_df(cash_requests, fees, 'outer', 'cash_request_id')

# Rename columns to standardize naming convention.
# NOTE(review): presumably this resolves the _x/_y suffixes that a merge adds
# to overlapping column names — verify in cl.rename_col_xy.
data_df = cl.rename_col_xy(data_df)

# Clean text column 'reason' for consistency.
# NOTE(review): the return value is discarded, so this presumably edits
# data_df in place — confirm in data_cleaning.clean_text_column.
cl.clean_text_column(data_df, "reason")

# Recheck the merged data with the same exploration report that was run on
# the individual datasets.
explo.check(data_df)

# Remove rows with missing values in the 'cash_request_id' column; the
# filtered frame replaces data_df.
data_df = cl.remove_nan(data_df, "cash_request_id")

# Inspect the nulls that remain after the filter.
# NOTE(review): only 'cash_request_id' was passed to remove_nan above, so
# other columns may still contain nulls; this is a report, not a guarantee.
explo.check_null(data_df)

# Drop unnecessary columns: 'id_fee' and 'category'.
# NOTE(review): 'id_fee' is not created anywhere in this notebook's visible
# code; presumably it is produced by cl.rename_col_xy — confirm.
data_df = cl.drop_col(data_df, ["id_fee", "category"])

# Recheck the null report after dropping columns
explo.check_null(data_df)
