In [7]:
import os
import glob
import pandas as pd
# Get all CSV files from the raw directory
raw_data_path = "../data/raw"
csv_files = glob.glob(os.path.join(raw_data_path, "*.csv"))

# Create a list of DataFrames, each containing a single CSV
dataframes = []
file_names = []

for csv_file in csv_files:
    # Extract filename without extension for reference
    file_name = os.path.basename(csv_file).replace('.csv', '')
    
    # Load CSV into DataFrame
    df = pd.read_csv(csv_file)
    
    # Store DataFrame and filename
    dataframes.append(df)
    file_names.append(file_name)
    
    print(f"Loaded {file_name}: {df.shape[0]} rows, {df.shape[1]} columns")

print(f"\nTotal DataFrames created: {len(dataframes)}")


Loaded accounts: 1667 rows, 6 columns
Loaded account_statuses: 3 rows, 2 columns
Loaded account_types: 5 rows, 2 columns
Loaded addresses: 1222 rows, 4 columns
Loaded branches: 50 rows, 3 columns
Loaded customers: 1111 rows, 6 columns
Loaded customer_types: 3 rows, 2 columns
Loaded loans: 333 rows, 7 columns
Loaded loan_statuses: 3 rows, 2 columns
Loaded transactions: 50000 rows, 8 columns
Loaded transaction_types: 4 rows, 2 columns

Total DataFrames created: 11


In [8]:
# Display basic information about each DataFrame
for i, (df, name) in enumerate(zip(dataframes, file_names)):
    print(f"\n=== {name.upper()} ===")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes}")
    print(f"First few rows:")
    print(df.head(3))
    print("-" * 50)



=== ACCOUNTS ===
Shape: (1667, 6)
Columns: ['AccountID', 'CustomerID', 'AccountTypeID', 'AccountStatusID', 'Balance', 'OpeningDate']
Data types:
AccountID            int64
CustomerID           int64
AccountTypeID        int64
AccountStatusID      int64
Balance            float64
OpeningDate         object
dtype: object
First few rows:
   AccountID  CustomerID  AccountTypeID  AccountStatusID   Balance  \
0     200094       10123              3                1  48348.54   
1     201108       10077              3                1  35001.41   
2     201453       10321              3                2  57081.03   

                  OpeningDate  
0  2018-06-12 00:00:00.000000  
1  2019-10-30 00:00:00.000000  
2  2020-05-24 00:00:00.000000  
--------------------------------------------------

=== ACCOUNT_STATUSES ===
Shape: (3, 2)
Columns: ['AccountStatusID', 'StatusName']
Data types:
AccountStatusID     int64
StatusName         object
dtype: object
First few rows:
   AccountStatusID Status

In [9]:
# How to access individual DataFrames

# Method 1: By index
transactions_df = dataframes[0]  # First DataFrame
print("Transactions DataFrame shape:", transactions_df.shape)

# Method 2: By name (find index first)
def get_dataframe_by_name(name):
    try:
        index = file_names.index(name)
        return dataframes[index]
    except ValueError:
        print(f"DataFrame '{name}' not found")
        return None

# Example usage
accounts_df = get_dataframe_by_name('accounts')
if accounts_df is not None:
    print("Accounts DataFrame shape:", accounts_df.shape)

# Method 3: Create a dictionary for easier access
df_dict = dict(zip(file_names, dataframes))
print(f"\nAvailable DataFrames: {list(df_dict.keys())}")

# Access using dictionary
customers_df = df_dict['customers']
print("Customers DataFrame shape:", customers_df.shape)


Transactions DataFrame shape: (1667, 6)
Accounts DataFrame shape: (1667, 6)

Available DataFrames: ['accounts', 'account_statuses', 'account_types', 'addresses', 'branches', 'customers', 'customer_types', 'loans', 'loan_statuses', 'transactions', 'transaction_types']
Customers DataFrame shape: (1111, 6)


In [15]:
transactions_df = df_dict['transactions']
print(transactions_df.head())
print(transactions_df.info())
print(transactions_df.describe())
print(transactions_df.isnull().sum())
print(transactions_df.duplicated().sum())
#print(transactions_df.corr())
#print(transactions_df.skew())
#print(transactions_df.kurtosis())
#print(transactions_df.mode())
#print(transactions_df.median())
#print(transactions_df.mean())

   TransactionID  AccountOriginID  AccountDestinationID  TransactionTypeID  \
0        3022681           201164                200868                  2   
1        3037846           200138                201402                  2   
2        3045293           201002                201180                  1   
3        3017397           201066                201144                  4   
4        3016750           200289                201413                  3   

    Amount             TransactionDate  BranchID        Description  
0   855.17  2023-04-20 02:00:00.000000        41  Transaction 22681  
1   806.20  2021-08-10 15:00:00.000000        43  Transaction 37846  
2  1229.44  2020-08-16 03:00:00.000000         5  Transaction 45293  
3  4441.60  2021-10-10 06:00:00.000000        14  Transaction 17397  
4  2526.20  2022-07-28 00:00:00.000000        37  Transaction 16750  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 # 

In [7]:
accounts_df = df_dict['accounts']
accounts_df.head()

Unnamed: 0,AccountID,CustomerID,AccountTypeID,AccountStatusID,Balance,OpeningDate
0,200094,10123,3,1,48348.54,2018-06-12 00:00:00.000000
1,201108,10077,3,1,35001.41,2019-10-30 00:00:00.000000
2,201453,10321,3,2,57081.03,2020-05-24 00:00:00.000000
3,200581,10871,5,1,63164.33,2021-01-27 00:00:00.000000
4,200003,10765,1,1,58739.64,2018-09-12 00:00:00.000000


In [8]:
customers_df = df_dict['customers']
customers_df.head()

Unnamed: 0,CustomerID,FirstName,LastName,DateOfBirth,AddressID,CustomerTypeID
0,10832,Nyla,Aguirre,1974-02-07 00:00:00.000000,881,1
1,10983,,Battle,1963-02-01 00:00:00.000000,958,2
2,10837,Angelena,Harrington,1964-03-25 00:00:00.000000,86,3
3,10107,Remona,Glass,1965-09-16 00:00:00.000000,595,1
4,10553,King,Becker,1966-02-20 00:00:00.000000,969,3


In [18]:
branches_df = df_dict['branches']
branches_df.head()

Unnamed: 0,BranchID,BranchName,AddressID
0,1,Branch 1,733
1,2,Branch 2,511
2,3,Branch 3,27
3,4,Branch 4,97
4,5,Branch 5,796


In [19]:
loans_df = df_dict['loans']
print(loans_df.head())
print(loans_df.info())
print(loans_df.describe())
print(loans_df.isnull().sum())
print(loans_df.duplicated().sum())


   LoanID  AccountID  LoanStatusID  PrincipalAmount  InterestRate  \
0  400230     200876             1         76958.56        0.0547   
1  400307     200789             1         29013.67        0.0321   
2  400233     201275             1         48596.76        0.1017   
3  400100     200070             1          9191.43        0.0999   
4  400141     200808             1         76322.83        0.0906   

                    StartDate            EstimatedEndDate  
0  2022-11-20 00:00:00.000000  2026-08-06 00:00:00.000000  
1  2022-02-22 00:00:00.000000  2025-12-08 00:00:00.000000  
2  2021-11-21 00:00:00.000000  2023-07-30 00:00:00.000000  
3  2021-08-14 00:00:00.000000  2023-09-18 00:00:00.000000  
4  2021-06-04 00:00:00.000000  2024-10-23 00:00:00.000000  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   LoanID            333 