In [1]:
import pandas as pd
from data_transform import DataTransform as dt

In [2]:
def load_csv_to_dataframe(file_path='../loan_payments.csv'):
    """Load a CSV file of loan payment data and return the resulting dataframe.

    Args:
        file_path (str or path-like): location of the CSV file to read.
            Defaults to '../loan_payments.csv', which is resolved relative
            to the current working directory.

    Returns:
        pd.DataFrame: dataframe containing the data read from the CSV file.

    Raises:
        FileNotFoundError: if no file exists at ``file_path`` (raised by
            ``pd.read_csv``).
    """
    return pd.read_csv(file_path)

# Load the loan payments data once; the cells below inspect and retype this dataframe.
df = load_csv_to_dataframe()

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head()

Unnamed: 0,id,member_id,loan_amount,funded_amount,funded_amount_inv,term,int_rate,instalment,grade,sub_grade,...,recoveries,collection_recovery_fee,last_payment_date,last_payment_amount,next_payment_date,last_credit_pull_date,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type
0,38676116,41461848,8000,8000.0,8000.0,36 months,7.49,248.82,A,A4,...,0.0,0.0,Jan-2022,248.82,Feb-2022,Jan-2022,0.0,5.0,1,INDIVIDUAL
1,38656203,41440010,13200,13200.0,13200.0,36 months,6.99,407.52,A,A3,...,0.0,0.0,Jan-2022,407.52,Feb-2022,Jan-2022,0.0,,1,INDIVIDUAL
2,38656154,41439961,16000,16000.0,16000.0,36 months,7.49,497.63,A,A4,...,0.0,0.0,Oct-2021,12850.16,,Oct-2021,0.0,,1,INDIVIDUAL
3,38656128,41439934,15000,15000.0,15000.0,36 months,14.31,514.93,C,C4,...,0.0,0.0,Jun-2021,13899.67,,Jun-2021,0.0,,1,INDIVIDUAL
4,38656121,41439927,15000,15000.0,15000.0,36 months,6.03,456.54,A,A1,...,0.0,0.0,Jan-2022,456.54,Feb-2022,Jan-2022,0.0,,1,INDIVIDUAL


In [4]:
# (number of rows, number of columns) of the loaded dataframe.
df.shape

(54231, 43)

In [5]:
# List each column's dtype and non-null count to spot missing values and columns stored with an unsuitable type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54231 entries, 0 to 54230
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           54231 non-null  int64  
 1   member_id                    54231 non-null  int64  
 2   loan_amount                  54231 non-null  int64  
 3   funded_amount                51224 non-null  float64
 4   funded_amount_inv            54231 non-null  float64
 5   term                         49459 non-null  object 
 6   int_rate                     49062 non-null  float64
 7   instalment                   54231 non-null  float64
 8   grade                        54231 non-null  object 
 9   sub_grade                    54231 non-null  object 
 10  employment_length            52113 non-null  object 
 11  home_ownership               54231 non-null  object 
 12  annual_inc                   54231 non-null  float64
 13  verification_sta

## Correct the column formats 

#### Dates

In [6]:
# Columns currently stored as objects that are to be converted to dates.
obj_into_date_col_names = [
    'issue_date',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date',
]

# Abbreviated month name followed by a four-digit year, e.g. "Jan-2022";
# from inspecting the data, every column listed above uses this layout.
current_format = "%b-%Y"

# Overwrite each listed column with its converted counterpart, one at a time.
for col_name in obj_into_date_col_names:
    df[col_name] = dt.object_to_date(df[col_name], current_format)

#### Categories 

In [7]:
# Object-typed columns that are to be converted to a categorical type.
# NOTE(review): 'earliest_credit_line' may hold date-like values rather than a
# small set of categories — confirm it belongs here and not with the date columns.
obj_into_category_col_names = [
    'grade',
    'sub_grade',
    'employment_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
    'earliest_credit_line',
    'application_type',
]

# Overwrite each listed object column with its categorical conversion.
for column_name in obj_into_category_col_names:
    df[column_name] = dt.object_to_categorical(df[column_name])

#### Integers 

In [8]:
# The two strings below are taken to be the only values 'term' holds, so each maps
# directly to its number of months. (The column could equally be made categorical.)
# NOTE(review): df.info() reports 49459 non-null entries out of 54231 for 'term' —
# confirm how dt.object_to_int treats entries that are missing from the mapping.
term_mapping = {
    '36 months': 36,
    '60 months': 60,
}

# Swap every key found in 'term' for the integer the mapping assigns to it.
df['term'] = dt.object_to_int(df['term'], term_mapping)

In [9]:
# 'payment_plan' is taken to hold only these two flags, encoded here as 1 / 0.
# (The column could equally be made categorical.)
pay_plan_mapping = {
    'y': 1,
    'n': 0,
}

# Swap every key found in 'payment_plan' for the integer the mapping assigns to it.
df['payment_plan'] = dt.object_to_int(df['payment_plan'], pay_plan_mapping)

In [10]:
# Four float64 columns holding whole-number quantities (presumably counts or
# numbers of months — verify), which are to be cast to int64.
# NOTE(review): the df.head() output shows missing values in
# 'mths_since_last_major_derog'; a plain int64 cast cannot hold NaN, so confirm
# that dt.float64_to_int64 copes with missing entries (e.g. a nullable integer dtype).
float64_columns_into_int64 = [
    'collections_12_mths_ex_med',
    'mths_since_last_major_derog',
    'mths_since_last_delinq',
    'mths_since_last_record',
]

# Overwrite each listed column with its integer conversion.
for col_name in float64_columns_into_int64:
    df[col_name] = dt.float64_to_int64(df[col_name])