In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-credit-project/prev_approved_cash_loans.csv
/kaggle/input/home-credit-project/non_redundant_instal_payments.csv
/kaggle/input/home-credit-project/curr_cash_loans.csv
/kaggle/input/pos-cash-balance/POS_CASH_balance.csv
/kaggle/input/instal-data/non_redundant_instal.csv


In [4]:
from pathlib import Path
cash_loans_dir = Path('/kaggle/input/home-credit-project')
instal_dir = Path('/kaggle/input/instal-data')
output_dir = Path('/kaggle/working')
pos_dir = Path('/kaggle/input/pos-cash-balance')

In [5]:
curr_cash_loans = pd.read_csv(cash_loans_dir / 'curr_cash_loans.csv')
prev_cash_loans = pd.read_csv(cash_loans_dir / 'prev_approved_cash_loans.csv')
instal_history = pd.read_csv(instal_dir / 'non_redundant_instal.csv')
pos_cash_bal = pd.read_csv(pos_dir / 'POS_CASH_balance.csv')
instal_history.drop(columns=['total_payment_per_instal'], axis=1, inplace=True)

### Data Preparation for Acquiring Instalment Features
1. Filter pos_cash_bal for loans which were marked "completed".
2. Join the filtered pos_cash_bal data from Step 1 with prev_cash_loans.
3. Join the resulting data from Step 2 with curr_cash_loans.
4. Finally join with Instalment data.
5. Filter for instalment information which was at most 5 years old.
6. Also rename selected columns for readability.
7. "_BU" stands for back-up.

In [6]:
# Get Completed loans from previous applications
selected_cols = ['SK_ID_PREV', 'NAME_CONTRACT_STATUS', 'CNT_INSTALMENT']
pos_cash_bal_BU = pos_cash_bal.copy()[selected_cols]

new_col_names = {'SK_ID_PREV': 'PREV_LOAN_ID', 
                 'NAME_CONTRACT_STATUS': 'PREV_LOAN_STATUS', 
                 'CNT_INSTALMENT': 'TOTAL_NUM_OF_PAYMENTS'}

pos_cash_bal_BU.rename(
    new_col_names, inplace=True, axis=1)

IS_COMPLETED = pos_cash_bal_BU.PREV_LOAN_STATUS == 'Completed'
completed_prev_loans = pos_cash_bal_BU[IS_COMPLETED]

In [7]:
completed_prev_loans.shape #~744K of completed loans from previous applications

(744883, 3)

In [8]:
# Get Completed cash loans for previous application
selected_cols = ['SK_ID_PREV', 'SK_ID_CURR', 
                 'NAME_YIELD_GROUP', 'AMT_ANNUITY', 
                 'AMT_CREDIT']
prev_cash_loans_BU = prev_cash_loans.copy()[selected_cols]

new_col_names = {'SK_ID_PREV': 'PREV_LOAN_ID', 
                 'SK_ID_CURR': 'CURR_CUSTOMER_ID', 
                 'AMT_ANNUITY': 'PREV_MONTHLY_ANNUITY',
                'AMT_CREDIT': 'PREV_PRINCIPAL_AMT'}

prev_cash_loans_BU.rename(
    new_col_names, inplace=True, axis=1)

completed_cash_loans = completed_prev_loans.merge(
    prev_cash_loans_BU, how='inner', on=['PREV_LOAN_ID']
)

In [9]:
completed_cash_loans.shape #~210K of completed cash loans from previous applications

(209997, 7)

In [10]:
# Get current borrowers with completed cash loans from previous applications
curr_cash_loans_BU = curr_cash_loans.copy()

new_col_names = {'SK_ID_CURR': 'CURR_CUSTOMER_ID',
                 'AMT_ANNUITY': 'CURR_MONTHLY_ANNUITY', 
                 'AMT_CREDIT': 'CURR_PRINCIPAL_AMT'}

curr_cash_loans_BU.rename(
    new_col_names, axis=1, inplace=True)

curr_apps_w_completed_history = curr_cash_loans_BU.merge(
    completed_cash_loans, how='inner', on=['CURR_CUSTOMER_ID'])

In [11]:
#~167K of curr applicants with completed cash loans from previous applications
curr_apps_w_completed_history.shape

(167651, 28)

In [12]:
#Get instalment history for completed cash loans from previous applications for current applicants
selected_cols = ['SK_ID_PREV','SK_ID_CURR','NUM_INSTALMENT_NUMBER', 
                'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
                'AMT_INSTALMENT', 'AMT_PAYMENT']

instal_history_BU = instal_history.copy()[selected_cols]

new_col_names = {'SK_ID_CURR': 'CURR_CUSTOMER_ID',
                 'SK_ID_PREV': 'PREV_LOAN_ID',
                 'DAYS_INSTALMENT': 'PREV_INSTALMENT_DUE_DAY',
                 'DAYS_ENTRY_PAYMENT': 'PREV_INSTALMENT_PAID_DAY',
                 'AMT_INSTALMENT': 'PREV_MONTHLY_INSTALMENT_AMT',
                 'AMT_PAYMENT': 'PREV_MONTHLY_REPAID_AMT'}

instal_history_BU.rename(
    new_col_names, axis=1, inplace=True)

instal_history_for_completed_loans = curr_apps_w_completed_history.merge(
    instal_history_BU, how='inner', on=['PREV_LOAN_ID', 'CURR_CUSTOMER_ID']
)

In [13]:
#~1.7 million instalment histories for completed cash loans from previous applications
#for current applicants
instal_history_for_completed_loans.shape

(1771965, 33)

In [14]:
#Filter for instalment histories which were at most 5 years old
IS_AT_MOST_5_YEARS = np.abs(instal_history_for_completed_loans.PREV_INSTALMENT_DUE_DAY) <= 1825
five_yr_instal = instal_history_for_completed_loans[IS_AT_MOST_5_YEARS]

round(
    len(five_yr_instal.PREV_LOAN_ID.unique()) 
    / len(instal_history_for_completed_loans.PREV_LOAN_ID.unique()), 2) # preserves ~93% of previous loan history.

0.93

In [15]:
five_yr_instal.shape

(1505850, 33)

In [16]:
five_yr_instal.columns

Index(['CURR_CUSTOMER_ID', 'TARGET', 'CODE_GENDER', 'DAYS_ID_PUBLISH',
       'CURR_PRINCIPAL_AMT', 'CURR_MONTHLY_ANNUITY', 'NAME_CONTRACT_TYPE',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'DAYS_EMPLOYED',
       'FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE', 'NAME_EDUCATION_TYPE',
       'LIVE_CITY_NOT_WORK_CITY', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'FLAG_EMAIL', 'FLAG_EMP_PHONE', 'FLAG_MOBIL',
       'FLAG_CONT_MOBILE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'DEF_60_CNT_SOCIAL_CIRCLE', 'PREV_LOAN_ID', 'PREV_LOAN_STATUS',
       'TOTAL_NUM_OF_PAYMENTS', 'NAME_YIELD_GROUP', 'PREV_MONTHLY_ANNUITY',
       'PREV_PRINCIPAL_AMT', 'NUM_INSTALMENT_NUMBER',
       'PREV_INSTALMENT_DUE_DAY', 'PREV_INSTALMENT_PAID_DAY',
       'PREV_MONTHLY_INSTALMENT_AMT', 'PREV_MONTHLY_REPAID_AMT'],
      dtype='object')

### Sanity check:
In **"Swap AMT_INSTALMENT and AMT_PAYMENT for Installments Payment"** notebook,<br>
I assumed **PREV_MONTHLY_REPAID_AMT (previously known as AMT_PAYMENT)** could not exceed **PREV_MONTHLY_INSTALMENT_AMT (previously known as AMT_INSTALMENT)** and I swapped values for these 2 columns. <br>

Moreover, I listed out 2 possible cases when **Previous Monthly Repaid Amount > Previous Monthly Instalment**:
1. A data entry error.
2. A possible prepayment. <br>

At this juncture, I check for possible prepayments which fulfill the following:
1. **PREV_MONTHLY_INSTALMENT_AMT > PREV_MONTHLY_ANNUITY**.
2. **MAX_INSTAL_NUM < TOTAL_NUM_OF_PAYMENTS**.
3. **NUM_INSTALMENT_NUMBER <= MAX_INSTAL_NUM**.

If the above inequality is true, I swap the values for **PREV_MONTHLY_REPAID_AMT** and **PREV_MONTHLY_INTSTALMENT_AMT** columns.


In [17]:
max_instal_num = five_yr_instal.groupby(['CURR_CUSTOMER_ID', 'PREV_LOAN_ID'])['NUM_INSTALMENT_NUMBER'].max().reset_index()

max_instal_num.rename({'NUM_INSTALMENT_NUMBER': 'MAX_INSTAL_NUM'}, axis=1, inplace=True)

five_yr_instal = five_yr_instal.merge(
    max_instal_num, how='inner', on=['CURR_CUSTOMER_ID', 'PREV_LOAN_ID'])

In [18]:
"""
    Monthly instalment technically cannot exceed monthly annuity,but monthly repayment can (see cond 1)
    when the following conditions are true:
    1. Actual number of instalment payments < theoretical number of instalment payments (see cond 2).
    2. The ith instalment technically can incur a prepayment: monthly repayment > monthly instalment (see cond 3).

"""

cond_1 = (five_yr_instal.PREV_MONTHLY_INSTALMENT_AMT > five_yr_instal.PREV_MONTHLY_ANNUITY) 

cond_2 = (five_yr_instal.MAX_INSTAL_NUM < five_yr_instal.TOTAL_NUM_OF_PAYMENTS) #prepayment

cond_3 = (five_yr_instal.NUM_INSTALMENT_NUMBER <= five_yr_instal.MAX_INSTAL_NUM)

to_swap = (cond_1) & (cond_2) & (cond_3)

In [20]:
five_yr_instal.loc[to_swap, ['PREV_MONTHLY_INSTALMENT_AMT', 'PREV_MONTHLY_REPAID_AMT']] = (
    five_yr_instal.loc[to_swap, ['PREV_MONTHLY_REPAID_AMT', 'PREV_MONTHLY_INSTALMENT_AMT']].values)

### Basic Checks
1. Count number of Current Defaults/Non-Defaults.
2. Check No negative values for Principal, Annuity, Repayment Amounts. 

In [22]:
selected_cols = ['CURR_CUSTOMER_ID', 'TARGET']
sense_check = five_yr_instal.copy()[selected_cols]

sense_check.drop_duplicates(
    inplace=True)
sense_check.shape #number of unique current applicants

(89336, 2)

**Note**: <br> 
I can only perform analysis for current customers with 3 year instalment histories as I filtered for instalment histories which were at most 3 years.

In [23]:
curr_customers_w_5_year_loan_hist = sense_check.CURR_CUSTOMER_ID.unique()
all_customers_w_completed_cash_loans = instal_history_for_completed_loans.CURR_CUSTOMER_ID.unique()

print(f'number of current customers with 5 year instalment histories: \
      {len(curr_customers_w_5_year_loan_hist)}')
      
print(f'number of all current customers with cash completed cash loans: \
      {len(all_customers_w_completed_cash_loans)}')

number of current customers with 5 year instalment histories:       89336
number of all current customers with cash completed cash loans:       91905


In [24]:
pct_default = (sense_check['TARGET'].sum() / len(sense_check['TARGET']))* 100
print(f'Percentage of default: {round(pct_default, 2)}%')

Percentage of default: 8.25%


In [25]:
numeric_cols = ['CURR_PRINCIPAL_AMT', 'CURR_MONTHLY_ANNUITY', 'TOTAL_NUM_OF_PAYMENTS',
               'PREV_MONTHLY_ANNUITY', 'PREV_PRINCIPAL_AMT', 'NUM_INSTALMENT_NUMBER',
               'PREV_MONTHLY_INSTALMENT_AMT', 'PREV_MONTHLY_REPAID_AMT']

five_yr_instal[numeric_cols].describe()

Unnamed: 0,CURR_PRINCIPAL_AMT,CURR_MONTHLY_ANNUITY,TOTAL_NUM_OF_PAYMENTS,PREV_MONTHLY_ANNUITY,PREV_PRINCIPAL_AMT,NUM_INSTALMENT_NUMBER,PREV_MONTHLY_INSTALMENT_AMT,PREV_MONTHLY_REPAID_AMT
count,1505850.0,1505789.0,1505850.0,1505850.0,1505850.0,1505850.0,1505850.0,1505844.0
mean,622692.3,28308.79,15.99891,24080.3,377376.7,8.378943,27563.87,23594.09
std,402337.3,14423.8,8.457464,16820.92,331750.5,6.925293,50253.02,38174.09
min,45000.0,1980.0,1.0,1784.7,45000.0,1.0,1784.7,0.0
25%,284400.0,17784.0,11.0,12596.53,152820.0,3.0,12646.03,10849.86
50%,539590.5,26086.5,13.0,19404.58,269550.0,7.0,19587.78,17803.03
75%,846000.0,36094.5,21.0,29815.15,491580.0,11.0,30152.79,28593.9
max,3150000.0,216589.5,60.0,298557.6,4050000.0,60.0,3202062.0,3202062.0


In [43]:
test_df = five_yr_instal.copy()[
    ['CURR_CUSTOMER_ID', 'PREV_LOAN_ID', 'NUM_INSTALMENT_NUMBER', 'PREV_MONTHLY_INSTALMENT_AMT', 'PREV_MONTHLY_REPAID_AMT']
]
test_df['PREV_MONTHLY_INSTALMENT_AMT'] = np.floor(test_df['PREV_MONTHLY_INSTALMENT_AMT'])
test_df['PREV_MONTHLY_REPAID_AMT'] = np.floor(test_df['PREV_MONTHLY_REPAID_AMT'])

test_df = test_df.groupby(
    ['CURR_CUSTOMER_ID', 'PREV_LOAN_ID', 
     'NUM_INSTALMENT_NUMBER', 'PREV_MONTHLY_INSTALMENT_AMT'])['PREV_MONTHLY_REPAID_AMT'].sum().reset_index()

test_df.rename(
    {'PREV_MONTHLY_REPAID_AMT': 'TOTAL_REPAID_PER_INSTAL'}, 
axis=1, inplace=True)

test_df[(test_df.TOTAL_REPAID_PER_INSTAL - test_df.PREV_MONTHLY_INSTALMENT_AMT) > 0].shape

test_df_2 = test_df.merge(sense_check, how='inner', on=['CURR_CUSTOMER_ID'])

## Compute the following metrics:
1. Sum of Days Past Due for instalments which were at most 5 years old.
2. Sum of Money Owed for instalments which were at most 5 years old.
3. Number of Late Payment streaks for instalments which were at most 5 years old.
4. Number of Consecutive Partial Payment streaks for instalments which were at most 5 years old.
5. Compute loan interest rates.
Note: all these features  are only relevant for curr loan ids in five_yr_instal df

### Total Days Past Due for instalments from previous loans
**PREV_INSTALMENT_PAID_DAY**: The actual day which a borrower paid his or her instalment. <br>
For eg: if **PREV_INSTALMENT_PAID_DAY** is -10, a borrower repaid his or her instalment 10 days prior to current loan application.

**PREV_INSTALMENT_DUE_DAY**: The deadline to pay an instalment. <br>
For eg: if **PREV_INSTALMENT_DUE_DAY** is -20, a borrower needs to repay his or her instalment 20 days prior to current loan application. <br>

In our example, **PREV_INSTALMENT_PAID_DAY** occurred after **PREV_INSTALMENT_DUE_DAY**, the borrower was late in his or her repayment.


In [95]:
five_yr_instal_BU = five_yr_instal.copy()
five_yr_instal_BU['DAYS_PAST_DUE'] = five_yr_instal_BU['PREV_INSTALMENT_PAID_DAY'] \
                                        - five_yr_instal_BU['PREV_INSTALMENT_DUE_DAY']

sum_dpd_by_previous_loans = five_yr_instal_BU.groupby(
    ['CURR_CUSTOMER_ID', 'PREV_LOAN_ID'])['DAYS_PAST_DUE'].sum().reset_index()

In [70]:
sum_dpd_by_curr_customer = sum_dpd_by_previous_loans.groupby(
                                ['CURR_CUSTOMER_ID'])['DAYS_PAST_DUE'].sum().reset_index()

#ensure all curr customer IDs are preserved.
set(sum_dpd_by_curr_customer.CURR_CUSTOMER_ID.unique()) == set(sense_check.CURR_CUSTOMER_ID.unique())

True

In [71]:
#rename and save
new_names = {'DAYS_PAST_DUE': 'TOTAL_PAST_DPD'} #Where DPD == DAYS_PAST_DUE
sum_dpd_by_curr_customer.rename(
    new_names, axis=1, inplace=True)

sum_dpd_by_curr_customer.to_csv(
    output_dir / 'total_prev_dpd_by_customer.csv', index=False)

### Sum of Money Owed
In installments dataset, each instalment payment can be split into more than 1 rows. <br>
To compute total amount of money owed, I do the following:
1. Compute **Total Repaid Per Instalment** via a groupby, store this as a table.
2. Join the table from step 1 with table containing previously approved cash loans.
3. To determine amount of money owed to Home Credit, compute **PREV_MONTHLY_INSTALMENT_AMT - total payment per instalment**. Each Instalment and Amount Annuity is monthly.

In [73]:
five_yr_instal_BU = five_yr_instal.copy()

total_repaid_per_instal = five_yr_instal_BU.groupby(
    ['CURR_CUSTOMER_ID', 'PREV_LOAN_ID', 
     'NUM_INSTALMENT_NUMBER', 'PREV_MONTHLY_INSTALMENT_AMT'])['PREV_MONTHLY_REPAID_AMT'].sum().reset_index()

total_repaid_per_instal.rename(
    {'PREV_MONTHLY_REPAID_AMT': 'TOTAL_REPAID_PER_INSTAL'}, 
axis=1, inplace=True)

In [94]:
total_repaid_per_instal['PREV_MONTHLY_INSTALMENT_AMT'] = round(
    total_repaid_per_instal['PREV_MONTHLY_INSTALMENT_AMT'])

total_repaid_per_instal['TOTAL_REPAID_PER_INSTAL'] = round(
    total_repaid_per_instal['TOTAL_REPAID_PER_INSTAL'])

In [77]:
total_repaid_per_instal['MONEY_OWED_PER_INSTAL'] = (
        total_repaid_per_instal['PREV_MONTHLY_INSTALMENT_AMT'] - 
            total_repaid_per_instal['TOTAL_REPAID_PER_INSTAL'])

In [93]:
total_repaid_per_prev_loan = total_repaid_per_instal.groupby(
    ['CURR_CUSTOMER_ID', 'PREV_LOAN_ID'])['MONEY_OWED_PER_INSTAL'].sum().reset_index()

total_repaid_per_curr_client = total_repaid_per_prev_loan.groupby(
    ['CURR_CUSTOMER_ID'])['MONEY_OWED_PER_INSTAL'].sum().reset_index()

Unnamed: 0,CURR_CUSTOMER_ID,MONEY_OWED_PER_INSTAL
6,100025,-23860.0
10,100039,-172486.0
20,100060,-77823.0
21,100061,-225064.0
30,100101,-11048.0
...,...,...
89300,456151,-877440.0
89314,456200,-10530.0
89315,456201,-35546.0
89326,456227,-38805.0


### Compute Frequency of Late Payments
1. A payment can be 1 late payment or a streak of consecutive late payments. See [source](https://joshdevlin.com/blog/calculate-streaks-in-pandas/) and see **compute_total_late_payment** function.
2. Create a new column called is_late.
3. Based on our formula for Days Past Due, is_late if True is Days Past Due is Positive.

In [188]:
def compute_total_late_payment(instal_info:pd.DataFrame) -> pd.DataFrame:
    """
        Computes number of late payments (can be just 1 late payment or a streak of late payments)
        for just 1 previous loan as denoted by SK_ID_PREV.
        
        Parameters:
        -----------
            instal_info (pd.DataFrame): 
                A dataframe of instalment history for a current borrower for 1 previous loan.
                instal_info must contain the following information:
                    1. SK_ID_CURR.
                    2. SK_ID_PREV.
                    3. IS_LATE.
        
        Returns:
       ----------
       A DataFrame containing 3 pieces of information:
       1. Cumulative count of streaks of late payments for a previous loan.
       2. SK_ID_CURR.
       3. SK_ID_PREV.
       
    """
    instal_info['grouper'] = (instal_info.IS_LATE != instal_info.IS_LATE.shift()).cumsum()
    subset = instal_info[['grouper', 'IS_LATE']]
    
    subset['streaks_count'] = subset.groupby('grouper').cumsum()
    subset['start_of_streak'] = (subset['streaks_count'] == 1) | (subset['streaks_count'] == 0)
    subset['end_of_streak'] = (subset['streaks_count'].shift(-1) == 1) | (subset['streaks_count'].shift(-1) == 0)| (subset['streaks_count'].shift(-1).isnull())
    
    total_late_payments = subset[subset['end_of_streak']]['streaks_count'].sum()
    
    sk_id_curr = instal_info['SK_ID_CURR'].unique()[0]
    sk_id_prev = instal_info['SK_ID_PREV'].unique()[0]
    
    new_data = {'SK_ID_CURR': [sk_id_curr], 
                'SK_ID_PREV': [sk_id_prev], 
                'total_late_payment': [total_late_payments]}
    
    return pd.DataFrame.from_dict(new_data)

In [173]:
cols_to_show = ['SK_ID_CURR', 'SK_ID_PREV', 'DAYS_PAST_DUE']
instal_copy = three_yr_instal_1.copy()[cols_to_show]
instal_copy['IS_LATE'] = instal_copy['DAYS_PAST_DUE'] > 0
instal_copy.head()

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,DAYS_PAST_DUE,IS_LATE
0,100003,1810518,-3.0,False
1,100003,1810518,-4.0,False
2,100003,1810518,-5.0,False
3,100003,1810518,-3.0,False
4,100003,1810518,-4.0,False


In [189]:
"""

total_late_payments = []

count = 0
for sk_id_prev in instal_copy.SK_ID_PREV.unique():
    prev_loan_df = instal_copy[instal_copy.SK_ID_PREV == sk_id_prev]
    late_payment_df = compute_total_late_payment(prev_loan_df[['SK_ID_PREV', 'SK_ID_CURR', 'IS_LATE']])
    total_late_payments.append(late_payment_df)
    count += 1
    if count % 10_000 == 0:
        print(count)

late_payments_df = pd.concat(total_late_payments)
late_payments_df.head()
"""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


Unnamed: 0,SK_ID_CURR,SK_ID_PREV,total_late_payment
0,100003,1810518,0
0,100006,2078043,0
0,100007,2001242,0
0,100007,1692033,0
0,100007,1940724,2


In [193]:
late_payments_df.rename({'total_late_payment': 'total_late_payments_freq'}, axis=1, inplace=True)
late_payments_df.to_csv(output_dir / 'late_instal_payments_count.csv', index=False)
late_payments_df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,total_late_payments_freq
0,100003,1810518,0
0,100006,2078043,0
0,100007,2001242,0
0,100007,1692033,0
0,100007,1940724,2


### Compute Frequency of Partial Repayment of Instalments
1. Computation is similar to Frequency of Late Payments.
2. I only considered amount of money owed which exceed 100 dollars.

In [220]:
def compute_total_partial_payment(instal_info:pd.DataFrame) -> pd.DataFrame:
    """
        Computes number of partial instalment payments (can be just 1 partial payment or a streak of partial payments)
        for just 1 previous loan as denoted by SK_ID_PREV.
        
        Parameters:
        -----------
            instal_info (pd.DataFrame): 
                A dataframe of instalment history for a current borrower for 1 previous loan.
                instal_info must contain the following information:
                    1. SK_ID_CURR.
                    2. SK_ID_PREV.
                    3. IS_PARTIAL_REPAYMENT.
        
        Returns:
       ----------
       A DataFrame containing 3 pieces of information:
       1. Cumulative count of streaks of partial payments for a previous loan.
       2. SK_ID_CURR.
       3. SK_ID_PREV.
       
    """
    instal_info['grouper'] = (instal_info.IS_PARTIAL_REPAYMENT != instal_info.IS_PARTIAL_REPAYMENT.shift()).cumsum()
    subset = instal_info[['grouper', 'IS_PARTIAL_REPAYMENT']]
    
    subset['streaks_count'] = subset.groupby('grouper').cumsum()
    subset['start_of_streak'] = (subset['streaks_count'] == 1) | (subset['streaks_count'] == 0)
    subset['end_of_streak'] = (subset['streaks_count'].shift(-1) == 1) | (subset['streaks_count'].shift(-1) == 0)| (subset['streaks_count'].shift(-1).isnull())
    
    total_late_payments = subset[subset['end_of_streak']]['streaks_count'].sum()
    
    sk_id_curr = instal_info['SK_ID_CURR'].unique()[0]
    sk_id_prev = instal_info['SK_ID_PREV'].unique()[0]
    
    new_data = {'SK_ID_CURR': [sk_id_curr], 
                'SK_ID_PREV': [sk_id_prev], 
                'total_partial_payment': [total_late_payments]}
    
    return pd.DataFrame.from_dict(new_data)

In [206]:
cols_to_keep = ['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER', 'MONEY_OWED']
partial_repay = instal_and_prev_apps.copy()[cols_to_keep]
partial_repay['IS_PARTIAL_REPAYMENT'] = partial_repay.MONEY_OWED > 100

In [225]:
total_partial_payments = []

count = 0
for sk_id_prev in partial_repay.SK_ID_PREV.unique():
    repay_df = partial_repay[partial_repay.SK_ID_PREV == sk_id_prev]
    partial_payment_df = compute_total_partial_payment(repay_df[['SK_ID_PREV', 'SK_ID_CURR', 'IS_PARTIAL_REPAYMENT']])
    total_partial_payments.append(partial_payment_df)
    count += 1
    if count % 10_000 == 0:
        print(count)

partial_payments_df = pd.concat(total_partial_payments)
partial_payments_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


Unnamed: 0,SK_ID_CURR,SK_ID_PREV,total_partial_payment
0,198678,1000011,0
0,277601,1000027,0
0,198771,1000050,0
0,217553,1000067,0
0,330786,1000082,0


In [234]:
partial_payments_df.to_csv(output_dir/'partial_payments_count.csv', index=False)