In [1]:
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

In [2]:
warnings.filterwarnings('ignore')

In [3]:
sample = pd.read_csv('SampleSubmission.csv')
sample

Unnamed: 0,customerid,Good_Bad_flag
0,8a28afc7474813a40147639ec637156b,1
1,8a3735d5518aba7301518ac34413010d,1
2,8a76e7d443e6e97c0143ed099d102b1d,1
3,8a818823525dceef01525deda2480384,1
4,8a818926522ea5ef01523aff15c37482,1
...,...,...
1445,8a858fff5a36fe68015a3744f0021e89,1
1446,8aaae7a74400b28201441c8b62514150,1
1447,8aab10f748cf78ff0148d11fac1447a6,1
1448,8aab160f499477da014999ba2f0f578b,1


In [5]:
demographic = pd.read_csv('traindemographics.csv')
demographic.head()

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10 00:00:00.000000,Savings,3.319219,6.528604,GT Bank,,,
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21 00:00:00.000000,Savings,3.325598,7.119403,Sterling Bank,,Permanent,
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01 00:00:00.000000,Savings,5.7461,5.563174,Fidelity Bank,,,
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19 00:00:00.000000,Savings,3.36285,6.642485,GT Bank,,Permanent,
4,8a858e785acd3412015acd48f4920d04,1982-11-22 00:00:00.000000,Savings,8.455332,11.97141,GT Bank,,Permanent,


Description of demographic data:
- customerid (Primary key used to merge to other data)
- birthdate (date of birth of the customer)
- bank_account_type (type of primary bank account)
- longitude_gps
- latitude_gps
- bank_name_clients (name of the bank)
- bank_branch_clients (location of the branch - not compulsory - so missing in a lot of the cases)
- employment_status_clients (type of employment that customer has)
- level_of_education_clients (highest level of education)


In [6]:
performance = pd.read_csv('trainperf.csv')
performance.head()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good
1,8a85886e54beabf90154c0a29ae757c0,301965204,2,2017-07-05 17:04:41.000000,2017-07-05 16:04:18.000000,15000.0,17250.0,30,,Good
2,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57.000000,2017-07-06 13:52:51.000000,20000.0,22250.0,15,,Good
3,8a85890754145ace015429211b513e16,301999343,3,2017-07-27 19:00:41.000000,2017-07-27 18:00:35.000000,10000.0,11500.0,15,,Good
4,8a858970548359cc0154883481981866,301962360,9,2017-07-03 23:42:45.000000,2017-07-03 22:42:39.000000,40000.0,44000.0,30,,Good


Performance data (trainperf.csv) : This is the repeat loan that the customer has taken for which we need to predict the performance of. Basically, we need to predict whether this loan would default given all previous loans and demographics of a customer.
Description of data:
- customerid (Primary key used to merge to other data)
- systemloanid (The id associated with the particular loan. The same customerId can have multiple systemloanid’s for each loan he/she has taken out)
- loannumber (The number of the loan that you have to predict)
- approveddate (Date that loan was approved)
- creationdate (Date that loan application was created)
- loanamount (Loan value taken)
- totaldue (Total repayment required to settle the loan - this is the capital loan value disbursed +interest and fees)
- termdays (Term of loan)
- referredby (customerId of the customer that referred this person - is missing, then not referred)
- good_bad_flag (good = settled loan on time; bad = did not settled loan on time) - this is the target variable that we need to predict

In [7]:
previous = pd.read_csv('trainprevloans.csv')
previous.head()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,2016-08-15 18:22:40.000000,2016-08-15 17:22:32.000000,10000.0,13000.0,30,2016-09-01 16:06:48.000000,,2016-09-14 00:00:00.000000,2016-09-01 15:51:43.000000
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,2017-04-28 18:39:07.000000,2017-04-28 17:38:53.000000,10000.0,13000.0,30,2017-05-28 14:44:49.000000,,2017-05-30 00:00:00.000000,2017-05-26 00:00:00.000000
2,8a2a81a74ce8c05d014cfb32a0da1049,301831714,8,2017-03-05 10:56:25.000000,2017-03-05 09:56:19.000000,20000.0,23800.0,30,2017-04-26 22:18:56.000000,,2017-04-04 00:00:00.000000,2017-04-26 22:03:47.000000
3,8a8588f35438fe12015444567666018e,301861541,5,2017-04-09 18:25:55.000000,2017-04-09 17:25:42.000000,10000.0,11500.0,15,2017-04-24 01:35:52.000000,,2017-04-24 00:00:00.000000,2017-04-24 00:48:43.000000
4,8a85890754145ace015429211b513e16,301941754,2,2017-06-17 09:29:57.000000,2017-06-17 08:29:50.000000,10000.0,11500.0,15,2017-07-14 21:18:43.000000,,2017-07-03 00:00:00.000000,2017-07-14 21:08:35.000000


Previous loans data (trainprevloans.csv) : This dataset contains all previous loans that the customer had prior to the loan above that we want to predict the performance of. Each loan will have a different systemloanid, but the same customerid for each customer.
Description of data:
- customerid (Primary key used to merge to other data)
- systemloanid (The id associated with the particular loan. The same customerId can have multiple systemloanid’s for each loan he/she has taken out)
- loannumber (The number of the loan that you have to predict)
- approveddate (Date that loan was approved)
- creationdate (Date that loan application was created)
- loanamount (Date that loan application was created)
- totaldue (Total repayment required to settle the loan - this is the capital loan value disbursed +interest and fees) termdays (Term of loan)
- closeddate (Date that the loan was settled)
- referredby (customerId of the customer that referred this person - is missing, then not refrerred)
- firstduedate (Date of first payment due in cases where the term is longer than 30 days. So in the case where the term is 60+ days - then there are multiple monthly payments due - and this dates reflects the date of the first payment)
- firstrepaiddate (Actual date that he/she paid the first payment as defined above)

In [8]:
customerCheck = demographic.assign(InPerformance=demographic.customerid.isin(performance.customerid), InPrevious=demographic.customerid.isin(previous.customerid))
customerCheck = customerCheck[['customerid','InPerformance','InPrevious']]

In [9]:
demographic.assign(InPerformance=demographic.customerid.isin(performance.customerid), InPrevious=demographic.customerid.isin(previous.customerid))


Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients,InPerformance,InPrevious
0,8a858e135cb22031015cbafc76964ebd,1973-10-10 00:00:00.000000,Savings,3.319219,6.528604,GT Bank,,,,True,True
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21 00:00:00.000000,Savings,3.325598,7.119403,Sterling Bank,,Permanent,,True,True
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01 00:00:00.000000,Savings,5.746100,5.563174,Fidelity Bank,,,,True,True
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19 00:00:00.000000,Savings,3.362850,6.642485,GT Bank,,Permanent,,True,True
4,8a858e785acd3412015acd48f4920d04,1982-11-22 00:00:00.000000,Savings,8.455332,11.971410,GT Bank,,Permanent,,False,False
...,...,...,...,...,...,...,...,...,...,...,...
4341,8a858f155554552501555588ca2b3b40,1985-12-13 00:00:00.000000,Other,3.236753,7.030168,Stanbic IBTC,,Permanent,Graduate,True,True
4342,8a858fc65cf978f4015cf97cee3a02ce,1982-07-01 00:00:00.000000,Savings,7.013750,4.875662,GT Bank,,,,True,True
4343,8a858f4f5b66de3a015b66fc83c61902,1989-09-26 00:00:00.000000,Savings,6.295530,7.092508,GT Bank,,Permanent,,False,False
4344,8aaae7a74400b28201441c8b62514150,1985-09-06 00:00:00.000000,Savings,3.354206,6.539070,GT Bank,HEAD OFFICE,Permanent,Primary,False,False


In [10]:
customerCheck

Unnamed: 0,customerid,InPerformance,InPrevious
0,8a858e135cb22031015cbafc76964ebd,True,True
1,8a858e275c7ea5ec015c82482d7c3996,True,True
2,8a858e5b5bd99460015bdc95cd485634,True,True
3,8a858efd5ca70688015cabd1f1e94b55,True,True
4,8a858e785acd3412015acd48f4920d04,False,False
...,...,...,...
4341,8a858f155554552501555588ca2b3b40,True,True
4342,8a858fc65cf978f4015cf97cee3a02ce,True,True
4343,8a858f4f5b66de3a015b66fc83c61902,False,False
4344,8aaae7a74400b28201441c8b62514150,False,False


In [11]:
customerCheck.groupby(['InPerformance','InPrevious']).value_counts()

AttributeError: 'DataFrameGroupBy' object has no attribute 'value_counts'

In [12]:
demographic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4346 entries, 0 to 4345
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4346 non-null   object 
 1   birthdate                   4346 non-null   object 
 2   bank_account_type           4346 non-null   object 
 3   longitude_gps               4346 non-null   float64
 4   latitude_gps                4346 non-null   float64
 5   bank_name_clients           4346 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   3698 non-null   object 
 8   level_of_education_clients  587 non-null    object 
dtypes: float64(2), object(7)
memory usage: 305.7+ KB


In [None]:
performance.info()

In [None]:
previous.info()

In [None]:
previous.head()

In [None]:
performance.head()

In [13]:
performance.loc[performance['customerid'] == '8a858e105bd92644015bd9db3a0f3be2']

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
2899,8a858e105bd92644015bd9db3a0f3be2,301981450,3,2017-07-17 09:50:07.000000,2017-07-17 08:50:00.000000,10000.0,13000.0,30,,Good


In [None]:
previous.loc[previous['customerid'] == '8a858e105bd92644015bd9db3a0f3be2']

In [None]:
performance.loc[performance['customerid'] == '8a858e105bd92644015bd9db3a0f3be2']

In [15]:
duplicate = demographic[demographic.duplicated()]
 
print("Duplicate Rows :")
 
# Print the resultant Dataframe
 duplicate

Duplicate Rows :


12

In [16]:
demographic.drop_duplicates(inplace=True)

In [17]:
customerCheck = demographic.assign(InPerformance=demographic.customerid.isin(performance.customerid), InPrevious=demographic.customerid.isin(previous.customerid))
customerCheck = customerCheck[['customerid','InPerformance','InPrevious']]

In [18]:
customerCheck.groupby(['InPerformance','InPrevious']).value_counts()


AttributeError: 'DataFrameGroupBy' object has no attribute 'value_counts'

Create the first model which would predict for non-new customers based on demographic and previous loans data.
Create the second model which predict for new customers based on just demographic data.

### Determining how the dataframes are going to be concatenated.

#### Checking the common columns between the data sets

##### customerid

Performance dataframe has 18183 entries and previous has 4368 entries.

In [19]:
performanceids = pd.DataFrame(performance['customerid'])
performanceids

Unnamed: 0,customerid
0,8a2a81a74ce8c05d014cfb32a0da1049
1,8a85886e54beabf90154c0a29ae757c0
2,8a8588f35438fe12015444567666018e
3,8a85890754145ace015429211b513e16
4,8a858970548359cc0154883481981866
...,...
4363,8a858e6d58b0cc520158beeb14b22a5a
4364,8a858ee85cf400f5015cf44ab1c42d5c
4365,8a858f365b2547f3015b284597147c94
4366,8a858f935ca09667015ca0ee3bc63f51


In [20]:
performanceids.groupby('customerid').value_counts().nlargest()

AttributeError: 'DataFrameGroupBy' object has no attribute 'value_counts'

In [21]:
previousids = pd.DataFrame(previous['customerid'])
previousids

Unnamed: 0,customerid
0,8a2a81a74ce8c05d014cfb32a0da1049
1,8a2a81a74ce8c05d014cfb32a0da1049
2,8a2a81a74ce8c05d014cfb32a0da1049
3,8a8588f35438fe12015444567666018e
4,8a85890754145ace015429211b513e16
...,...
18178,8a858899538ddb8e0153a2b555421fc5
18179,8a858899538ddb8e0153a2b555421fc5
18180,8a858899538ddb8e0153a2b555421fc5
18181,8a858f0656b7820c0156c92ca3ba436f


In [22]:
previousids.groupby('customerid').value_counts().nlargest()

AttributeError: 'DataFrameGroupBy' object has no attribute 'value_counts'

From the analysis above, one can see that the most times a customer appears in the performance data is once, whereas the most times a customer appears in the previous data is 26 times. This means there is a one to many relationship between the two dataframes with previous being on the many side. This means it is more ideal to add the 'performance' dataframe to the previous dataframe.

##### adding the performance dataframe to the previous loans dataframe

Between the two dataframes,there are a number of similar columns. To add the two dataframes together, the column names of the incoming dataframe,'performance', will be changed to be able to distinguish the columns.

In [23]:
performance.columns

Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'referredby',
       'good_bad_flag'],
      dtype='object')

In [24]:
previous.columns

Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'closeddate',
       'referredby', 'firstduedate', 'firstrepaiddate'],
      dtype='object')

From the cells above, we can see the similar columns: 
customerid, systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays and referred by. 
All of these columns,apart from customerid and systemloanid, will be renamed now.
Customerid and systemloanid will not be renamed because they are going to be dropped ahead.

In [25]:
performance.columns = ['customerid','systemloanid','perf_loannumber','perf_approveddate','perf_creationdate','perf_loanamount','perf_totaldue','perf_termdays','perf_referredby','good_bad_flag']

In [26]:
performance.columns

Index(['customerid', 'systemloanid', 'perf_loannumber', 'perf_approveddate',
       'perf_creationdate', 'perf_loanamount', 'perf_totaldue',
       'perf_termdays', 'perf_referredby', 'good_bad_flag'],
      dtype='object')

In [27]:
previous.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18183 entries, 0 to 18182
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customerid       18183 non-null  object 
 1   systemloanid     18183 non-null  int64  
 2   loannumber       18183 non-null  int64  
 3   approveddate     18183 non-null  object 
 4   creationdate     18183 non-null  object 
 5   loanamount       18183 non-null  float64
 6   totaldue         18183 non-null  float64
 7   termdays         18183 non-null  int64  
 8   closeddate       18183 non-null  object 
 9   referredby       1026 non-null   object 
 10  firstduedate     18183 non-null  object 
 11  firstrepaiddate  18183 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 1.7+ MB


In [28]:
previousCustomers = pd.merge(previous, performance, on='customerid')

## previousCustomers

In [29]:
previousCustomers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18183 entries, 0 to 18182
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customerid         18183 non-null  object 
 1   systemloanid_x     18183 non-null  int64  
 2   loannumber         18183 non-null  int64  
 3   approveddate       18183 non-null  object 
 4   creationdate       18183 non-null  object 
 5   loanamount         18183 non-null  float64
 6   totaldue           18183 non-null  float64
 7   termdays           18183 non-null  int64  
 8   closeddate         18183 non-null  object 
 9   referredby         1026 non-null   object 
 10  firstduedate       18183 non-null  object 
 11  firstrepaiddate    18183 non-null  object 
 12  systemloanid_y     18183 non-null  int64  
 13  perf_loannumber    18183 non-null  int64  
 14  perf_approveddate  18183 non-null  object 
 15  perf_creationdate  18183 non-null  object 
 16  perf_loanamount    181

Dropping the systemloanid columns

In [None]:
previousCustomers.drop(['systemloanid_x','systemloanid_y'],axis=1,inplace=True)
previousCustomers.info()

In [None]:
previousCustomers = pd.merge(previousCustomers, demographic, on='customerid')

##### adding demographic data to previous customers dataframe

In [None]:
previousCustomers.info()

##### looking at the data contained in the columns now to determine what columns will be used to train the model

*columns containing non-numerical data*

In [None]:
categorical = [var for var in previousCustomers.columns if previousCustomers[var].dtype=='O']

print('There are {} categorical variabes \n'.format(len(categorical)))

print('They are: ', categorical)

In [None]:
previousCustomers[categorical].head()

In [None]:
#check cardinality of the columns
for var in categorical:
    print(var, 'contains', len(previousCustomers[var].unique()), 'unique values')

approveddate (Date that loan was approved)
creationdate (Date that loan application was created)
loanamount (Date that loan application was created)
totaldue (Total repayment required to settle the loan - this is the capital loan value disbursed +interest and fees) termdays (Term of loan)
closeddate (Date that the loan was settled)
referredby (customerId of the customer that referred this person - is missing, then not refrerred)
firstduedate (Date of first payment due in cases where the term is longer than 30 days. So in the case where the term is 60+ days - then there are multiple monthly payments due - and this dates reflects the date of the first payment)
firstrepaiddate (Actual date that he/she paid the first payment as defined above)

##### customer id column: Primary key used to merge to other data

customer id column has an averagely high cardinality and is used to uniquely identify the customers. This feature will not be used for model training.

##### approved date column : Date that loan was approved

approved date column has a high cardinality. It will not be used because we don't believe the approval data for a loan holds a significance towards whether a loan may be bad or good.

##### creation date column : Date that loan application was created

creation date column has a high cardinality. It will not be selected as feature to train the model because we do not believe the creation date for a loan holds a significance towards whether a loan may be bad or good

##### closed date column : Date that the loan was settled

closed date column has a high cardinality. Since we have a loan creation date, we can arrive at how long the loan was active until it was settled. This time period can then be used as a new feature to train the model. Let's look more at the closed dates and see if there is a relationship  between the closing dates and whether a loan may be good or bad.

In [None]:
closedDates = previousCustomers[['closeddate','good_bad_flag']]
closedDates

 going to break down the date and individual components, ie year, month and day and time. Then proceed to gain insights into the relationship between date/time and bad/good loans

In [None]:
closedDates['closeddate'] = pd.to_datetime(closedDates['closeddate'])

In [None]:
closedDates.info()

In [None]:
closedDates['year'] = closedDates['closeddate'].dt.year
closedDates.head()

In [None]:
closedDates['month'] = closedDates['closeddate'].dt.month
closedDates.head()

In [None]:
closedDates['day'] = closedDates['closeddate'].dt.day
closedDates.head()

In [None]:
closedDates['hourOfDay'] = closedDates['closeddate'].dt.hour
closedDates.head()

In [None]:
closedDates['hourOfDay'].unique()

the hour is recorded in 24 hour clock which is desirable so we can differentiate between the times i.e 3 in the morning and 3 in the evening.

In [None]:
year_group = closedDates.groupby(['year'])['good_bad_flag'].value_counts()
year_group

In [None]:

#plt.subplot(4,1,1)
#yearLook = closedDates[['year', 'good_bad_flag']]
closedDates.groupby(['year','good_bad_flag']).size().unstack().plot(kind='bar',stacked=True)
plt.xlabel('Years')
plt.ylabel('Number of bad/good loans')

plt.title('A look at the number of good and bad loans through the years 2016 and 2017')

#plt.subplot(4,1,2)
#plt.title('A look at the number of good and bad loans throughout the years')

f = plt.gcf()
f.set_figwidth(5) 

In [None]:
closedDates.groupby(['month','good_bad_flag']).size().unstack().plot(kind='bar',stacked=True,rot=0)
plt.xlabel('Months of the year')
plt.ylabel('Number of bad/good loans')
plt.title('A look at the number of good and bad loans through the months of the year')

f = plt.gcf()
f.set_figwidth(5) 

In [None]:
closedDates.groupby(['day','good_bad_flag']).size().unstack().plot(kind='bar',stacked=True,rot=0)
plt.xlabel('Months of the year')
plt.ylabel('Number of bad/good loans')
plt.title('A look at the number of good and bad loans through the days of a month')

f = plt.gcf()
f.set_figwidth(15) 

In [None]:
plt.figure(figsize=(15,30))
closedDates.groupby(['hourOfDay','good_bad_flag']).size().unstack().plot(kind='bar',stacked=True)
plt.xlabel('Years')
plt.ylabel('Number of bad/good loans')

plt.title('A look at the number of good and bad loans through the days of a month')

f = plt.gcf()
f.set_figwidth(15) 

I do not think that the year serves as a significant feature to train this model and therefore it will not be used going forward. The months, day and hour of the day serve as good features to use to train this model.

Let's also look at the duration of the lifetime of the loan i.e from when it was created to when it was fully paid back. 

In [None]:
previousCustomers['closeddate'] = pd.to_datetime(previousCustomers['closeddate'])
previousCustomers['creationdate'] = pd.to_datetime(previousCustomers['creationdate'])

In [None]:
previousCustomers.info()

In [None]:
previousCustomers['loanlifewithdelta'] = ((previousCustomers.closeddate - previousCustomers.creationdate)/np.timedelta64(1, 'D'))
previousCustomers['loanlifenodelta'] = previousCustomers.closeddate - previousCustomers.creationdate
previousCustomers.head()

We will use the loanlifewithdelta column as a new feature to train our model with

In [None]:
previousCustomers.drop(['loanlifenodelta'],axis=1, inplace=True)

##### referredby column

In [None]:
previousCustomers['referredby'].isnull().sum()

In [None]:
len(previousCustomers['referredby'].unique())

In [None]:
previousCustomers['referredby'].unique()

The referredby column has a high cardinality and the column value doesn't particularly give any relevance to the model. The details of the referee may be useful though and so will be considered.

##### first_due_date and first_repaid_date columns

This columns individually don't provide significant features but when used together might prove to be more useful.
The difference between the two columns can tell us if a customer was prompt on making their payments.

In [None]:
loanrepayment = previousCustomers[['firstduedate','firstrepaiddate','good_bad_flag']]

In [None]:
loanrepayment.head()

In [None]:
loanrepayment['firstduedate'] = pd.to_datetime(loanrepayment['firstduedate'])
loanrepayment['firstrepaiddate'] = pd.to_datetime(loanrepayment['firstrepaiddate'])

In [None]:
loanrepayment['firstrepaymentlapsewithdelta'] = ((loanrepayment.firstduedate - loanrepayment.firstrepaiddate)/np.timedelta64(1, 'D'))
loanrepayment['firstrepaymentlapsewithnodelta'] = loanrepayment.firstduedate - loanrepayment.firstrepaiddate
loanrepayment.head()

In [None]:
##thinking of having a column showing the days and another column saying whether it was before or after the due date

The next cells are looking at the data from the performance dataframe. Those columns that were also present in the previous dataframe and were dropped as features above, will also be dropped as features below. 

In [None]:
previousCustomers.info()

In [None]:
len(previousCustomers['perf_loannumber'].unique())

In [None]:
previousCustomers['perf_loannumber'].unique()

In [None]:
testperf = pd.read_csv('testperf.csv')
testperf['loannumber'].unique()

In [None]:
len(testperf['loannumber'].unique())

##### stopped on trying to ascertain whether this feature can be used for model training.