# Simulate loan application data

This notebook simulates loan application data and exports it to CSV, suitable for use with the PyConZA 2017 presentation notebook.

The simulation creates the following fields:

    - total_liabilities_in_base_currency
    - default_gbp_rate
    - average_3_month_salary
    - average_3_month_salary_gbp_rate
    - country_of_habitual_residence
    - latest_step
    - other_assets
    - other_assets_gbp_rate
    - payback_period
    - pre_study_country
    - property_value
    - property_value_gbp_rate
    - school_type
    - school_university_name

In [1]:
import pandas as pd
import numpy as np

In [2]:
def simulate_borrower_application_data(n_rows, seed=None):
    """Create simulated loan application data.

    Parameters
    ----------
    n_rows : int
        Number of application rows to create. ``0`` yields an empty
        DataFrame that still carries all columns.
    seed : int, optional
        When given, every random draw comes from a private
        ``np.random.RandomState(seed)``, so the result is reproducible and
        the global NumPy random state is not touched. When ``None``
        (the default) the global ``np.random`` state is used.

    Returns
    -------
    DataFrame
        DataFrame of simulated borrower application data, with float
        columns rounded to 3 decimals, and columns:
            total_liabilities_in_base_currency
            default_gbp_rate
            average_3_month_salary
            average_3_month_salary_gbp_rate
            country_of_habitual_residence
            latest_step
            other_assets
            other_assets_gbp_rate
            payback_period
            pre_study_country
            property_value
            property_value_gbp_rate
            school_type
            school_university_name

    Notes
    -----
    The four monetary columns contain NaNs; the two country columns and
    ``school_university_name`` contain ``None`` values, because ``None`` is
    one of the choices they are sampled from.
    """
    # Both the ``np.random`` module and a ``RandomState`` instance expose
    # ``normal`` and ``randint``, so either can serve as the draw source.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def blank_out_random_rows(values, one_in):
        # Overwrite roughly 1/one_in of the entries (at least one) with NaN.
        # Positions are drawn with replacement, so duplicates make the actual
        # NaN count at most -- not exactly -- n_rows // one_in.
        if n_rows == 0:
            # randint(0, 0, ...) would raise; there is nothing to blank out.
            return
        values[rng.randint(0, n_rows, max(n_rows // one_in, 1))] = np.nan

    # create randomish float type borrower info
    # (absolute value of a normal draw keeps every amount non-negative)
    total_liabilities_in_base_currency = np.abs(rng.normal(6000., 17000., n_rows))
    average_3_month_salary = np.abs(rng.normal(1800., 800., n_rows))
    other_assets = np.abs(rng.normal(400., 600., n_rows))
    property_value = np.abs(rng.normal(1200., 700., n_rows))

    # add some NaNs
    blank_out_random_rows(property_value, 6)
    blank_out_random_rows(other_assets, 10)
    blank_out_random_rows(average_3_month_salary, 8)
    blank_out_random_rows(total_liabilities_in_base_currency, 6)

    # create random float type exchange rate info
    default_gbp_rate = np.abs(rng.normal(1., 0.3, n_rows))
    average_3_month_salary_gbp_rate = np.abs(rng.normal(1., 0.3, n_rows))
    other_assets_gbp_rate = np.abs(rng.normal(1., 0.3, n_rows))
    property_value_gbp_rate = np.abs(rng.normal(1., 0.3, n_rows))

    # create randomish two letter country code string type borrower info
    # NOTE: every entry needs its trailing comma -- adjacent string literals
    # are silently concatenated ('MG' 'HU' would become 'MGHU').
    country_values = np.array([
       'GB', 'PT', 'FR', 'IN', 'US', 'CN', 'CH', 'AR', 'LU', 'ZA', 'MG',
       'HU', 'RS', 'BR', 'SG', 'RU', 'GR', 'PE', 'LT', 'CL', 'JO', 'KE',
       'GH', 'HK', 'AU', 'DE', 'CO', 'AE', 'VN', 'EC', 'MX', 'BB', 'ES',
       'VE', 'DK', 'NG', 'PK', 'IE', 'IL', 'IT', 'ID', 'NE', 'CR', 'CA',
       'AO', 'NL', 'UA', 'CI', 'ZW', 'KR', 'ET', 'PH', 'EG', 'LB', 'BD',
       'PR', 'GE', 'AF', 'TR', 'TN', 'HN', 'AM', 'SA', 'AZ', 'MD', 'RO',
       'SE', 'JM', 'BG', 'SS', 'CY', 'AT', 'PL', 'UZ', 'MU', 'IS', 'MM',
       'TH', 'KZ', 'KG', 'UG', 'TW', 'DZ', 'MY', 'CM', 'BE', 'MN', 'JP',
       'CZ', 'GT', 'MA', 'LR', 'QA', 'NO', 'ML', 'NP', 'NZ', 'UY', 'HT',
       'IM', 'SV', 'SZ', 'SN', 'BO', 'IQ', 'TZ', 'LK', 'TT', 'LV', 'PG',
       'HR', 'IR', 'MZ', 'SD', 'KY', 'OM', 'BY', 'KW', 'RW', 'AL', 'BW',
       'LS', 'GN', 'KH', 'SK', 'GA', 'FI', 'BM', 'PY', 'BJ', 'SO', 'MT',
       'SI', 'MK', 'SL', 'DO', 'EE', 'PA', 'TJ', 'VC', 'FJ', 'NI', 'GM',
       'SY', 'LY', 'CD', 'AD', 'CW', 'ZM', 'MV', 'BF', 'BA', 'BH', 'YE',
       'CG', 'PS', 'TG', 'NC', 'BN', 'MF', 'AW', 'KP', 'MW', 'PW', 'LA',
       'BS', 'AS', 'DJ', 'AX', 'KI', None
    ])
    n_countries = len(country_values)
    country_of_habitual_residence = country_values[rng.randint(0, n_countries, n_rows)]
    pre_study_country = country_values[rng.randint(0, n_countries, n_rows)]

    # create randomish school string type borrower info
    school_type_values = np.array(['Business', 'Engineering'])
    n_school_types = len(school_type_values)
    school_type = school_type_values[rng.randint(0, n_school_types, n_rows)]
    school_name_values = np.array([
       'Stanford University', 'University of Pennsylvania',
       'University of Navarra', 'University of Chicago',
       'IE University', 'University of Oxford', 'University of Cambridge',
       'Columbia University', 'University of Toronto',
       'University of Michigan', 'University of Connecticut',
       'Boston College', 'McGill University', 'Cranfield University',
       'York University', 'Harvard University',
       'University of Southern California', 'Cornell University',
       'Ramon Llull University', 'New York University',
       'Lancaster University', 'University of Illinois Urbana-Champaign',
       'Carnegie Mellon University', 'University of British Columbia',
       'Babson College', 'Georgetown University', 'Boston University',
       'City University', 'Dartmouth College', 'University of Cape Town',
       'University of Strathclyde', None
    ])
    n_school_names = len(school_name_values)
    school_university_name = school_name_values[rng.randint(0, n_school_names, n_rows)]

    # create randomish integer type borrower info
    latest_step = rng.randint(0, 10, n_rows)
    # payback periods are expressed as 12 * {5, 10, 15, 20}
    payback_period_values = np.array([5*12, 10*12, 15*12, 20*12])
    n_payback_periods = len(payback_period_values)
    payback_period = payback_period_values[rng.randint(0, n_payback_periods, n_rows)]

    r = pd.DataFrame({
        'total_liabilities_in_base_currency': total_liabilities_in_base_currency,
        'average_3_month_salary': average_3_month_salary,
        'other_assets': other_assets,
        'property_value': property_value,
        'default_gbp_rate': default_gbp_rate,
        'average_3_month_salary_gbp_rate': average_3_month_salary_gbp_rate,
        'other_assets_gbp_rate': other_assets_gbp_rate,
        'property_value_gbp_rate': property_value_gbp_rate,
        'latest_step': latest_step,
        'payback_period': payback_period,
        'country_of_habitual_residence': country_of_habitual_residence,
        'pre_study_country': pre_study_country,
        'school_type': school_type,
        'school_university_name': school_university_name
        })

    return r.round(3)

In [3]:
# Build the 20,000-row simulated dataset used for the talk
df_for_pyconza_talk = simulate_borrower_application_data(n_rows=20000)

In [4]:
# Preview the first five simulated applications
df_for_pyconza_talk.head(n=5)

Unnamed: 0,average_3_month_salary,average_3_month_salary_gbp_rate,country_of_habitual_residence,default_gbp_rate,latest_step,other_assets,other_assets_gbp_rate,payback_period,pre_study_country,property_value,property_value_gbp_rate,school_type,school_university_name,total_liabilities_in_base_currency
0,1458.997,0.866,ET,0.474,2,541.529,1.267,180,IM,325.148,0.555,Engineering,New York University,18810.277
1,2177.916,1.299,CN,1.337,6,757.306,0.802,180,MK,1668.911,1.362,Business,University of Pennsylvania,11203.819
2,1445.805,0.858,TW,0.923,6,290.339,0.579,240,PH,944.007,1.168,Engineering,University of Michigan,
3,1080.797,1.508,SG,0.65,8,602.398,1.482,60,KE,1339.878,1.076,Engineering,University of Oxford,18493.556
4,809.031,0.566,GA,1.199,5,617.201,0.806,240,FI,2643.022,1.603,Engineering,Georgetown University,


In [5]:
# Show per-column dtypes and non-null counts to confirm missing values were injected
df_for_pyconza_talk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 14 columns):
average_3_month_salary                17626 non-null float64
average_3_month_salary_gbp_rate       20000 non-null float64
country_of_habitual_residence         19877 non-null object
default_gbp_rate                      20000 non-null float64
latest_step                           20000 non-null int64
other_assets                          18086 non-null float64
other_assets_gbp_rate                 20000 non-null float64
payback_period                        20000 non-null int64
pre_study_country                     19882 non-null object
property_value                        16939 non-null float64
property_value_gbp_rate               20000 non-null float64
school_type                           20000 non-null object
school_university_name                19376 non-null object
total_liabilities_in_base_currency    16912 non-null float64
dtypes: float64(8), int64(2), object(4)
memor

---
## Export to CSV

In [6]:
# Write the simulated applications to disk, leaving out the integer row index
df_for_pyconza_talk.to_csv(path_or_buf='pycon_example_data.csv', index=False)