In [6]:
#import packages
import pandas as pd 
import numpy as np
import pandas_profiling
import pycountry as p
import pycountry_convert as pc
from pandas_profiling.utils.cache import cache_file
from sklearn.preprocessing import OneHotEncoder
import datetime as dt
# Render floats with a single decimal place in displayed output
pd.set_option('display.float_format', '{:.1f}'.format)
df = pd.read_csv('wrangled_loans.csv')
#Create country codes & continent columns for each entry
country_list = [c.name for c in p.countries]
country_codes = [c.alpha_2 for c in p.countries]
# 'TL' is excluded from the codes eligible for continent lookup, so rows with
# that code get CONTINENT = None.
# NOTE(review): presumably pycountry_convert cannot map 'TL' to a continent - confirm.
country_codes.remove('TL')
# Set-backed membership: the lambdas below run once per row, and scanning a
# list on every row is needlessly slow. The lists above are kept unchanged.
_country_name_set = frozenset(country_list)
_country_code_set = frozenset(country_codes)
# Names not present in pycountry's country list get COUNTRY_CODE = None
df['COUNTRY_CODE'] = df['COUNTRY_NAME'].apply(
    lambda x: pc.country_name_to_country_alpha2(x) if x in _country_name_set else None
)
# Codes that are missing (None) or excluded above get CONTINENT = None
df['CONTINENT'] = df['COUNTRY_CODE'].apply(
    lambda x: pc.country_alpha2_to_continent_code(x) if x in _country_code_set else None
)
df['CONTINENT'].value_counts()

AS    780698
AF    510841
SA    238973
NA    224713
OC     30575
EU     10722
Name: CONTINENT, dtype: int64

In [7]:
# Preview the first rows, including the COUNTRY_CODE and CONTINENT columns added above
df.head()

Unnamed: 0.1,Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,COUNTRY_CODE,CONTINENT
0,0,Spanish,825.0,funded,Furniture Making,Manufacturing,Ecuador,USD,female,True,monthly,field_partner,EC,SA
1,1,English,525.0,funded,Poultry,Agriculture,Philippines,PHP,female,True,monthly,field_partner,PH,AS
2,2,Spanish,400.0,funded,Personal Housing Expenses,Housing,Peru,PEN,male,True,monthly,field_partner,PE,SA
3,3,English,225.0,funded,Farming,Agriculture,Philippines,PHP,female,True,monthly,field_partner,PH,AS
4,4,English,1575.0,funded,Grocery Store,Food,Tajikistan,TJS,female,True,monthly,field_partner,TJ,AS


In [8]:
#Create a column with the # of people in each group
# BORROWER_GENDERS holds one whitespace-separated token per borrower
# (e.g. "female, female, male"), so the token count is the group size.
df['num_Borrowers'] = df['BORROWER_GENDERS'].str.split().str.len()
#Collapse gender into three labels: a lone 'male', a lone 'female', or 'group'
# (any other value, i.e. multi-borrower loans, falls through to the default)
conditions = [
    (df['BORROWER_GENDERS'] == 'male'),
    (df['BORROWER_GENDERS'] == 'female')]
choices = ['male', 'female']
df['gender'] = np.select(conditions, choices, default='group')
#Flag whether the borrower was pictured: 1 only when the value is exactly 'true'
# NOTE(review): multi-borrower rows hold comma-separated flags such as
# "true, true", which do not equal 'true' and therefore get 0 - confirm this
# is intended rather than an "all/any pictured" rule.
df['pictured'] = (df['BORROWER_PICTURED'] == 'true').astype(int)
#Flag loans denominated in US dollars (1) versus any other currency (0)
df['is_USD'] = (df['CURRENCY'] == 'USD').astype(int)
# Drop the index column left over from the CSV round-trip; errors='ignore'
# keeps this cell re-runnable after the column has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')
# Encode the target: funded -> 1, expired -> 0 (other values are left as-is)
df = df.replace({'STATUS': {'funded': 1, 'expired': 0}})
df.sample(10)

Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,COUNTRY_CODE,CONTINENT,num_Borrowers,gender,pictured,is_USD
47987,English,500.0,1,Farming,Agriculture,Kenya,KES,female,True,monthly,field_partner,KE,AF,1,female,1,0
1646926,English,850.0,1,Food,Food,Rwanda,RWF,female,True,monthly,field_partner,RW,AF,1,female,1,0
99987,English,500.0,1,Tailoring,Services,Philippines,PHP,female,True,monthly,field_partner,PH,AS,1,female,1,0
1609867,English,300.0,1,General Store,Retail,Philippines,PHP,female,True,monthly,field_partner,PH,AS,1,female,1,0
883918,English,200.0,1,General Store,Retail,Philippines,PHP,female,True,monthly,field_partner,PH,AS,1,female,1,0
1034608,English,1475.0,1,Fish Selling,Food,Fiji,FJD,female,True,monthly,field_partner,FJ,OC,1,female,1,0
1513324,Spanish,600.0,1,Services,Services,Colombia,COP,male,True,monthly,field_partner,CO,SA,1,male,1,0
149353,Russian,1025.0,1,Livestock,Agriculture,Tajikistan,TJS,male,True,irregular,field_partner,TJ,AS,1,male,1,0
799243,English,350.0,1,Fruits & Vegetables,Food,Philippines,PHP,female,True,monthly,field_partner,PH,AS,1,female,1,0
30219,Spanish,525.0,1,Tailoring,Services,Nicaragua,NIO,male,True,monthly,field_partner,NI,,1,male,1,0


In [9]:
#Encode categorical variables into new binary columns
def encode_Vars(df,var_list):
    """One-hot encode the given columns and append the indicator columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``var_list``. The caller's
        object is not modified; a new frame is returned.
    var_list : list of str
        Names of the categorical columns to encode, processed in order.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one integer 0/1 column appended per category of each
        encoded variable. The source columns are kept. The new column labels
        come straight from ``OneHotEncoder.categories_`` and therefore show up
        as one-element tuples such as ``(monthly,)``; this labelling is
        preserved so existing consumers of the output keep working.
    """
    for var in var_list:
        # Use the builtin int: the np.int alias was deprecated in NumPy 1.20
        # and removed in 1.24, where it raises AttributeError.
        ohe = OneHotEncoder(dtype=int)
        ohe_results = ohe.fit_transform(df[[var]])
        # Reuse df's index so the join lines rows up by position even when df
        # does not carry a default 0..n-1 index.
        indicators = pd.DataFrame(
            ohe_results.toarray(),
            index=df.index,
            columns=ohe.categories_,
        )
        df = df.join(indicators)
    return df
# One-hot encode the categorical features into a new frame and preview it
encoded = encode_Vars(
    df,
    [
        'SECTOR_NAME',
        'gender',
        'REPAYMENT_INTERVAL',
        'ORIGINAL_LANGUAGE',
        'DISTRIBUTION_MODEL',
    ],
)
encoded.head(10)

Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,...,"(monthly,)","(Arabic,)","(English,)","(French,)","(Indonesian,)","(Portuguese,)","(Russian,)","(Spanish,)","(direct,)","(field_partner,)"
0,Spanish,825.0,1,Furniture Making,Manufacturing,Ecuador,USD,female,true,monthly,...,1,0,0,0,0,0,0,1,0,1
1,English,525.0,1,Poultry,Agriculture,Philippines,PHP,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
2,Spanish,400.0,1,Personal Housing Expenses,Housing,Peru,PEN,male,true,monthly,...,1,0,0,0,0,0,0,1,0,1
3,English,225.0,1,Farming,Agriculture,Philippines,PHP,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
4,English,1575.0,1,Grocery Store,Food,Tajikistan,TJS,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
5,English,450.0,1,Food Production/Sales,Food,Samoa,WST,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
6,French,775.0,1,Retail,Retail,Togo,XOF,"female, female, female, female, male, female","true, true, true, true, true, true",monthly,...,1,0,0,1,0,0,0,0,0,1
7,English,175.0,1,General Store,Retail,Philippines,PHP,male,true,monthly,...,1,0,1,0,0,0,0,0,0,1
8,English,200.0,1,Rickshaw,Transportation,Pakistan,PKR,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
9,English,750.0,1,Laundry,Services,Cambodia,USD,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1


In [10]:
# Source columns to remove now that their encoded/derived counterparts exist
drop_cols = [
    'CONTINENT',
    'REPAYMENT_INTERVAL',
    'COUNTRY_CODE',
    'gender',
    'ORIGINAL_LANGUAGE',
    'ACTIVITY_NAME',
    'SECTOR_NAME',
    'COUNTRY_NAME',
    'CURRENCY',
    'BORROWER_GENDERS',
    'BORROWER_PICTURED',
    'DISTRIBUTION_MODEL',
]
encoded = encoded.drop(drop_cols, axis=1)
# Summarise the remaining columns, then show a random sample of rows
encoded.info()
encoded.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1804881 entries, 0 to 1804880
Data columns (total 35 columns):
 #   Column             Dtype  
---  ------             -----  
 0   LOAN_AMOUNT        float64
 1   STATUS             int64  
 2   num_Borrowers      int64  
 3   pictured           int64  
 4   is_USD             int64  
 5   (Agriculture,)     int64  
 6   (Arts,)            int64  
 7   (Clothing,)        int64  
 8   (Construction,)    int64  
 9   (Education,)       int64  
 10  (Entertainment,)   int64  
 11  (Food,)            int64  
 12  (Health,)          int64  
 13  (Housing,)         int64  
 14  (Manufacturing,)   int64  
 15  (Personal Use,)    int64  
 16  (Retail,)          int64  
 17  (Services,)        int64  
 18  (Transportation,)  int64  
 19  (Wholesale,)       int64  
 20  (female,)          int64  
 21  (group,)           int64  
 22  (male,)            int64  
 23  (bullet,)          int64  
 24  (irregular,)       int64  
 25  (monthly,)        

Unnamed: 0,LOAN_AMOUNT,STATUS,num_Borrowers,pictured,is_USD,"(Agriculture,)","(Arts,)","(Clothing,)","(Construction,)","(Education,)",...,"(monthly,)","(Arabic,)","(English,)","(French,)","(Indonesian,)","(Portuguese,)","(Russian,)","(Spanish,)","(direct,)","(field_partner,)"
1245899,125.0,1,1,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
302278,1300.0,1,1,1,1,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
783968,1000.0,1,4,0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
956170,125.0,1,1,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
194354,875.0,1,1,1,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,1
698573,2000.0,1,1,1,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
683367,1000.0,1,1,1,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
364843,500.0,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1281385,800.0,1,1,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1397637,100.0,1,1,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1


In [11]:
# Persist the feature table. to_csv is called without index=False, so the
# DataFrame index is written as an extra leading, unnamed column.
encoded.to_csv('loan_features.csv')