In [6]:
#import packages
import pandas as pd 
import numpy as np
import pandas_profiling
import pycountry as p
import pycountry_convert as pc
from pandas_profiling.utils.cache import cache_file
from sklearn.preprocessing import OneHotEncoder
import datetime as dt
pd.set_option('display.float_format', lambda x: '%.1f' % x)
df = pd.read_csv('wrangled_loans.csv')
#Create country codes & continent columns for each entry
country_list = [c.name for c in list(p.countries)]
country_codes = [c.alpha_2 for c in list(p.countries)]
country_codes.remove('TL')
df['COUNTRY_CODE'] = df['COUNTRY_NAME'].apply(lambda x: pc.country_name_to_country_alpha2(x) if x in country_list else None)
df['CONTINENT']= df['COUNTRY_CODE'].apply(lambda x:  pc.country_alpha2_to_continent_code(x) if x in country_codes else None)
df['CONTINENT'].value_counts()

AS    780698
AF    510841
SA    238973
NA    224713
OC     30575
EU     10722
Name: CONTINENT, dtype: int64

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,COUNTRY_CODE,CONTINENT
0,0,Spanish,825.0,funded,Furniture Making,Manufacturing,Ecuador,USD,female,True,monthly,field_partner,EC,SA
1,1,English,525.0,funded,Poultry,Agriculture,Philippines,PHP,female,True,monthly,field_partner,PH,AS
2,2,Spanish,400.0,funded,Personal Housing Expenses,Housing,Peru,PEN,male,True,monthly,field_partner,PE,SA
3,3,English,225.0,funded,Farming,Agriculture,Philippines,PHP,female,True,monthly,field_partner,PH,AS
4,4,English,1575.0,funded,Grocery Store,Food,Tajikistan,TJS,female,True,monthly,field_partner,TJ,AS


In [8]:
#Create a column with the # of people in each group
df['num_Borrowers'] = [len(x.split()) for x in df['BORROWER_GENDERS']]
#Create logic for filtering gender down to an integer
conditions = [
    (df['BORROWER_GENDERS'] == 'male'),
    (df['BORROWER_GENDERS'] == 'female')]
choices = ['male', 'female']
df['gender'] = np.select(conditions, choices, default='group')
#Same Process for whether borrower was pictured
conditions = [
    (df['BORROWER_PICTURED'] == 'true'),
    (df['BORROWER_PICTURED'] == 'false')]
choices = [1, 0]
df['pictured'] = np.select(conditions, choices, default=0)
#Columns for 
conditions = [
    (df['CURRENCY'] == 'USD'),
    (df['CURRENCY'] != 'USD')]
choices = [1, 0]
df['is_USD'] = np.select(conditions, choices, default=0)
conditions = [
    (df['CURRENCY'] == 'USD'),
    (df['CURRENCY'] != 'USD')]
choices = [1, 0]
df['is_USD'] = np.select(conditions, choices, default=0)
df = df.drop('Unnamed: 0',axis=1)
df.sample(10)

Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,COUNTRY_CODE,CONTINENT,num_Borrowers,gender,pictured,is_USD
637497,Spanish,975.0,funded,Paper Sales,Retail,Ecuador,USD,female,true,monthly,field_partner,EC,SA,1,female,1,1
1157889,English,2400.0,funded,Beauty Salon,Services,Iraq,USD,female,true,monthly,field_partner,IQ,AS,1,female,1,1
855194,Russian,1425.0,funded,Services,Services,Tajikistan,TJS,male,true,monthly,field_partner,TJ,AS,1,male,1,0
1725875,Spanish,875.0,funded,General Store,Retail,Nicaragua,NIO,female,true,monthly,field_partner,NI,,1,female,1,0
1775828,English,2000.0,funded,Retail,Retail,United States,USD,female,true,monthly,direct,US,,1,female,1,1
1288502,English,75.0,funded,Manufacturing,Manufacturing,Philippines,PHP,female,true,monthly,field_partner,PH,AS,1,female,1,0
28513,English,1100.0,funded,Farming,Agriculture,Cambodia,USD,female,true,monthly,field_partner,KH,AS,1,female,1,1
503780,Spanish,475.0,funded,Personal Medical Expenses,Health,Nicaragua,NIO,female,true,monthly,field_partner,NI,,1,female,1,0
1299957,English,825.0,expired,General Store,Retail,Samoa,WST,female,true,monthly,field_partner,WS,OC,1,female,1,0
1055495,English,425.0,funded,Fish Selling,Food,Liberia,LRD,"female, female, female, female, female","true, true, true, true, true",monthly,field_partner,LR,AF,5,group,0,0


In [9]:
#Encode categorical variables into new binary columns
def encode_Vars(df,var_list):
    return_df = pd.DataFrame()
    for var in var_list:
        ohe = OneHotEncoder(dtype = np.int)
        ohe_results = ohe.fit_transform(df[[var]])
        df = df.join(pd.DataFrame(ohe_results.toarray(), columns=ohe.categories_))
    return df
encoded = encode_Vars(df,['SECTOR_NAME','gender','REPAYMENT_INTERVAL','ORIGINAL_LANGUAGE','DISTRIBUTION_MODEL'])
encoded.head(10)

Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,...,"(monthly,)","(Arabic,)","(English,)","(French,)","(Indonesian,)","(Portuguese,)","(Russian,)","(Spanish,)","(direct,)","(field_partner,)"
0,Spanish,825.0,funded,Furniture Making,Manufacturing,Ecuador,USD,female,true,monthly,...,1,0,0,0,0,0,0,1,0,1
1,English,525.0,funded,Poultry,Agriculture,Philippines,PHP,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
2,Spanish,400.0,funded,Personal Housing Expenses,Housing,Peru,PEN,male,true,monthly,...,1,0,0,0,0,0,0,1,0,1
3,English,225.0,funded,Farming,Agriculture,Philippines,PHP,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
4,English,1575.0,funded,Grocery Store,Food,Tajikistan,TJS,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
5,English,450.0,funded,Food Production/Sales,Food,Samoa,WST,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
6,French,775.0,funded,Retail,Retail,Togo,XOF,"female, female, female, female, male, female","true, true, true, true, true, true",monthly,...,1,0,0,1,0,0,0,0,0,1
7,English,175.0,funded,General Store,Retail,Philippines,PHP,male,true,monthly,...,1,0,1,0,0,0,0,0,0,1
8,English,200.0,funded,Rickshaw,Transportation,Pakistan,PKR,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1
9,English,750.0,funded,Laundry,Services,Cambodia,USD,female,true,monthly,...,1,0,1,0,0,0,0,0,0,1


In [10]:
drop_cols = ['ORIGINAL_LANGUAGE','ACTIVITY_NAME','SECTOR_NAME','COUNTRY_NAME','CURRENCY','BORROWER_GENDERS','BORROWER_PICTURED','DISTRIBUTION_MODEL']
encoded = encoded.drop(columns = drop_cols)
encoded.info()
encoded.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1804881 entries, 0 to 1804880
Data columns (total 39 columns):
 #   Column              Dtype  
---  ------              -----  
 0   LOAN_AMOUNT         float64
 1   STATUS              object 
 2   REPAYMENT_INTERVAL  object 
 3   COUNTRY_CODE        object 
 4   CONTINENT           object 
 5   num_Borrowers       int64  
 6   gender              object 
 7   pictured            int64  
 8   is_USD              int64  
 9   (Agriculture,)      int64  
 10  (Arts,)             int64  
 11  (Clothing,)         int64  
 12  (Construction,)     int64  
 13  (Education,)        int64  
 14  (Entertainment,)    int64  
 15  (Food,)             int64  
 16  (Health,)           int64  
 17  (Housing,)          int64  
 18  (Manufacturing,)    int64  
 19  (Personal Use,)     int64  
 20  (Retail,)           int64  
 21  (Services,)         int64  
 22  (Transportation,)   int64  
 23  (Wholesale,)        int64  
 24  (female,)           int6

Unnamed: 0,LOAN_AMOUNT,STATUS,REPAYMENT_INTERVAL,COUNTRY_CODE,CONTINENT,num_Borrowers,gender,pictured,is_USD,"(Agriculture,)",...,"(monthly,)","(Arabic,)","(English,)","(French,)","(Indonesian,)","(Portuguese,)","(Russian,)","(Spanish,)","(direct,)","(field_partner,)"
366913,400.0,funded,monthly,PH,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
1265581,300.0,funded,monthly,PK,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
694766,250.0,funded,monthly,PH,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
1623391,175.0,funded,monthly,TR,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
908845,2175.0,funded,monthly,MX,,9,group,0,0,0,...,1,0,0,0,0,0,0,1,0,1
989577,250.0,funded,monthly,PH,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
1591769,250.0,funded,monthly,PH,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
69505,1450.0,funded,monthly,MN,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
215705,200.0,funded,monthly,PH,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
1780742,150.0,funded,monthly,PH,AS,1,female,1,0,0,...,1,0,1,0,0,0,0,0,0,1
