In [113]:
#import packages
import pandas as pd 
import numpy as np
import pandas_profiling
from pandas_profiling.utils.cache import cache_file
from sklearn.preprocessing import OneHotEncoder
import datetime as dt
pd.set_option('display.float_format', lambda x: '%.1f' % x)

In [114]:
data = pd.read_csv("loans.csv")

In [115]:
data.head()

Unnamed: 0,LOAN_ID,LOAN_NAME,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,FUNDED_AMOUNT,LOAN_AMOUNT,STATUS,IMAGE_ID,VIDEO_ID,...,LENDER_TERM,NUM_LENDERS_TOTAL,NUM_JOURNAL_ENTRIES,NUM_BULK_ENTRIES,TAGS,BORROWER_NAMES,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
0,822769,Ana Olaya,Spanish,En el cantón 24 De Mayo de la provincia de Ma...,"In the 24 De Mayo canton, Manabí province, kno...",825.0,825.0,funded,1772632.0,,...,8.0,9,1,1,,Ana Olaya,female,True,monthly,field_partner
1,758447,JOVELINE\t,English,Joveline is married and has two children. She ...,Joveline is married and has two children. She ...,525.0,525.0,funded,1677756.0,,...,8.0,12,2,1,"#Animals, #Parent, #Woman-Owned Business",JOVELINE\t,female,True,monthly,field_partner
2,443481,JUAN ANGEL,Spanish,"Juan es un hombre de 54 años, es felizmente ca...","Juan, 54, is happily married. He lives with h...",400.0,400.0,funded,1127992.0,,...,14.0,15,2,1,,JUAN ANGEL,male,True,monthly,field_partner
3,943914,Delilah,English,Delilah is 45 years old and married with four ...,Delilah is 45 years old and married with four ...,225.0,225.0,funded,1972607.0,,...,8.0,9,1,1,"#Repeat Borrower, #Vegan, #Trees, #Parent, #Wo...",Delilah,female,True,monthly,field_partner
4,149321,Hurmatoy,English,K. Hurmatoy is a happy mother of three childre...,,1575.0,1575.0,funded,418838.0,,...,14.0,28,1,1,,Hurmatoy,female,True,monthly,field_partner


In [116]:
#Check for any duplicate loans 
len(data['LOAN_ID'].unique()) - len(data['LOAN_ID'].unique())

0

In [117]:
#Some features that could be focused on: sector,number of lenders,gender of borrows, whether or not a picture of the borrow is included, loan amount, and country
keep_cols = ['ORIGINAL_LANGUAGE','LOAN_AMOUNT','STATUS','ACTIVITY_NAME','SECTOR_NAME','COUNTRY_NAME','CURRENCY','BORROWER_GENDERS','BORROWER_PICTURED','REPAYMENT_INTERVAL','DISTRIBUTION_MODEL']
data = data[keep_cols];
data.columns

Index(['ORIGINAL_LANGUAGE', 'LOAN_AMOUNT', 'STATUS', 'ACTIVITY_NAME',
       'SECTOR_NAME', 'COUNTRY_NAME', 'CURRENCY', 'BORROWER_GENDERS',
       'BORROWER_PICTURED', 'REPAYMENT_INTERVAL', 'DISTRIBUTION_MODEL'],
      dtype='object')

In [118]:
# We want to focus on loans that are either funded or expired (meaning they did not hit their fundraising goals)
keep_rows = data['STATUS'].isin(['expired','funded'])
data = data[keep_rows]
data.STATUS.value_counts()

funded     1867176
expired      90948
Name: STATUS, dtype: int64

In [119]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1958124 entries, 0 to 1979467
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ORIGINAL_LANGUAGE   object 
 1   LOAN_AMOUNT         float64
 2   STATUS              object 
 3   ACTIVITY_NAME       object 
 4   SECTOR_NAME         object 
 5   COUNTRY_NAME        object 
 6   CURRENCY            object 
 7   BORROWER_GENDERS    object 
 8   BORROWER_PICTURED   object 
 9   REPAYMENT_INTERVAL  object 
 10  DISTRIBUTION_MODEL  object 
dtypes: float64(1), object(10)
memory usage: 179.3+ MB


In [120]:
data.isna().sum(axis = 0)

ORIGINAL_LANGUAGE     36697
LOAN_AMOUNT               0
STATUS                    0
ACTIVITY_NAME             0
SECTOR_NAME               0
COUNTRY_NAME              0
CURRENCY                  0
BORROWER_GENDERS      36697
BORROWER_PICTURED     36697
REPAYMENT_INTERVAL        0
DISTRIBUTION_MODEL        0
dtype: int64

In [121]:
#Drop rows with any missing values
data = data.dropna()

In [122]:
#Create a column with the # of people in each group
data['num_Borrowers'] = [len(x.split()) for x in data['BORROWER_GENDERS']]
#Create logic for filtering gender down to an integer
conditions = [
    (data['BORROWER_GENDERS'] == 'male'),
    (data['BORROWER_GENDERS'] == 'female')]
choices = ['male', 'female']
data['gender'] = np.select(conditions, choices, default='group')
#Same Process for whether borrower was pictured
conditions = [
    (data['BORROWER_PICTURED'] == 'true'),
    (data['BORROWER_PICTURED'] == 'false')]
choices = [1, 0]
data['pictured'] = np.select(conditions, choices, default=0)
data.sample(10)

Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,num_Borrowers,gender,pictured
386307,Spanish,1000.0,funded,Clothing Sales,Clothing,El Salvador,USD,female,true,monthly,field_partner,1,female,1
15445,Spanish,1475.0,funded,Retail,Retail,Paraguay,PYG,female,true,monthly,field_partner,1,female,1
970403,Spanish,550.0,funded,Shoe Sales,Retail,Ecuador,USD,female,true,monthly,field_partner,1,female,1
973341,English,125.0,funded,Grocery Store,Food,Kenya,KES,female,true,monthly,direct,1,female,1
1000169,English,500.0,funded,Agriculture,Agriculture,Samoa,WST,female,true,monthly,field_partner,1,female,1
924737,English,850.0,funded,Embroidery,Arts,Jordan,JOD,female,true,monthly,field_partner,1,female,1
519200,English,425.0,funded,Fish Selling,Food,Philippines,PHP,female,true,monthly,field_partner,1,female,1
1488092,English,8525.0,funded,Fish Selling,Food,Rwanda,RWF,"female, female, female, female, female, female...","true, true, true, true, true, true, true, true...",monthly,field_partner,20,group,0
489370,French,300.0,funded,Clothing,Clothing,Mali,XOF,"female, female, female, female, female, female...","true, true, true, true, true, true, true, true...",monthly,field_partner,10,group,0
565249,Spanish,375.0,funded,Restaurant,Food,Peru,PEN,female,true,monthly,field_partner,1,female,1


In [123]:
#Encode categorical variables into new binary columns
def encode_Vars(df,var_list):
    return_df = pd.DataFrame()
    for var in var_list:
        ohe = OneHotEncoder(dtype = np.int)
        ohe_results = ohe.fit_transform(df[[var]])
        df = df.join(pd.DataFrame(ohe_results.toarray(), columns=ohe.categories_))
    return df
encoded = encode_Vars(data,['SECTOR_NAME','gender','REPAYMENT_INTERVAL','ORIGINAL_LANGUAGE'])
encoded.head(10)

Unnamed: 0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,CURRENCY,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,...,"(irregular,)","(monthly,)","(Arabic,)","(English,)","(French,)","(Indonesian,)","(Portuguese,)","(Russian,)","(Spanish,)","(Vietnamese,)"
0,Spanish,825.0,funded,Furniture Making,Manufacturing,Ecuador,USD,female,true,monthly,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,English,525.0,funded,Poultry,Agriculture,Philippines,PHP,female,true,monthly,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Spanish,400.0,funded,Personal Housing Expenses,Housing,Peru,PEN,male,true,monthly,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,English,225.0,funded,Farming,Agriculture,Philippines,PHP,female,true,monthly,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,English,1575.0,funded,Grocery Store,Food,Tajikistan,TJS,female,true,monthly,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,English,450.0,funded,Food Production/Sales,Food,Samoa,WST,female,true,monthly,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,French,775.0,funded,Retail,Retail,Togo,XOF,"female, female, female, female, male, female","true, true, true, true, true, true",monthly,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,English,175.0,funded,General Store,Retail,Philippines,PHP,male,true,monthly,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,English,200.0,funded,Rickshaw,Transportation,Pakistan,PKR,female,true,monthly,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,English,750.0,funded,Laundry,Services,Cambodia,USD,female,true,monthly,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
drop_cols = ['ORIGINAL_LANGUAGE','ACTIVITY_NAME','SECTOR_NAME','COUNTRY_NAME','CURRENCY','BORROWER_GENDERS','BORROWER_PICTURED','REPAYMENT_INTERVAL','DISTRIBUTION_MODEL']
encoded = encoded.drop(columns = drop_cols)
encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1921427 entries, 0 to 1979467
Data columns (total 34 columns):
 #   Column             Dtype  
---  ------             -----  
 0   LOAN_AMOUNT        float64
 1   STATUS             object 
 2   num_Borrowers      int64  
 3   gender             object 
 4   pictured           int64  
 5   (Agriculture,)     float64
 6   (Arts,)            float64
 7   (Clothing,)        float64
 8   (Construction,)    float64
 9   (Education,)       float64
 10  (Entertainment,)   float64
 11  (Food,)            float64
 12  (Health,)          float64
 13  (Housing,)         float64
 14  (Manufacturing,)   float64
 15  (Personal Use,)    float64
 16  (Retail,)          float64
 17  (Services,)        float64
 18  (Transportation,)  float64
 19  (Wholesale,)       float64
 20  (female,)          float64
 21  (group,)           float64
 22  (male,)            float64
 23  (bullet,)          float64
 24  (irregular,)       float64
 25  (monthly,)        