In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
sns.set(style="darkgrid")


In [2]:
# Read the Kiva loans table; the relative path is resolved against the
# notebook's working directory.
loans = pd.read_csv(filepath_or_buffer='kiva_loans.csv')

FileNotFoundError: [Errno 2] File b'kiva_loans.csv' does not exist: b'kiva_loans.csv'

In [None]:
# Preview the first five rows of the loans table.
loans.head(n=5)

In [None]:
# Bar chart of how many Mali loans fall into each sector.
loans.loc[loans['country'] == 'Mali', 'sector'].value_counts().plot.bar()

In [None]:
# Restrict the loans to Mali, then to Mali's Agriculture sector.
mali = loans.loc[loans['country'] == 'Mali']
malifood = mali.loc[mali['sector'] == 'Agriculture']

# Display the resulting subset.
malifood

In [None]:
# One bar-chart panel per country (four panels per row): loan amount by sector.
chart = sns.catplot(
    data=loans,
    x='sector',
    y='loan_amount',
    col='country',
    col_wrap=4,
    kind='bar',
)
# Tilt the sector names so neighbouring tick labels do not run into each other.
chart.set_xticklabels(rotation=65, horizontalalignment='right')

# Loan Borrowers by Gender

Female borrowers make up 63% of all borrowers and have a wider loan-amount distribution than men and groups.

In [None]:
"""show the percentage of borrowers by gender"""
# normalize=True returns each value's share of the rows instead of raw counts.
loans['borrower_genders'].value_counts(normalize=True)

In [None]:
"""Change borrowers with multiple people into groups"""
def gender(x):
    """Collapse a row's ``borrower_genders`` value into one of three labels.

    Parameters
    ----------
    x : mapping or pandas row
        Anything indexable by the key ``'borrower_genders'``.

    Returns
    -------
    str
        ``'female'`` or ``'male'`` when the value is exactly that string,
        otherwise ``'group'``.

    Notes
    -----
    Every other value takes the fallback branch, so a missing value (NaN)
    is also labelled ``'group'``.
    """
    value = x['borrower_genders']
    # A lone borrower keeps their own label ...
    for single in ('female', 'male'):
        if value == single:
            return single
    # ... anything else is treated as a group.
    return 'group'

# Label every row by passing it to gender(); axis=1 makes apply work row-wise.
loans['gender'] = loans.apply(gender, axis=1)

In [None]:
# Summary statistics for the loan amounts.
loans['loan_amount'].describe()

In [None]:
"""Women have a wider loan distribution where as groups and men are more centered at the lower end of the distribution"""
# Overlay one loan_amount density curve per gender label on a single wide facet.
g = sns.FacetGrid(data=loans, hue="gender", height=10, aspect=2)
g.map(sns.kdeplot, 'loan_amount')
g.add_legend()
plt.show()

In [None]:
# One violin panel of loan_amount per gender label.
sns.catplot(
    data=loans,
    y='loan_amount',
    col='gender',
    col_wrap=4,
    kind='violin',
)

# Borrowers by Countries

There is a difference between the number of borrowers per country, and the sum of loans borrowed per country. 

The Philippines has both the highest number of loans and the highest total loan amount, followed by Kenya. By total loan amount, however, the United States comes third.

Haiti has the single highest loan amount, at 100,000 USD, borrowed to create more than 300 jobs for women and farmers in the agriculture sector.

The top 3 loan amounts in the Philippines are for purchasing fair-trade coffee in the agriculture sector and for paying for the expansion of operations and staff in the education sector.


In [None]:
# Horizontal bar chart of the number of loans per country, sorted ascending.
(
    loans['country']
    .value_counts(ascending=True)
    .plot.barh(figsize=(25, 20))
)
plt.title('Number of Loans per Country')

In [None]:
# Horizontal bar chart of the summed loan_amount per country, sorted ascending.
(
    loans.groupby('country')['loan_amount']
    .sum()
    .sort_values(ascending=True)
    .plot.barh(figsize=(25, 20))
)
plt.title('Loan amount per Country')

In [None]:
# Show the row of the single largest loan.
# The row is looked up with idxmax() rather than a hard-coded row label, so the
# cell keeps pointing at the largest loan if the CSV is refreshed or re-ordered.
# The double brackets keep the result a one-row DataFrame for rich display.
loans.loc[[loans['loan_amount'].idxmax()]]

In [None]:
"""Highest loan amount from Philippines"""
# Three largest loans from the Philippines, largest first.
# They are derived from the data instead of pasted row labels, so the cell does
# not silently show the wrong rows when the CSV changes.
loans[loans['country'] == 'Philippines'].nlargest(3, 'loan_amount')

In [None]:
# One box-plot panel per country (four per row): loan_amount split by gender label.
sns.catplot(
    data=loans,
    x='gender',
    y='loan_amount',
    col='country',
    col_wrap=4,
    kind='box',
)

# Borrowers by Sector

Agriculture is the highest in terms of number of loans, followed by food and then retail sector.

However, in terms of loan amount, the entertainment sector ranks first and wholesale second. This pattern does not hold when the borrowers are groups.

In [None]:
# Count the loans per sector once, draw the counts as a pie chart, then show
# the same counts as a table.
sector_counts = loans['sector'].value_counts()
sector_counts.plot.pie(figsize=(10, 10))
sector_counts

In [None]:
# Bar chart of loan amount by sector across all loans.
chart = sns.catplot(
    data=loans,
    x='sector',
    y='loan_amount',
    kind='bar',
)
# Tilt the sector names so neighbouring tick labels do not run into each other.
chart.set_xticklabels(rotation=65, horizontalalignment='right')

In [None]:
# Same sector bar chart, split into one panel per gender label.
chart = sns.catplot(
    data=loans,
    x='sector',
    y='loan_amount',
    col="gender",
    kind='bar',
)
# Tilt the sector names so neighbouring tick labels do not run into each other.
chart.set_xticklabels(rotation=65, horizontalalignment='right')

# Repayment Intervals

Only Kenya has weekly Repayment intervals

The Philippines has the highest number of loans with an irregular repayment interval.

Overall, monthly repayment plans are the most common, followed by irregular ones.

In [None]:
# Pie chart of how many loans use each repayment interval.
loans['repayment_interval'].value_counts().plot.pie(figsize=(10, 10))

In [None]:
# Column pair to cross-tabulate: rows first, columns second.
gender_repayment = ['gender', 'repayment_interval']

# Light-to-red colormap used to shade the count tables.
cm = sns.light_palette("red", as_cmap=True)

# Loan counts per (gender, repayment interval), shaded by magnitude.
pd.crosstab(
    index=loans[gender_repayment[0]],
    columns=loans[gender_repayment[1]],
).style.background_gradient(cmap=cm)

In [None]:
# Column pair to cross-tabulate: rows first, columns second.
sector_repayment = ['sector', 'repayment_interval']

# Loan counts per (sector, repayment interval), shaded with the `cm` colormap
# created in the gender/repayment cell.
pd.crosstab(
    index=loans[sector_repayment[0]],
    columns=loans[sector_repayment[1]],
).style.background_gradient(cmap=cm)

In [None]:
# Column pair to cross-tabulate: rows first, columns second.
country_repayment = ['country', 'repayment_interval']

# Loan counts per (country, repayment interval), shaded with the `cm` colormap
# created in the gender/repayment cell.
pd.crosstab(
    index=loans[country_repayment[0]],
    columns=loans[country_repayment[1]],
).style.background_gradient(cmap=cm)

# Map View of Locations(region) of Loans

In [None]:
import folium
from folium.plugins import MarkerCluster


# Read the MPI region locations; the relative path is resolved against the
# notebook's working directory.
loc = pd.read_csv(filepath_or_buffer='kiva_mpi_region_locations.csv')

# Drop rows that lack a latitude or a longitude, since they cannot be mapped.
loc2 = loc.dropna(subset=['lat', 'lon'])

# Inspect the column types of the filtered table.
loc2.dtypes

In [None]:
# Base map drawn with the 'cartodbpositron' tile set.
m = folium.Map(zoom_start=10, tiles='cartodbpositron')

# Marker-cluster layer, attached to the map as soon as it is created.
marker_cluster = MarkerCluster(name='loans in region').add_to(m)

# Add one small circle per region row, with the location name as its popup.
for row_label, region in loc2.iterrows():
    folium.CircleMarker(
        location=[region['lat'], region['lon']],
        popup=region['LocationName'],
        radius=1,
        color='#3186cc',
        fill_color='#3186cc',
    ).add_to(marker_cluster)

In [None]:
# Bare last expression: shows the map object as this cell's output.
m

In [None]:
# Write the map to an HTML file in the working directory.
m.save(outfile='kiva loans by region.html')

# Additional: ML Classifier of number of lenders based on Tags

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# NOTE(review): the later cells visible here use `count_vect`, not `vectorizer` —
# confirm whether this instance is still needed.
vectorizer = CountVectorizer()

In [None]:
# Add the classification target to a cleaned copy of the loans.

# dropna() removes every row that has a missing value in any column. The
# explicit copy() makes loans2 an independent frame, so adding a column below
# does not trigger pandas' SettingWithCopyWarning.
loans2 = loans.dropna().copy()

# Binary label: loans funded by more than 10 lenders vs. 10 or fewer.
loans2['labels'] = np.where(loans2['lender_count'] > 10, 'more than 10', '10 and below')

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and every score computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    loans2.tags, loans2.labels, test_size=0.2, random_state=42
)

# Learn the bag-of-words vocabulary from the training tags only ...
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)

# ... and reuse that vocabulary for the test tags. Calling fit_transform here
# would build a second, unrelated vocabulary, so the test columns would no
# longer line up with the features a classifier learned from the training set.
x_test_counts = count_vect.transform(x_test)

In [None]:
"""Using Naive Bayes"""
# Build the features inside this cell: the vocabulary is learned from the
# training tags only and then reused for the test tags, so both matrices have
# the same columns.
nb_vect = CountVectorizer()
nb_train_counts = nb_vect.fit_transform(x_train)
nb_test_counts = nb_vect.transform(x_test)

# Train on the training split only. Re-fitting on the test split before
# predicting on it would make the score measure memorisation, not generalisation.
clf = MultinomialNB().fit(nb_train_counts, y_train)
predicted = clf.predict(nb_test_counts)

# Held-out accuracy: fraction of test rows whose label was predicted correctly.
np.mean(predicted == y_test)

In [None]:
"""Using SVM"""

"""SVM has a higher score than Bayes but I think Bayes might be a better method here as 
\the tags are mostly individualistic in nature. Also with only a 65+-% score, this feature is not really something that
\would work well."""

from sklearn.linear_model import SGDClassifier

# NOTE(review): the scores quoted in the note above were recorded while the
# classifier was re-fitted on the test split; re-check them against the
# held-out accuracy computed here.

# Build the features inside this cell: the vocabulary is learned from the
# training tags only and then reused for the test tags.
svm_vect = CountVectorizer()
svm_train_counts = svm_vect.fit_transform(x_train)
svm_test_counts = svm_vect.transform(x_test)

# Train on the training split only; random_state pins the stochastic optimiser
# so repeated runs give the same model.
clf2 = SGDClassifier(random_state=42).fit(svm_train_counts, y_train)
predicted2 = clf2.predict(svm_test_counts)

# Held-out accuracy: fraction of test rows whose label was predicted correctly.
np.mean(predicted2 == y_test)

In [None]:
"""Also testing out to see if gender has any role to play with the number of lenders and 65% is not very conclusive"""

# NOTE(review): the 65% quoted in the note above was recorded while the model
# was re-fitted on the test split; re-check it against the held-out accuracy
# computed here.

# Separate 80/20 split that uses the borrower-gender text as the only feature.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    loans2.borrower_genders, loans2.labels, test_size=0.2, random_state=42
)

# A dedicated vectorizer keeps this experiment from overwriting the vocabulary
# held by the shared `count_vect`. It is fitted on the training text only and
# then reused for the test text so both matrices have the same columns.
gender_vect = CountVectorizer()
x1_train_counts = gender_vect.fit_transform(x1_train)
x1_test_counts = gender_vect.transform(x1_test)

# Train on the training split only; the test split stays unseen until scoring.
clf3 = MultinomialNB().fit(x1_train_counts, y1_train)

predicted3 = clf3.predict(x1_test_counts)

# Held-out accuracy: fraction of test rows whose label was predicted correctly.
np.mean(predicted3 == y1_test)

In [None]:
from sklearn import metrics
# Per-class precision/recall report for the tags-based Naive Bayes model.
print(metrics.classification_report(y_test, predicted))

# Report for the gender-based model. Its predictions are `predicted3`, which
# line up row-for-row with `y1_test`; `predicted` belongs to the tags split and
# would be compared against the wrong rows.
print(metrics.classification_report(y1_test, predicted3))