# Loan Data from Prosper
## by Marwa Qabeel

## Preliminary Wrangling

> This data set contains 113,937 loans with 81 variables on each loan, including loan amount, borrower rate (or interest rate), current loan status, borrower income, and many others. This [data dictionary](https://docs.google.com/spreadsheets/d/1gDyi_L4UvIrLTEC6Wri5nbaMmkGmLQBk-Yx3z0XDEtI/edit?usp=sharing) explains the variables in the data set.


In [1]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

In [28]:
# Normal (Gaussian) density formula 1 / (sigma * sqrt(2*pi)) * exp(-(x - mu)**2 / (2 * sigma**2))
# with sigma = 10, evaluated 49 units away from the mean (presumably x = 1, mu = 50).
1/(10 * np.sqrt(2 * np.pi)) * np.exp(- (1- 50)**2 / (2 * 10**2))

2.4389607458933567e-07

In [29]:
# Leading factor 1 / (sigma * sqrt(2*pi)) for sigma = 10 — presumably the peak value of a normal density
1/(10 * np.sqrt(2 * np.pi))

0.039894228040143274

In [30]:
2 * np.pi

6.283185307179586

In [23]:
# Show every row and every column when a DataFrame is displayed
# (a value of None removes pandas' truncation limit for that option)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

#### The next cell adjusts the style of the output tables in this notebook so that they have outlined edges →→→ it can be ignored

In [24]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

> Load in your dataset and describe its properties through the questions below.
Try and motivate your exploration goals through this section.

In [25]:
# Load the Prosper loan dataset from CSV into a pandas DataFrame
loan_df = pd.read_csv("../Data/prosperLoanData.csv")

In [26]:
# Explore the dataframe size
loan_df.shape

(113937, 81)

In [27]:
# Check data types of the dataset fields
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 81 columns):
ListingKey                             113937 non-null object
ListingNumber                          113937 non-null int64
ListingCreationDate                    113937 non-null object
CreditGrade                            28953 non-null object
Term                                   113937 non-null int64
LoanStatus                             113937 non-null object
ClosedDate                             55089 non-null object
BorrowerAPR                            113912 non-null float64
BorrowerRate                           113937 non-null float64
LenderYield                            113937 non-null float64
EstimatedEffectiveYield                84853 non-null float64
EstimatedLoss                          84853 non-null float64
EstimatedReturn                        84853 non-null float64
ProsperRating (numeric)                84853 non-null float64
ProsperRating (Alpha) 

In [28]:
# Explore the first few rows
loan_df.head()

Unnamed: 0,ListingKey,ListingNumber,ListingCreationDate,CreditGrade,Term,LoanStatus,ClosedDate,BorrowerAPR,BorrowerRate,LenderYield,EstimatedEffectiveYield,EstimatedLoss,EstimatedReturn,ProsperRating (numeric),ProsperRating (Alpha),ProsperScore,ListingCategory (numeric),BorrowerState,Occupation,EmploymentStatus,EmploymentStatusDuration,IsBorrowerHomeowner,CurrentlyInGroup,GroupKey,DateCreditPulled,CreditScoreRangeLower,CreditScoreRangeUpper,FirstRecordedCreditLine,CurrentCreditLines,OpenCreditLines,TotalCreditLinespast7years,OpenRevolvingAccounts,OpenRevolvingMonthlyPayment,InquiriesLast6Months,TotalInquiries,CurrentDelinquencies,AmountDelinquent,DelinquenciesLast7Years,PublicRecordsLast10Years,PublicRecordsLast12Months,RevolvingCreditBalance,BankcardUtilization,AvailableBankcardCredit,TotalTrades,TradesNeverDelinquent (percentage),TradesOpenedLast6Months,DebtToIncomeRatio,IncomeRange,IncomeVerifiable,StatedMonthlyIncome,LoanKey,TotalProsperLoans,TotalProsperPaymentsBilled,OnTimeProsperPayments,ProsperPaymentsLessThanOneMonthLate,ProsperPaymentsOneMonthPlusLate,ProsperPrincipalBorrowed,ProsperPrincipalOutstanding,ScorexChangeAtTimeOfListing,LoanCurrentDaysDelinquent,LoanFirstDefaultedCycleNumber,LoanMonthsSinceOrigination,LoanNumber,LoanOriginalAmount,LoanOriginationDate,LoanOriginationQuarter,MemberKey,MonthlyLoanPayment,LP_CustomerPayments,LP_CustomerPrincipalPayments,LP_InterestandFees,LP_ServiceFees,LP_CollectionFees,LP_GrossPrincipalLoss,LP_NetPrincipalLoss,LP_NonPrincipalRecoverypayments,PercentFunded,Recommendations,InvestmentFromFriendsCount,InvestmentFromFriendsAmount,Investors
0,1021339766868145413AB3B,193129,2007-08-26 19:09:29.263000000,C,36,Completed,2009-08-14 00:00:00,0.16516,0.158,0.138,,,,,,,0,CO,Other,Self-employed,2.0,True,True,,2007-08-26 18:41:46.780000000,640.0,659.0,2001-10-11 00:00:00,5.0,4.0,12.0,1,24.0,3.0,3.0,2.0,472.0,4.0,0.0,0.0,0.0,0.0,1500.0,11.0,0.81,0.0,0.17,"$25,000-49,999",True,3083.333333,E33A3400205839220442E84,,,,,,,,,0,,78,19141,9425,2007-09-12 00:00:00,Q3 2007,1F3E3376408759268057EDA,330.43,11396.14,9425.0,1971.14,-133.18,0.0,0.0,0.0,0.0,1.0,0,0,0.0,258
1,10273602499503308B223C1,1209647,2014-02-27 08:28:07.900000000,,36,Current,,0.12016,0.092,0.082,0.0796,0.0249,0.0547,6.0,A,7.0,2,CO,Professional,Employed,44.0,False,False,,2014-02-27 08:28:14,680.0,699.0,1996-03-18 00:00:00,14.0,14.0,29.0,13,389.0,3.0,5.0,0.0,0.0,0.0,1.0,0.0,3989.0,0.21,10266.0,29.0,1.0,2.0,0.18,"$50,000-74,999",True,6125.0,9E3B37071505919926B1D82,,,,,,,,,0,,0,134815,10000,2014-03-03 00:00:00,Q1 2014,1D13370546739025387B2F4,318.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1
2,0EE9337825851032864889A,81716,2007-01-05 15:00:47.090000000,HR,36,Completed,2009-12-17 00:00:00,0.28269,0.275,0.24,,,,,,,0,GA,Other,Not available,,False,True,783C3371218786870A73D20,2007-01-02 14:09:10.060000000,480.0,499.0,2002-07-27 00:00:00,,,3.0,0,0.0,0.0,1.0,1.0,,0.0,0.0,,,,,,,,0.06,Not displayed,True,2083.333333,6954337960046817851BCB2,,,,,,,,,0,,86,6466,3001,2007-01-17 00:00:00,Q1 2007,5F7033715035555618FA612,123.32,4186.63,3001.0,1185.63,-24.2,0.0,0.0,0.0,0.0,1.0,0,0,0.0,41
3,0EF5356002482715299901A,658116,2012-10-22 11:02:35.010000000,,36,Current,,0.12528,0.0974,0.0874,0.0849,0.0249,0.06,6.0,A,9.0,16,GA,Skilled Labor,Employed,113.0,True,False,,2012-10-22 11:02:32,800.0,819.0,1983-02-28 00:00:00,5.0,5.0,29.0,7,115.0,0.0,1.0,4.0,10056.0,14.0,0.0,0.0,1444.0,0.04,30754.0,26.0,0.76,0.0,0.15,"$25,000-49,999",True,2875.0,A0393664465886295619C51,,,,,,,,,0,,16,77296,10000,2012-11-01 00:00:00,Q4 2012,9ADE356069835475068C6D2,321.45,5143.2,4091.09,1052.11,-108.01,0.0,0.0,0.0,0.0,1.0,0,0,0.0,158
4,0F023589499656230C5E3E2,909464,2013-09-14 18:38:39.097000000,,36,Current,,0.24614,0.2085,0.1985,0.18316,0.0925,0.09066,3.0,D,4.0,2,MN,Executive,Employed,44.0,True,False,,2013-09-14 18:38:44,680.0,699.0,2004-02-20 00:00:00,19.0,19.0,49.0,6,220.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,6193.0,0.81,695.0,39.0,0.95,2.0,0.26,"$100,000+",True,9583.333333,A180369302188889200689E,1.0,11.0,11.0,0.0,0.0,11000.0,9947.9,,0,,6,102670,15000,2013-09-20 00:00:00,Q3 2013,36CE356043264555721F06C,563.97,2819.85,1563.22,1256.63,-60.27,0.0,0.0,0.0,0.0,1.0,0,0,0.0,20


In [29]:
# Count fully duplicated rows in the dataset.
# Series.sum() is vectorised, unlike the builtin sum() which loops over every
# element in Python; int() keeps the displayed result a plain Python integer.
int(loan_df.duplicated().sum())

0

In [30]:
# Summarise missing data: number and percentage of nulls per column

# Null count per column, keeping only the columns that have at least one null
num_missing = loan_df.isna().sum().loc[lambda counts: counts > 0]

# Share of rows that are null in each of those columns, rounded to 2 decimals
percent_missing = (100 * num_missing / len(loan_df)).round(2)

# Put counts and percentages side by side, worst-affected columns first
(
    pd.concat([num_missing, percent_missing], axis=1,
              keys=['Missing Values', 'Percentage'])
    .sort_values(by="Missing Values", ascending=False)
)

Unnamed: 0,Missing Values,Percentage
GroupKey,100596,88.29
LoanFirstDefaultedCycleNumber,96985,85.12
ScorexChangeAtTimeOfListing,95009,83.39
ProsperPrincipalOutstanding,91852,80.62
ProsperPrincipalBorrowed,91852,80.62
ProsperPaymentsOneMonthPlusLate,91852,80.62
ProsperPaymentsLessThanOneMonthLate,91852,80.62
OnTimeProsperPayments,91852,80.62
TotalProsperPaymentsBilled,91852,80.62
TotalProsperLoans,91852,80.62


### What is the structure of your dataset?

> Your answer here!

### What is/are the main feature(s) of interest in your dataset?

> Your answer here!

### What features in the dataset do you think will help support your investigation into your feature(s) of interest?

> Your answer here!

## Univariate Exploration

> In this section, investigate distributions of individual variables. If
you see unusual points or outliers, take a deeper look to clean things up
and prepare yourself to look at relationships between variables.

> Make sure that, after every plot or related series of plots, you
include a Markdown cell with comments about what you observed and what
you plan on investigating next.

### Discuss the distribution(s) of your variable(s) of interest. Were there any unusual points? Did you need to perform any transformations?

> Your answer here!

### Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?

> Your answer here!

## Bivariate Exploration

> In this section, investigate relationships between pairs of variables in your
data. Make sure the variables that you cover here have been introduced in some
fashion in the previous section (univariate exploration).

### Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?

> Your answer here!

### Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?

> Your answer here!

## Multivariate Exploration

> Create plots of three or more variables to investigate your data even
further. Make sure that your investigations are justified, and follow from
your work in the previous sections.

### Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?

> Your answer here!

### Were there any interesting or surprising interactions between features?

> Your answer here!

> At the end of your report, make sure that you export the notebook as an
html file from the `File > Download as... > HTML` menu. Make sure you keep
track of where the exported file goes, so you can put it in the same folder
as this notebook for project submission. Also, make sure you remove all of
the quote-formatted guide notes like this one before you finish your report!

In [13]:
x = (1, 2, (3, 'John', 4), 'Hi')

In [14]:
x[2][2]

4

In [15]:
x[2][-1]

4

In [16]:
x[-1][-1]

'i'

In [17]:
x[-1][2]

IndexError: string index out of range

In [18]:
x[0:1]

(1,)

In [19]:
x[0:-1]

(1, 2, (3, 'John', 4))

In [20]:
len(x)

4

In [21]:
2 in x

True

In [22]:
3 in x

False

In [23]:
x[0] = 8

TypeError: 'tuple' object does not support item assignment

In [24]:
x = [1, 2, [3, 'John', 4], 'Hi'] 

In [25]:
x[0]

1

In [26]:
x[0:1]

[1]

In [27]:
2 in x

True

In [28]:
3 in x

False

In [29]:
x[0] = 8

x

[8, 2, [3, 'John', 4], 'Hi']

In [30]:
listA = [1, 4, 3, 0]
listB = ['x', 'z', 't', 'q']

In [31]:
listA.sort

<function list.sort>

In [32]:
listA.sort()

In [33]:
listA

[0, 1, 3, 4]

In [34]:
listA.insert(0, 100)

In [35]:
listA.remove(3)

In [36]:
listA.append(7)

In [37]:
listA

[100, 0, 1, 4, 7]

In [38]:
listA + listB

[100, 0, 1, 4, 7, 'x', 'z', 't', 'q']

In [39]:
listB.sort()
listB.pop()

'z'

In [40]:
listB.count('a')

0

In [41]:
listB.remove('a')

ValueError: list.remove(x): x not in list

In [42]:
listA.extend([4, 1, 6, 3, 4])

In [43]:
listA.count(4)

3

In [44]:
listA.index(1)

2

In [45]:
listA.pop(4)

7

In [46]:
listA.reverse()

In [47]:
listA

[4, 3, 6, 1, 4, 4, 1, 0, 100]

In [48]:
aList = [0, 1, 2, 3, 4, 5]
bList = aList
aList[2] = 'hello'
aList == bList

True

In [49]:
aList is bList

True

In [50]:
aList

[0, 1, 'hello', 3, 4, 5]

In [51]:
bList

[0, 1, 'hello', 3, 4, 5]

In [52]:
cList = [6, 5, 4, 3, 2]
dList = []
for num in cList:
    dList.append(num)
cList == dList

True

In [53]:
cList is dList

False

In [54]:
cList[2] = 20
cList

[6, 5, 20, 3, 2]

In [55]:
dList

[6, 5, 4, 3, 2]

In [58]:
def applyEachTo(L, x):
    """Apply every callable in ``L`` to ``x`` and collect the results.

    Args:
        L: Iterable of one-argument callables. Any iterable works (list,
            tuple, generator, ...); it does not need to support ``len()``
            or indexing.
        x: The value passed to each callable.

    Returns:
        A new list containing ``f(x)`` for each ``f`` in ``L``, in
        iteration order. An empty ``L`` yields an empty list.

    Raises:
        Any exception raised by one of the callables when called with ``x``
        propagates unchanged (e.g. ``max(-3)`` raises ``TypeError``).
    """
    # Iterate over the callables directly rather than via range(len(L)).
    return [func(x) for func in L]

In [59]:
def square(a):
    """Return ``a`` multiplied by itself."""
    squared = a * a
    return squared

def halve(a):
    """Return half of ``a`` using true division."""
    half = a / 2
    return half

def inc(a):
    """Return ``a`` increased by one."""
    incremented = a + 1
    return incremented

In [60]:
applyEachTo([inc, square, halve, abs], -3)

[-2, 9, -1.5, 3]

In [61]:
applyEachTo([inc, square, halve, abs], 3.0)

[4.0, 9.0, 1.5, 3.0]

In [62]:
applyEachTo([inc, max, int], -3)

TypeError: 'int' object is not iterable

In [63]:
animals = {'a': 'aardvark', 'b': 'baboon', 'c': 'coati'}

animals['d'] = 'donkey'

In [64]:
animals

{'a': 'aardvark', 'b': 'baboon', 'c': 'coati', 'd': 'donkey'}

In [65]:
animals['c']

'coati'

In [66]:
animals['donkey']

KeyError: 'donkey'

In [67]:
len(animals)

4

In [68]:
animals['a'] = 'anteater'
animals['a']

'anteater'

In [69]:
len(animals['a'])

8

In [70]:
'baboon' in animals

False

In [71]:
'donkey' in animals.values()

True

In [72]:
'b' in animals

True

In [73]:
animals.keys()

dict_keys(['a', 'b', 'c', 'd'])

In [74]:
del animals['b']
len(animals)

3

In [75]:
animals.values()

dict_values(['anteater', 'coati', 'donkey'])

In [76]:
dict_values(['anteater', 'coati', 'donkey'])

NameError: name 'dict_values' is not defined

In [77]:
animals.values()

dict_values(['anteater', 'coati', 'donkey'])

In [31]:
# Walk the dictionary in insertion order and take the i-th element of the
# i-th list, i.e. the "diagonal" of the lists.
my_dict = {letter: [0, 1, 2, 3] for letter in 'abcd'}
output = []
i = 0
for key, values in my_dict.items():
    output.append(values[i])
    i += 1
print(output)

[0, 1, 2, 3]
