In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("data/lending-club-loan-data/loan.csv", dtype='unicode')

In [4]:
data.shape

(887379, 74)

## DataFrame 분석

In [5]:
data.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [6]:
# 필요한 데이터만 추출
df = data[ ['loan_amnt', 'loan_status', 'grade', 'int_rate', 'term'] ]
df

Unnamed: 0,loan_amnt,loan_status,grade,int_rate,term
0,5000.0,Fully Paid,B,10.65,36 months
1,2500.0,Charged Off,C,15.27,60 months
2,2400.0,Fully Paid,C,15.96,36 months
3,10000.0,Fully Paid,C,13.49,36 months
4,3000.0,Current,B,12.69,60 months
...,...,...,...,...,...
887374,10000.0,Current,B,11.99,36 months
887375,24000.0,Current,B,11.99,36 months
887376,13000.0,Current,D,15.99,60 months
887377,12000.0,Current,E,19.99,60 months


각 데이터들이 범주형인지 확인

In [7]:
df['loan_status'].unique()

array(['Fully Paid', 'Charged Off', 'Current', 'Default',
       'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off', 'Issued'],
      dtype=object)

In [8]:
df['grade'].unique()

array(['B', 'C', 'A', 'E', 'F', 'D', 'G'], dtype=object)

In [9]:
df['term'].unique()

array([' 36 months', ' 60 months'], dtype=object)

In [10]:
# NaN이 있는 데이터 제거
df.dropna(how='any')

Unnamed: 0,loan_amnt,loan_status,grade,int_rate,term
0,5000.0,Fully Paid,B,10.65,36 months
1,2500.0,Charged Off,C,15.27,60 months
2,2400.0,Fully Paid,C,15.96,36 months
3,10000.0,Fully Paid,C,13.49,36 months
4,3000.0,Current,B,12.69,60 months
...,...,...,...,...,...
887374,10000.0,Current,B,11.99,36 months
887375,24000.0,Current,B,11.99,36 months
887376,13000.0,Current,D,15.99,60 months
887377,12000.0,Current,E,19.99,60 months


### Practice 1  

대출 기간(term)에 따른 대출 총액(loan_amnt)

In [11]:
term_to_loan_amnt_dict = {}
unique_terms = df['term'].unique()

In [27]:
df['loan_amnt_number'] = df.loc[ :, 'loan_amnt'].astype(float)
for term in unique_terms:
    loan_amnt_sum = df.loc[ df['term'] == term, 'loan_amnt_number'].sum()
    print(term, loan_amnt_sum)

 36 months 7752507375.0
 60 months 5341004575.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['loan_amnt_number'] = df.loc[ :, 'loan_amnt'].astype(float)


In [28]:
# term_to_loan_amnt_dict = {' 36 months': 7752507375.0, \
#                           ' 60 months': 5341004575.0}

In [30]:
term_to_loan_amnt = pd.Series(term_to_loan_amnt_dict)

In [31]:
term_to_loan_amnt_dict

{' 36 months': 7752507375.0, ' 60 months': 5341004575.0}

### Practic2  

대출상태(loan_status)가 불량인 사람들의 대출 등급(grade)

In [16]:
df.head()

Unnamed: 0,loan_amnt,loan_status,grade,int_rate,term,loan_amount_number
0,5000.0,Fully Paid,B,10.65,36 months,5000.0
1,2500.0,Charged Off,C,15.27,60 months,2500.0
2,2400.0,Fully Paid,C,15.96,36 months,2400.0
3,10000.0,Fully Paid,C,13.49,36 months,10000.0
4,3000.0,Current,B,12.69,60 months,3000.0


In [17]:
df['loan_status'].unique()

array(['Fully Paid', 'Charged Off', 'Current', 'Default',
       'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off', 'Issued'],
      dtype=object)

In [18]:
total_status_category = df['loan_status'].unique()

In [32]:
bad_status_category = total_status_category [ [1,3,4,5,6,8] ]

In [33]:
bad_status_category

array(['Charged Off', 'Default', 'Late (31-120 days)', 'In Grace Period',
       'Late (16-30 days)',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

In [34]:
df['bad_loan_status'] = df['loan_status'].isin(bad_status_category)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bad_loan_status'] = df['loan_status'].isin(bad_status_category)


Unnamed: 0,loan_amnt,loan_status,grade,int_rate,term,loan_amount_number,loan_amnt_number,bad_loan_status
0,5000.0,Fully Paid,B,10.65,36 months,5000.0,5000.0,False
1,2500.0,Charged Off,C,15.27,60 months,2500.0,2500.0,True
2,2400.0,Fully Paid,C,15.96,36 months,2400.0,2400.0,False
3,10000.0,Fully Paid,C,13.49,36 months,10000.0,10000.0,False
4,3000.0,Current,B,12.69,60 months,3000.0,3000.0,False
