First, I read in the [Lending Club Loan Dataset](https://www.kaggle.com/wordsforthewise/lending-club).
The header row is returned separately from the loan data, as a list of column-name strings.

In [3]:
import csv

def read_loans(file_path):
    """Read a loan CSV file and split it into a header row and data rows.

    Args:
        file_path: Path to a CSV file whose first row holds the column names.

    Returns:
        A ``(header, loan_data)`` pair. ``header`` is the list of column
        names from the first row; ``loan_data`` is the list of remaining
        rows, each a list of strings. Both are empty lists when the file
        contains no rows at all.
    """
    # The context manager closes the file even if parsing fails.
    # newline="" is what the csv module expects, so that newlines embedded
    # in quoted fields are parsed correctly.
    with open(file_path, "r", newline="") as loan_file:
        rows = list(csv.reader(loan_file))

    if not rows:
        # An empty file has no header row to split off.
        return [], []
    return rows[0], rows[1:]

# Load the extra-small sample: `header` is the first CSV row, `loan_data` the remaining rows.
header, loan_data = read_loans('data/loan_extra-small.csv')

# Look up the "loan_status" column by name and print its value for the first loan.
print("loan status: ", loan_data[0][header.index("loan_status")])
# Pair each column name with the first loan's value and display the first 25 pairs.
list(zip(header,loan_data[0]))[:25]

loan status:  Current


[('id', ''),
 ('member_id', ''),
 ('loan_amnt', '2500'),
 ('funded_amnt', '2500'),
 ('funded_amnt_inv', '2500'),
 ('term', '36 months'),
 ('int_rate', '13.56'),
 ('installment', '84.92'),
 ('grade', 'C'),
 ('sub_grade', 'C1'),
 ('emp_title', 'Chef'),
 ('emp_length', '10+ years'),
 ('home_ownership', 'RENT'),
 ('annual_inc', '55000'),
 ('verification_status', 'Not Verified'),
 ('issue_d', 'Dec-2018'),
 ('loan_status', 'Current'),
 ('pymnt_plan', 'n'),
 ('url', ''),
 ('desc', ''),
 ('purpose', 'debt_consolidation'),
 ('title', 'Debt consolidation'),
 ('zip_code', '109xx'),
 ('addr_state', 'NY'),
 ('dti', '18.24')]

Next, I tokenize each loan description and count the words for each. Loans that are "Current" or "Fully Paid" are categorized as 'good', while loans that are "Charged Off" or "Late" are categorized as 'bad'.

In [4]:
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict

def _column_index(header, name, fallback):
    """Return the position of column *name* in *header*, or *fallback* if absent."""
    try:
        return header.index(name)
    except ValueError:
        return fallback


def count_desc_words(loan_data, header):
    """Count description tokens separately for 'good' and 'bad' loans.

    A loan is 'good' when its status is "Current" or "Fully Paid" and 'bad'
    when its status is "Charged Off", "Late (31-120 days)" or
    "Late (16-30 days)". Loans with any other status are ignored.

    Args:
        loan_data: Iterable of rows, each a sequence of strings.
        header: List of column names. The "loan_status" and "desc" columns
            are located by name; if a name is missing, the positions
            16 and 19 respectively are used instead.

    Returns:
        A ``defaultdict(Counter)`` in which key ``1`` maps to the token
        counts of good-loan descriptions and key ``0`` to the token counts
        of bad-loan descriptions. Both keys are always present.
    """
    good_statuses = {"Current", "Fully Paid"}
    bad_statuses = {"Charged Off", "Late (31-120 days)", "Late (16-30 days)"}

    # Resolve the columns from the header rather than relying on fixed positions.
    status_col = _column_index(header, "loan_status", 16)
    desc_col = _column_index(header, "desc", 19)

    counts = defaultdict(Counter)
    # Touch both keys up front so they exist even when a category is empty.
    bad_counts, good_counts = counts[0], counts[1]

    # Single pass over the data; each description is tokenized at most once.
    for row in loan_data:
        status = row[status_col]
        if status in good_statuses:
            good_counts.update(word_tokenize(row[desc_col]))
        elif status in bad_statuses:
            bad_counts.update(word_tokenize(row[desc_col]))

    return counts

# Tally description tokens per loan category, then list the 25 most common
# (word, count) pairs of each category side by side, ranked from 1.
counts = count_desc_words(loan_data, header)
print("The top 25 'good/bad'-loan words are: ")
for i, (good_word, bad_word) in enumerate(
        zip(counts[1].most_common(25), counts[0].most_common(25))):
    print(i + 1, " (good, bad): ", good_word, bad_word)


The top 25 'good/bad'-loan words are: 
1  (good, bad):  ('>', 102166) ('>', 18838)
2  (good, bad):  ('on', 55911) ('<', 10153)
3  (good, bad):  ('<', 54586) ('br', 10153)
4  (good, bad):  ('br', 54585) ('on', 10089)
5  (good, bad):  ('to', 50785) ('to', 8819)
6  (good, bad):  ('added', 47661) ('added', 8702)
7  (good, bad):  ('Borrower', 47584) ('Borrower', 8685)
8  (good, bad):  ('I', 39943) ('I', 6778)
9  (good, bad):  ('and', 31902) ('and', 5873)
10  (good, bad):  ('.', 30952) ('my', 5176)
11  (good, bad):  ('credit', 28660) ('.', 4902)
12  (good, bad):  ('my', 27339) ('credit', 4844)
13  (good, bad):  ('a', 25077) ('a', 3856)
14  (good, bad):  ('the', 19275) ('pay', 3538)
15  (good, bad):  ('pay', 19037) ('off', 3274)
16  (good, bad):  ('off', 18624) ('loan', 2992)
17  (good, bad):  ('loan', 18028) ('the', 2860)
18  (good, bad):  ('debt', 17377) ('debt', 2801)
19  (good, bad):  (',', 16079) (',', 2636)
20  (good, bad):  ('of', 14980) ('of', 2613)
21  (good, bad):  ('cards', 14459) 

Looking at the top 25 words for the good and bad categories, I observe that there appear to be more similarities in word use than differences.

Given these similarities, I want to determine which words are exclusively used by each category of loan. In other words, I will remove the set of words in "good" that are used in "bad", and remove the set of words in "bad" that are used in "good".

In [5]:
def get_set_differences(counts, top_n):
    """Find the top words that are exclusive to each loan category.

    Args:
        counts: Mapping in which ``counts[1]`` is a ``Counter`` of good-loan
            words and ``counts[0]`` is a ``Counter`` of bad-loan words.
        top_n: How many of the most common words to take from each category
            before comparing them. ``None`` compares all words.

    Returns:
        A ``(good_notin_bad, bad_notin_good)`` pair of lists. The first holds
        the top good-loan words missing from the top bad-loan words, the
        second the reverse. Each list keeps most-common-first order.
    """
    # Each category contributes its own top_n words; a category with fewer
    # distinct words does not shorten the other category's list.
    good_words = [word for word, _ in counts[1].most_common(top_n)]
    bad_words = [word for word, _ in counts[0].most_common(top_n)]

    # Sets give constant-time membership tests; the lists preserve rank order.
    good_lookup = set(good_words)
    bad_lookup = set(bad_words)

    good_notin_bad = [word for word in good_words if word not in bad_lookup]
    bad_notin_good = [word for word in bad_words if word not in good_lookup]
    return good_notin_bad, bad_notin_good

# Compare the word lists of the two loan categories, passing 50 as top_n.
good_notin_bad, bad_notin_good = get_set_differences(counts, 50)
# Display both lists as the cell's output.
good_notin_bad, bad_notin_good

(['rate', '!', 'it', 'lower'], ['bills', 'consolidation', 'need', 'help'])

Based on this output, I can see that words with negative semantic associations (e.g., "bills", "need", "help") appear among the top 50 words of "bad" loans but not among the top 50 words of "good" loans.