In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
# Load the raw dataset; later cells read the free-text `text` column from this frame.
df = pd.read_csv("Final dataset.csv")

In [3]:
# Remove the leftover index column written by a previous `to_csv`.
# `errors="ignore"` keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [5]:
# Plain Python list of every note in the `text` column.
documents = df["text"].tolist()

In [6]:
# Hold out 30% of the rows as a test split; the fixed seed makes the split repeatable.
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [7]:
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    """Return the lowercase word tokens of ``text``.

    The text is split into sentences and then into words with NLTK; tokens
    shorter than two characters are skipped.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]

In [8]:
# Keyword lists for the two classes: the "positive" class is built from the
# transactions keywords, the "negative" class from the bonding keywords.
df_positive_class = pd.read_csv("keywords_transactions.csv")
df_negative_class = pd.read_csv("keywords_bonding.csv")

In [9]:
# Convert each keyword frame into a nested list (one inner list per CSV row).
po_values_1 = df_positive_class.values.tolist()
ne_values_1 = df_negative_class.values.tolist()

In [10]:
import itertools
# Flatten the row-wise nested lists into one flat list of cells per class.
merged_po = [cell for row in po_values_1 for cell in row]
merged_ne = [cell for row in ne_values_1 for cell in row]

In [11]:
# Collapse the transaction keywords into one space-separated pseudo-document.
merged_positive = ' '.join(merged_po)

In [12]:
# Display the joined transaction-keyword string for a quick sanity check.
merged_positive

'acct adjustment ATM auction bond borrow cash cd certificate charges check cheque clearing closure close collateral collection commission cd credit custody debit deposit derivative document drawdown equity exchange fee forwards freeze futures insurance interest rate debt leas lend liquidation loan maturity mortgage bank options pay payment payroll paystub portfolio products purchase agreement payment receipt redemption refinance refund reimbursement renewal repo reserve reversal security service settlement supply chain swap syndicate tax trade transaction transfer treasury turnaround warrant withdrawal delinquent default bankrupt overdraft visa income'

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
# Unique tokens from the bonding keywords.
# `dict.fromkeys` de-duplicates while keeping first-occurrence order, so the
# result (and any output displayed from it) is identical across kernel
# restarts; `list(set(...))` order varies with string hash randomisation.
negative_words = list(dict.fromkeys(word_tokenize(" ".join(merged_ne))))

In [15]:
# Remove English stop words from the bonding tokens.
# The NLTK fileid is the lowercase 'english'; the capitalised spelling only
# resolves on case-insensitive filesystems. The list is loaded once into a
# set instead of being re-read from disk for every token.
english_stopwords = set(stopwords.words('english'))
output = [w for w in negative_words if w not in english_stopwords]

In [16]:
# Keep only purely alphabetic tokens, then display what is left.
output = list(filter(str.isalpha, output))
output

['sent',
 'friends',
 'things',
 'attended',
 'smile',
 'thinking',
 'thank',
 'card',
 'going',
 'laughing',
 'flowers',
 'left',
 'birthday',
 'run',
 'note']

In [17]:
# Collapse the filtered bonding keywords into one space-separated pseudo-document.
merged_negative = ' '.join(output)

In [18]:
# TF-IDF vectorizer that discards scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

In [19]:
# Fit the vectorizer on every note (each value coerced to a string first)
# and keep the resulting document-term matrix.
text_as_str = df["text"].map(np.str_)
X = vectorizer.fit_transform(text_as_str)

In [20]:
def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two strings.

    The two texts are vectorised together with a throw-away
    ``TfidfVectorizer(stop_words='english')``. A private vectorizer is used so
    that calling this function no longer refits the notebook-level
    ``vectorizer`` and silently invalidates the fit behind ``X``.

    Parameters
    ----------
    text1, text2 : str
        The documents to compare.

    Returns
    -------
    float
        Off-diagonal entry of the 2x2 product of the TF-IDF matrix with its
        transpose. Returns ``0.0`` when the vectorizer cannot build a
        vocabulary (for example, both texts contain only stop words).
    """
    pair_vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf = pair_vectorizer.fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError on an empty vocabulary; two texts
        # with no usable terms share nothing, so report zero similarity.
        return 0.0
    # `.toarray()` replaces the `.A` shorthand, which newer SciPy releases removed.
    return (tfidf @ tfidf.T).toarray()[0, 1]

In [21]:
# Count the remaining missing values per column.
df.isnull().sum()

text    0
dtype: int64

In [22]:
# Assign each note to whichever keyword profile it is more similar to.
# Ties (including two zero scores) fall into the transaction ("positive") class.
pos_examples = []
neg_examples = []
for text in df.text.values:
    transaction_score = cosine_sim(text, merged_positive)
    social_score = cosine_sim(text, merged_negative)
    bucket = pos_examples if transaction_score >= social_score else neg_examples
    bucket.append(text)

count_pos = len(pos_examples)
count_neg = len(neg_examples)

print(count_pos, count_neg)

718 194


In [23]:
# Preview the first ten notes classified as transaction-based interactions.
pos_examples[:10]

[' Members came in again today and we continued with our conversation and I pulled credit, we talked about unsecured and secured debt, and we decided HELOC would work best for them, the have started a savings plan, and they have invited me to dinner with them on Thursday to do a closing. Brandon 9865',
 "Betty moved to FL about 5 yr's ago she was tired of the winters in MN, All of her children are married and she has 3 grand children, Betty invited me to come to FL and visit, she sometimes gets home sick for MN nice :0)",
 'Joint member became citizen and had name change. His previous name is Cha',
 'Xiong, new name is Ntxoov Xiong. Pa @ Lafayette has taken the copy of DL,',
 'petition for name change and his certificate of naturalization. She',
 'forward the documents to Records. Savannah x9746',
 'Mbr came in to cash their government check. her and her husband both came in. Did mention to them about the $2000.00 daily w/d. Did get information on the husband and she wanted to add her 

In [24]:
# Preview the first ten notes classified as social-based interactions.
neg_examples[:10]

["Rec'd a payoff request from Bank Forward for their 2nd mortgage so I called just to check in to see how things are going. Brian was at work so I spoke with his wife Nicole. She said she wasn't exactly sure what he had in mind with them but she thought they were going to borrow a little extra and BF had better rates. I did ask her to have Brian call me if he had a chance today and let her know I would fax BF's request today.",
 'Don has been a member for over 20 years!!! WOW-He has worked for the same company for 40 years as a machinist loves what he does. Also found out that don has been married to his"lovely wife" for 20 years this year. In talking found out that Don does his primary banking at Wells Fargo-I asked simply if he was happy with how things are going there-he said "No." Shared with Don about how our products are set up and also shared about our free service EPay. He has a substantial amount in savings and I told him we could do more for him that what he is currently gett

In [25]:
import csv

# Save the transaction-classified notes, each followed by three newlines.
# NOTE: despite the ".csv" extension the file is plain text, not CSV.
# The `with` block closes the handle even if a write raises part-way through.
with open("Classifited-Transaction Based Interactions.csv", "w") as outF:
    for line in pos_examples:
        outF.write(line)
        outF.write("\n\n\n")

In [26]:
# Save the social-classified notes, each followed by three newlines.
# NOTE: despite the ".csv" extension the file is plain text, not CSV.
# The `with` block closes the handle even if a write raises part-way through.
with open("Classifited-Social Based Interactions.csv", "w") as outF:
    for line in neg_examples:
        outF.write(line)
        outF.write("\n\n\n")

In [27]:
# Vectorise all notes with TF-IDF, then multiply the matrix by its transpose
# to get a document-by-document similarity matrix.
vect = TfidfVectorizer(stop_words="english", min_df=1)
tfidf = vect.fit_transform(documents)
pairwise_similarity = tfidf @ tfidf.T

In [28]:
# Densify the similarity matrix as a plain ndarray. `.todense()` returns the
# legacy `np.matrix` type, which current scikit-learn estimators refuse as input.
L = pairwise_similarity.toarray()

In [29]:
from scipy.cluster import  hierarchy
from sklearn.cluster import SpectralClustering

In [30]:
# Average-linkage hierarchical clustering over the rows of L using cosine
# distance, then cut the tree at a fixed distance to get flat cluster labels.
threshold = 0.1
Z = hierarchy.linkage(L, method="average", metric="cosine")
C = hierarchy.fcluster(Z, t=threshold, criterion="distance")

In [31]:
# Two-way spectral clustering of the notes.
# `random_state` makes the labels reproducible between runs, and
# `np.asarray` guarantees a plain ndarray even if L is an `np.matrix`.
# NOTE(review): with the default affinity the rows of L are treated as feature
# vectors; if L is meant to be used directly as the similarity matrix,
# consider affinity="precomputed" - confirm the intent.
SpectralClustering(n_clusters=2, random_state=42).fit_predict(np.asarray(L))



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,