In [35]:
import pandas as pd
import numpy as np
import re
import time

In [34]:
from datasketch import MinHash, MinHashLSHForest

### Preprocessing

In [10]:
data = pd.read_csv("bank_transactions.csv")

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048567 entries, 0 to 1048566
Data columns (total 9 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   TransactionID            1048567 non-null  object 
 1   CustomerID               1048567 non-null  object 
 2   CustomerDOB              1045170 non-null  object 
 3   CustGender               1047467 non-null  object 
 4   CustLocation             1048416 non-null  object 
 5   CustAccountBalance       1046198 non-null  float64
 6   TransactionDate          1048567 non-null  object 
 7   TransactionTime          1048567 non-null  int64  
 8   TransactionAmount (INR)  1048567 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 72.0+ MB


In [12]:
data.dropna(inplace=True)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041614 entries, 0 to 1048566
Data columns (total 9 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   TransactionID            1041614 non-null  object 
 1   CustomerID               1041614 non-null  object 
 2   CustomerDOB              1041614 non-null  object 
 3   CustGender               1041614 non-null  object 
 4   CustLocation             1041614 non-null  object 
 5   CustAccountBalance       1041614 non-null  float64
 6   TransactionDate          1041614 non-null  object 
 7   TransactionTime          1041614 non-null  int64  
 8   TransactionAmount (INR)  1041614 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 79.5+ MB


In [14]:
data.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,10/1/94,F,JAMSHEDPUR,17819.05,2/8/16,143207,25.0
1,T2,C2142763,4/4/57,M,JHAJJAR,2270.69,2/8/16,141858,27999.0
2,T3,C4417068,26/11/96,F,MUMBAI,17874.44,2/8/16,142712,459.0
3,T4,C5342380,14/9/73,F,MUMBAI,866503.21,2/8/16,142714,2060.0
4,T5,C9031234,24/3/88,F,NAVI MUMBAI,6714.43,2/8/16,181156,1762.5


### I don't need all the columns in this datam

In [21]:
def cleaning(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [20]:
#Remove TransactionID
data= data[data.columns[1:]]

In [29]:
data['joined_data'] = data[data.columns[1:]].apply(
    lambda x: ' '.join(x.astype(str)),
    axis=1
)
#Time that it took: 
#around 1m

In [41]:
#Clean the data that i'm going to use
myData["joined_data"] = data["joined_data"].apply(lambda x: " ".join(cleaning(x)))

In [42]:
myData.head()

0    f jamshedpur 1781905 2816 143207 250 fjamshedp...
1    m jhajjar 227069 2816 141858 279990 mjhajjar22...
2    f mumbai 1787444 2816 142712 4590 fmumbai17874...
3    f mumbai 86650321 2816 142714 20600 fmumbai866...
4    f navi mumbai 671443 2816 181156 17625 fnavi m...
Name: joined_data, dtype: object

In [17]:
#Number of Parameters
permutations = 128

#Number of Recommendations to return
num_recommendations = 5

In [43]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['joined_data']:
        tokens = cleaning(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [44]:
forest = get_forest(myData, permutations)

It took 1462.9666035175323 seconds to build forest.


In [None]:
#TODO My minHash
#TODO myMinHashLSHForest