In [1]:
import pandas as pd
import numpy as np
import re
import time

from datetime import datetime as dt

### Preprocessing

In [67]:
data = pd.read_csv("bank_transactions.csv")

Let's visualize our data

In [68]:
data.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,10/1/94,F,JAMSHEDPUR,17819.05,2/8/16,143207,25.0
1,T2,C2142763,4/4/57,M,JHAJJAR,2270.69,2/8/16,141858,27999.0
2,T3,C4417068,26/11/96,F,MUMBAI,17874.44,2/8/16,142712,459.0
3,T4,C5342380,14/9/73,F,MUMBAI,866503.21,2/8/16,142714,2060.0
4,T5,C9031234,24/3/88,F,NAVI MUMBAI,6714.43,2/8/16,181156,1762.5


Check if we have na values and drop them if needed.

In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041614 entries, 0 to 1048566
Data columns (total 9 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   TransactionID            1041614 non-null  object 
 1   CustomerID               1041614 non-null  object 
 2   CustomerDOB              1041614 non-null  object 
 3   CustGender               1041614 non-null  object 
 4   CustLocation             1041614 non-null  object 
 5   CustAccountBalance       1041614 non-null  float64
 6   TransactionDate          1041614 non-null  object 
 7   TransactionTime          1041614 non-null  int64  
 8   TransactionAmount (INR)  1041614 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 79.5+ MB


In [69]:
data.dropna(inplace=True)

#### Convert the columns rappresenting time into, datetime format

In [72]:
data.TransactionTime = data.TransactionTime.apply(lambda x: dt.strptime(str(x).zfill(6), '%H%M%S').hour)

In [70]:
data.CustomerDOB = pd.to_datetime(data.CustomerDOB)

In [71]:
data.TransactionDate = pd.to_datetime(data.TransactionDate)

#### Let's deal with the offset people born after 2000, and the people born before 1800

In [73]:
data.loc[data.CustomerDOB.dt.year > 2000, 'CustomerDOB'] = data.loc[data.CustomerDOB.dt.year > 2000, 'CustomerDOB'] - pd.DateOffset(years = 100)

In [74]:
data.drop(data[data.CustomerDOB.dt.year == 1800].index, axis=0, inplace=True)

In [75]:
data.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,1994-10-01,F,JAMSHEDPUR,17819.05,2016-02-08,14,25.0
1,T2,C2142763,1957-04-04,M,JHAJJAR,2270.69,2016-02-08,14,27999.0
2,T3,C4417068,1996-11-26,F,MUMBAI,17874.44,2016-02-08,14,459.0
3,T4,C5342380,1973-09-14,F,MUMBAI,866503.21,2016-02-08,14,2060.0
4,T5,C9031234,1988-03-24,F,NAVI MUMBAI,6714.43,2016-02-08,18,1762.5


In [78]:
data.TransactionDate = data.TransactionDate.apply(lambda x: x.to_period('M'))

### I don't need all the columns in this datam

In [79]:
def cleaning(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [161]:
data2 = data.iloc[:, 1:]

In [162]:
data2.drop(['CustGender'], axis = 1, inplace=True)

In [163]:
data2.head()

Unnamed: 0,CustomerID,CustomerDOB,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,C5841053,1994-10-01,JAMSHEDPUR,17819.05,2016-02,14,25.0
1,C2142763,1957-04-04,JHAJJAR,2270.69,2016-02,14,27999.0
2,C4417068,1996-11-26,MUMBAI,17874.44,2016-02,14,459.0
3,C5342380,1973-09-14,MUMBAI,866503.21,2016-02,14,2060.0
4,C9031234,1988-03-24,NAVI MUMBAI,6714.43,2016-02,18,1762.5


## Place the numeric variable in a range of values

In [164]:
data2.rename(columns = {"TransactionAmount (INR)": "TransactionAmount_INR"}, inplace = True)

In [165]:
n1 = data2.CustAccountBalance.max()/100
n2 = data2.TransactionAmount_INR.max()/100

In [166]:
data2.head()

Unnamed: 0,CustomerID,CustomerDOB,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount_INR
0,C5841053,1994-10-01,JAMSHEDPUR,17819.05,2016-02,14,25.0
1,C2142763,1957-04-04,JHAJJAR,2270.69,2016-02,14,27999.0
2,C4417068,1996-11-26,MUMBAI,17874.44,2016-02,14,459.0
3,C5342380,1973-09-14,MUMBAI,866503.21,2016-02,14,2060.0
4,C9031234,1988-03-24,NAVI MUMBAI,6714.43,2016-02,18,1762.5


In [167]:
a = np.quantile(data2.CustAccountBalance, np.arange(0,1,0.1))
#, axis=None, out=None, overwrite_input=False, method='linear', keepdims=False, *, interpolation=None)

In [168]:
a = a.tolist()

In [169]:
a = list(map(int, a))

In [170]:
a.append(data2.CustAccountBalance.max())

In [171]:
a = list(map(int, a))

In [172]:
a.remove(0)

In [173]:
a

[478, 3013, 6303, 10447, 16006, 24977, 40429, 73611, 181250, 115035495]

In [174]:
bins = pd.qcut(data2.CustAccountBalance, 10, retbins = True, duplicates = 'drop',labels=False)


In [175]:
data2.CustAccountBalance = bins[0]

In [176]:
data2.CustAccountBalance = data2.CustAccountBalance.apply(lambda x :  a[x])

In [177]:
data2.head()

Unnamed: 0,CustomerID,CustomerDOB,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount_INR
0,C5841053,1994-10-01,JAMSHEDPUR,24977,2016-02,14,25.0
1,C2142763,1957-04-04,JHAJJAR,3013,2016-02,14,27999.0
2,C4417068,1996-11-26,MUMBAI,24977,2016-02,14,459.0
3,C5342380,1973-09-14,MUMBAI,115035495,2016-02,14,2060.0
4,C9031234,1988-03-24,NAVI MUMBAI,10447,2016-02,18,1762.5


In [178]:
bins = pd.qcut(data2.TransactionAmount_INR, 10, retbins = True, duplicates = 'drop',labels=False)


In [179]:
a = np.quantile(data2.TransactionAmount_INR, np.arange(0,1,0.1))

In [180]:
a = list(map(int, a))


In [181]:
a.remove(0)

In [182]:
a.append(data2.TransactionAmount_INR.max())

In [183]:
data2.TransactionAmount_INR = bins[0]

In [184]:
data2.TransactionAmount_INR = data2.TransactionAmount_INR.apply(lambda x :  a[x])

In [185]:
data.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,1994-10-01,F,JAMSHEDPUR,17819.05,2016-02,14,25.0
1,T2,C2142763,1957-04-04,M,JHAJJAR,2270.69,2016-02,14,27999.0
2,T3,C4417068,1996-11-26,F,MUMBAI,17874.44,2016-02,14,459.0
3,T4,C5342380,1973-09-14,F,MUMBAI,866503.21,2016-02,14,2060.0
4,T5,C9031234,1988-03-24,F,NAVI MUMBAI,6714.43,2016-02,18,1762.5


#### Find the age of each customer and put them in a range of values

In [188]:
bins = pd.qcut(data2.CustomerDOB, 4, retbins = True, duplicates = 'drop',labels=False)


In [190]:
data2.CustomerDOB = data2.CustomerDOB.apply(lambda x: 2022 - x.year)

In [191]:
data2.head()

Unnamed: 0,CustomerID,CustomerDOB,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount_INR
0,C5841053,28,JAMSHEDPUR,24977,2016-02,14,50.0
1,C2142763,65,JHAJJAR,3013,2016-02,14,1560034.99
2,C4417068,26,MUMBAI,24977,2016-02,14,600.0
3,C5342380,49,MUMBAI,115035495,2016-02,14,2751.0
4,C9031234,34,NAVI MUMBAI,10447,2016-02,18,2751.0


In [192]:
range1 = np.quantile(data2.CustomerDOB, np.arange(0,1,0.25))

In [193]:
range1 = list(map(int, range1))
range1.append(data2.CustomerDOB.max())

In [194]:
data2.CustomerDOB = bins[0]

In [195]:
data2.CustomerDOB = data2.CustomerDOB.apply(lambda x :  range1[x])

In [196]:
data2.head()

Unnamed: 0,CustomerID,CustomerDOB,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount_INR
0,C5841053,40,JAMSHEDPUR,24977,2016-02,14,50.0
1,C2142763,22,JHAJJAR,3013,2016-02,14,1560034.99
2,C4417068,40,MUMBAI,24977,2016-02,14,600.0
3,C5342380,22,MUMBAI,115035495,2016-02,14,2751.0
4,C9031234,35,NAVI MUMBAI,10447,2016-02,18,2751.0


In [198]:
data3 = data2.iloc[:,1:]

let's create a new column with the joined string of all previous columns

In [202]:
data3['joined_data'] = data3[data3.columns[:]].apply(
    lambda x: ' '.join(x.astype(str)),
    axis=1
)
#Time that it took: 
#around 4m

Let's clean the column "joined_data" and convert them into list

In [None]:
#Clean the data that i'm going to use
data3["joined_data"] = data3["joined_data"].apply(lambda x: cleaning(x)
data3.joined_data = data3.joined_data.apply(lambda x: x[:6])

#### Prepare the data to put inside Count vectorizer, that we are going to use to create the sparse matrix of zero and ones.

In [212]:
#String containing all the words inside the column "joined_string"
string_data = [" ".join(lst) for lst in data3.joined_data]

In [213]:
#importing libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import binarize


In [214]:
#Create CountVectorizer object
vectorizer = CountVectorizer(binary=True)
vectorizer.fit(string_data)

#Create the matrix based on our string data
matrix = vectorizer.transform(string_data)

#Binarize the data
x = binarize(matrix) 
matrix.astype(np.int8)

#Print the shape
matrix.shape

#take shape[0]
shape0 = matrix.shape[0]

#take shape[1]
shape1 = matrix.shape[1]

(985322, 5958)

Right now we have for columns the words and each rows rappresenting each customers, we want the opposite to we transpose the matrix

In [215]:
matrix_tran = matrix.transpose()
matrix_tran.shape

(5958, 985322)

#### Now let's create the hashed list from our joined data. We are gonna use a variation of this hashing function

h(x) = (ax + b) % y

In [400]:
def doHash(list,a,b):
    #Create a new list for the hashed value
    hashed = []

    #Parameters for hash function
    y = 7919 #Chosen at random

    #iterate over the list of values
    for value in list:

        #Create a temporary value
        temp = value

        #We can have both string or int in our list so we need to check
        #If we have a string then the value will be the sum of char in that string
        if type(value) == str:

            #create new value 
            temp = 0

            #iterate over every character
            for chara in value:
                temp += ord(chara)

        #compute the hashing function described above
        new_value = (int(temp)*a + b) % y

        #append the new value to the list
        hashed.append(new_value)

    return hashed

Create a new column for our hashed data with the previous function

In [359]:
#Set the parameters for the hashing function
a = 59  #random
b = 28  #random

#Create new column with hashed value
data3["hashed_data"] = data3.joined_data.apply(lambda x: doHash(x,a,b,y))

Now we create a function that will generate the permutations based on the width of the band that we chose

In [397]:
import random

def myPermutation(N):
    #Set a limit for our random int generator
    LIMIT = 10^20

    #Create an empty list where all the tuple will be appended
    permutationTuples = []

    for i in range(N):
        #Temporary List to create each tuple before appending it
        tempList = []

        for i in range(2):
            #Generate random number with previosly set limit
            tempList.append(random.randint(0, LIMIT))

        #Append the tuple into the permutation
        permutationTuples.append(tuple(tempList))

    return permutationTuples

Now let's create the function to build our signature matrix by using minHashing

In [2]:
def minHash(data, N, band):
    #Counter for allocation
    j = 0

    #Preallocate the matrix of inf value
    signatureMatrix = np.matrix(np.ones((N,shape0)) * np.inf)
    print(signatureMatrix.shape)

    #Create permutations
    permutationsValue = myPermutation(N)

    #Iterate over the data
    for lst in data:
        i = 0
        #Iterate over every permutation
        for a,b in permutationsValue:
            newHashedList = doHash(lst,a,b)

            #Iterate over the values in the list
            for hashVal in newHashedList:

                if (signatureMatrix[i, j] > hashVal):
                    signatureMatrix[i, j] = hashVal
            
            i += 1
        j += 1    
        

    return signatureMatrix

In [1]:
signatureMatrix = minHash(data3.hashed_data, 3, 2)

NameError: name 'minHash' is not defined

#### Old version

In [232]:
from numpy import argmax

def findSignatureLine(matrix):
    first1 = [argmax(col) for col in matrix]
    return first1

In [233]:
from sklearn.utils import shuffle

def createSignatureMatrix(niter: int, matrix):
    signature_list_matrix = []
    new_m = matrix
    for x in range(niter):
        new_m = shuffle(matrix)
        signature_list_matrix.append(findSignatureLine(new_m))
    return pd.DataFrame(signature_list_matrix)
    

In [236]:
signature = createSignatureMatrix(10, matrix)

In [239]:
signature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,985312,985313,985314,985315,985316,985317,985318,985319,985320,985321
0,23,20,23,3,2,3,2,9,1,7,...,32,20,35,16,16,20,17,7,1,13
1,5,5,17,35,20,17,23,16,46,3,...,5,48,35,7,44,9,5,47,39,17
2,35,3,17,47,3,20,22,35,9,1,...,47,41,32,38,5,3,3,41,7,13
3,15,3,34,48,20,3,32,40,17,16,...,38,7,3,17,23,7,7,2,39,2
4,1,9,15,33,3,17,17,22,38,34,...,7,38,35,2,9,48,5,7,7,45
5,7,23,9,15,20,16,22,47,5,34,...,23,2,39,17,17,23,3,9,16,50
6,15,7,3,3,16,2,17,7,3,23,...,44,38,22,23,9,3,2,15,23,1
7,3,2,7,2,15,17,3,20,16,20,...,2,9,35,7,33,3,16,3,47,47
8,1,17,48,1,34,9,23,3,16,47,...,16,2,47,49,15,2,38,20,5,33
9,13,1,16,22,35,38,5,16,42,47,...,38,33,20,3,20,39,22,22,3,32


In [237]:
def minHash(listSignature, b):
    return set(listSignature)

In [270]:
def findBind(signature, b):
    dic = {}
    len_x = signature.shape[1]
    len_y = signature.shape[0]
    for x in range(b, len_x, b):
        for y in range(b, len_y):
            mytuple = signature.iloc[(x-b):x,(y-1):y]
            mytuple = tuple(map(int, mytuple.to_numpy()))
            if str(mytuple) in dic.keys():
                dic[str(mytuple)].append(signature[x].name)
            else:
                dic[str(mytuple)] = list(signature[x].name)
    return dic

In [271]:
myDic= findBind(signature, 2)

TypeError: 'int' object is not iterable

In [272]:
signature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,985312,985313,985314,985315,985316,985317,985318,985319,985320,985321
0,23,20,23,3,2,3,2,9,1,7,...,32,20,35,16,16,20,17,7,1,13
1,5,5,17,35,20,17,23,16,46,3,...,5,48,35,7,44,9,5,47,39,17
2,35,3,17,47,3,20,22,35,9,1,...,47,41,32,38,5,3,3,41,7,13
3,15,3,34,48,20,3,32,40,17,16,...,38,7,3,17,23,7,7,2,39,2
4,1,9,15,33,3,17,17,22,38,34,...,7,38,35,2,9,48,5,7,7,45


In [273]:
from datasketch import MinHash, MinHashLSHForest


In [289]:
for text in data3.joined_data:
        print(text)
        m = MinHash(num_perm=4)
        m.update("new value".encode('utf-8'))
        print(m)
        break

['40', 'jamshedpur', '24977', '201602', '14', '500']
<datasketch.minhash.MinHash object at 0x00000265283142B0>


In [281]:
m.count()

0.0

In [291]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data.joined_data:
        m = MinHash(num_perm=perms)
        for s in text:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [292]:
forest = get_forest(data3, 10)

It took 356.11614966392517 seconds to build forest.


In [294]:
enumerate(199)

TypeError: 'int' object is not iterable

In [299]:
from random import randint
N = 100
max_val = 2^32-1
permutations = [(randint(0,max_val), randint(0,max_val)) for _ in range(N)]

In [301]:
for i,j in enumerate(permutations):
    print(i)
    print(j)
    break

0
(28, 7)


In [None]:
try1  = sum([ ord(x)*(p^i) for i,x in enumerate(77)])%m