# Objective
Transaction categorization for credit card data using fastText.

Dataset obtained from https://data.gov.uk/search?q=credit+card+transactions. Take the raw dataset from above, with the labels, and format it into fastText format as below:
        
\_\_label\_\_sauce \_\_label\_\_cheese How much does potato starch affect a cheese sauce recipe?

In [32]:
import fasttext
import pandas as pd

from sklearn.model_selection import train_test_split

#### Input parameters ##################
input_path = "./data/1718Pcard.csv"
output_path_train = "./data/1718card_fasttext_train.txt"
output_path_test = "./data/1718card_fasttext_test.txt"
########################################

# Load data

In [33]:
data = pd.read_csv(input_path)
data

Unnamed: 0,FIN.TRANSACTION DATE,FIN.POSTING DATE,FIN.TRANSACTION AMOUNT,MCH.MERCHANT NAME,MCH.CITY NAME,FIN.ORIGINAL CURRENCY AMOUNT,FIN.ORIGINAL ISO CURRENCY CODE SYMBOL,FIN.INET CONVERSION,target
0,06-04-17,07-04-17,36.55,TESCO STORE 2296,COLNEY HATCH,36.55,GBP,1.0,shopping
1,06-04-17,07-04-17,58.75,AMFBOWLING.CO.UK,01442 840200,58.75,GBP,1.0,entertainment
2,10-04-17,11-04-17,40.5,WWW.GOJUMPIN.COM,INTERNET,40.5,GBP,1.0,kids
3,12-04-17,13-04-17,23.9,AMFBOWLING.CO.UK,01442 840200,23.9,GBP,1.0,entertainment
4,12-04-17,13-04-17,24.28,VUE BSL LTD,LONDON,24.28,GBP,1.0,general
5,12-04-17,13-04-17,93.92,B & M RETAIL,TOTTENHAM,93.92,GBP,1.0,shopping
6,15-04-17,17-04-17,58,WILKO.COM,0845 6080807,58,GBP,1.0,shopping
7,18-04-17,19-04-17,12,TESCO STORES 2296,COLNEY HATCH,12,GBP,1.0,shopping
8,18-04-17,19-04-17,11.5,ASDA SUPERSTORE,S'GATE CIRCUS,11.5,GBP,1.0,shopping
9,18-04-17,19-04-17,20.72,ASDA SUPERSTORE,S'GATE CIRCUS,20.72,GBP,1.0,shopping


# Pre-process data

In [34]:
# Retain only rows where target is not null
data_target = data[~data['target'].isnull()]

# Convert column names to lower case
data_target.columns = [str(x).lower().replace(' ', '_') for x in data_target.columns]

# Extract relevant columns
data_target = data_target[['mch.merchant_name', 'target']]

# Drop duplicate rows
print("len before drop duplicates = " + str(len(data_target)))
data_target.drop_duplicates(inplace=True)
print("len after drop duplicates = " + str(len(data_target)))

# Convert '.' to ' '
data_target['mch.merchant_name_split'] = data_target.apply(lambda row:
                                                           row['mch.merchant_name'].replace('.', ' ').lower(), axis=1)

# Drop column
data_target.drop('mch.merchant_name', axis=1, inplace=True)

data_target.reset_index(inplace=True, drop=True)
data_target

len before drop duplicates = 100
len after drop duplicates = 58


Unnamed: 0,target,mch.merchant_name_split
0,shopping,tesco store 2296
1,entertainment,amfbowling co uk
2,kids,www gojumpin com
3,general,vue bsl ltd
4,shopping,b & m retail
5,shopping,wilko com
6,shopping,tesco stores 2296
7,shopping,asda superstore
8,shopping,homebase ltd 024
9,shopping,amazon uk retail amazo


# Write pre-processed data to txt file

In [35]:
# Split into train and test sets
train, test = train_test_split(data_target, test_size=0.3)
print("train.shape = " + str(train.shape))
print("test.shape = " + str(test.shape))

train.shape = (40, 2)
test.shape = (18, 2)


In [36]:
# Write to file
with open(output_path_train, 'w') as out_file:
    for index, row in train.iterrows():
        out_file.write('__label__' + row['target'] + ' ' + row['mch.merchant_name_split'] + '\n')
        
with open(output_path_test, 'w') as out_file:
    for index, row in test.iterrows():
        out_file.write('__label__' + row['target'] + ' ' + row['mch.merchant_name_split'] + '\n')        

In [37]:
test

Unnamed: 0,target,mch.merchant_name_split
56,kids,parentpay e-com r
10,shopping,giffgaff com
54,kids,youngdriver
43,sports,top golf
8,shopping,homebase ltd 024
31,shopping,amazon eu
38,shopping,sainsburys 0513
14,shopping,pinks florists spires
7,shopping,asda superstore
5,shopping,wilko com


# Build the classifier

In [38]:
# Build the classifier
model = fasttext.train_supervised(input=output_path_train, lr=1.0, epoch=25)
model

<fasttext.FastText._FastText at 0x1a26668f60>

In [39]:
# Test on test data
model.test(output_path_test)

(18, 0.6666666666666666, 0.6666666666666666)

In [40]:
# See the predictions
pred = []
pred_prob = []
for index, row in test.iterrows():
    results = model.predict(row['mch.merchant_name_split'])
    pred.append(results[0][0].split('__')[2])
    pred_prob.append(results[1][0])
    
test.loc[:, 'pred'] = pred
test.loc[:, 'pred_prob'] = pred_prob
test

Unnamed: 0,target,mch.merchant_name_split,pred,pred_prob
56,kids,parentpay e-com r,shopping,0.990281
10,shopping,giffgaff com,shopping,0.94979
54,kids,youngdriver,shopping,0.990281
43,sports,top golf,sports,0.733569
8,shopping,homebase ltd 024,shopping,0.981537
31,shopping,amazon eu,shopping,0.999663
38,shopping,sainsburys 0513,shopping,0.99992
14,shopping,pinks florists spires,shopping,0.990281
7,shopping,asda superstore,shopping,0.998041
5,shopping,wilko com,shopping,0.94979
