# Objective
Categorize credit card transactions using word n-grams built from the merchant name.
Dataset obtained from https://data.gov.uk/search?q=credit+card+transactions

In [1]:
import pandas as pd
import nltk

from nltk.util import ngrams

#### Input parameters ##############
# Relative path of the transactions CSV that the "Load Data" step reads.
# NOTE(review): file name suggests the 2017/18 purchasing-card export - confirm.
input_path = "./data/1718Pcard.csv"
####################################

# Load Data

In [2]:
# Read the raw transactions CSV from the path set in the input parameters.
data = pd.read_csv(input_path)

# Preview only the first rows instead of rendering the whole frame.
data.head(10)

Unnamed: 0,FIN.TRANSACTION DATE,FIN.POSTING DATE,FIN.TRANSACTION AMOUNT,MCH.MERCHANT NAME,MCH.CITY NAME,FIN.ORIGINAL CURRENCY AMOUNT,FIN.ORIGINAL ISO CURRENCY CODE SYMBOL,FIN.INET CONVERSION,target
0,06-04-17,07-04-17,36.55,TESCO STORE 2296,COLNEY HATCH,36.55,GBP,1.0,shopping
1,06-04-17,07-04-17,58.75,AMFBOWLING.CO.UK,01442 840200,58.75,GBP,1.0,entertainment
2,10-04-17,11-04-17,40.5,WWW.GOJUMPIN.COM,INTERNET,40.5,GBP,1.0,kids
3,12-04-17,13-04-17,23.9,AMFBOWLING.CO.UK,01442 840200,23.9,GBP,1.0,entertainment
4,12-04-17,13-04-17,24.28,VUE BSL LTD,LONDON,24.28,GBP,1.0,general
5,12-04-17,13-04-17,93.92,B & M RETAIL,TOTTENHAM,93.92,GBP,1.0,shopping
6,15-04-17,17-04-17,58,WILKO.COM,0845 6080807,58,GBP,1.0,shopping
7,18-04-17,19-04-17,12,TESCO STORES 2296,COLNEY HATCH,12,GBP,1.0,shopping
8,18-04-17,19-04-17,11.5,ASDA SUPERSTORE,S'GATE CIRCUS,11.5,GBP,1.0,shopping
9,18-04-17,19-04-17,20.72,ASDA SUPERSTORE,S'GATE CIRCUS,20.72,GBP,1.0,shopping


# Pre-process data

In [7]:
# Retain only rows where target is not null.
# The explicit .copy() makes data_target an independent frame, so the
# assignments below do not raise SettingWithCopyWarning.
data_target = data[data['target'].notna()].copy()

# Normalise column names: lower case, with spaces replaced by underscores
data_target.columns = [str(x).lower().replace(' ', '_') for x in data_target.columns]

# Extract relevant columns (copied again because a column is added next)
data_target = data_target[['mch.merchant_name', 'target']].copy()

# Tokenise the merchant name: treat '.' as a word separator (so names such as
# WILKO.COM split into parts), then split on whitespace.
# regex=False replaces the literal '.' rather than the regex "any character".
# A missing merchant name becomes an empty token list instead of raising.
data_target['mch.merchant_name_split'] = (
    data_target['mch.merchant_name']
    .fillna('')
    .str.replace('.', ' ', regex=False)
    .str.split()
)
data_target

Unnamed: 0,mch.merchant_name,target,mch.merchant_name_split
0,TESCO STORE 2296,shopping,"[TESCO, STORE, 2296]"
1,AMFBOWLING.CO.UK,entertainment,"[AMFBOWLING, CO, UK]"
2,WWW.GOJUMPIN.COM,kids,"[WWW, GOJUMPIN, COM]"
3,AMFBOWLING.CO.UK,entertainment,"[AMFBOWLING, CO, UK]"
4,VUE BSL LTD,general,"[VUE, BSL, LTD]"
5,B & M RETAIL,shopping,"[B, &, M, RETAIL]"
6,WILKO.COM,shopping,"[WILKO, COM]"
7,TESCO STORES 2296,shopping,"[TESCO, STORES, 2296]"
8,ASDA SUPERSTORE,shopping,"[ASDA, SUPERSTORE]"
9,ASDA SUPERSTORE,shopping,"[ASDA, SUPERSTORE]"


In [8]:
# Construct ngrams: each run of NGRAM_SIZE adjacent tokens of the merchant name.
# A name with fewer than NGRAM_SIZE tokens yields an empty list.
NGRAM_SIZE = 2  # 2 = bigrams

# Map over the token column directly; a row-wise DataFrame.apply is not
# needed to transform a single column.
data_target['mch.merchant_name_tok'] = data_target['mch.merchant_name_split'].map(
    lambda tokens: list(ngrams(tokens, NGRAM_SIZE))
)
data_target

Unnamed: 0,mch.merchant_name,target,mch.merchant_name_split,mch.merchant_name_tok
0,TESCO STORE 2296,shopping,"[TESCO, STORE, 2296]","[(TESCO, STORE), (STORE, 2296)]"
1,AMFBOWLING.CO.UK,entertainment,"[AMFBOWLING, CO, UK]","[(AMFBOWLING, CO), (CO, UK)]"
2,WWW.GOJUMPIN.COM,kids,"[WWW, GOJUMPIN, COM]","[(WWW, GOJUMPIN), (GOJUMPIN, COM)]"
3,AMFBOWLING.CO.UK,entertainment,"[AMFBOWLING, CO, UK]","[(AMFBOWLING, CO), (CO, UK)]"
4,VUE BSL LTD,general,"[VUE, BSL, LTD]","[(VUE, BSL), (BSL, LTD)]"
5,B & M RETAIL,shopping,"[B, &, M, RETAIL]","[(B, &), (&, M), (M, RETAIL)]"
6,WILKO.COM,shopping,"[WILKO, COM]","[(WILKO, COM)]"
7,TESCO STORES 2296,shopping,"[TESCO, STORES, 2296]","[(TESCO, STORES), (STORES, 2296)]"
8,ASDA SUPERSTORE,shopping,"[ASDA, SUPERSTORE]","[(ASDA, SUPERSTORE)]"
9,ASDA SUPERSTORE,shopping,"[ASDA, SUPERSTORE]","[(ASDA, SUPERSTORE)]"
