# Objective
Transaction categorization for credit card data using TfidfVectorizer.

Dataset obtained from https://data.gov.uk/search?q=credit+card+transactions

In [94]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

#### Input parameters ##############
input_path = "./data/1718Pcard.csv"
####################################

# Load data

In [95]:
data = pd.read_csv(input_path)
data

Unnamed: 0,FIN.TRANSACTION DATE,FIN.POSTING DATE,FIN.TRANSACTION AMOUNT,MCH.MERCHANT NAME,MCH.CITY NAME,FIN.ORIGINAL CURRENCY AMOUNT,FIN.ORIGINAL ISO CURRENCY CODE SYMBOL,FIN.INET CONVERSION,target
0,06-04-17,07-04-17,36.55,TESCO STORE 2296,COLNEY HATCH,36.55,GBP,1.0,shopping
1,06-04-17,07-04-17,58.75,AMFBOWLING.CO.UK,01442 840200,58.75,GBP,1.0,entertainment
2,10-04-17,11-04-17,40.5,WWW.GOJUMPIN.COM,INTERNET,40.5,GBP,1.0,kids
3,12-04-17,13-04-17,23.9,AMFBOWLING.CO.UK,01442 840200,23.9,GBP,1.0,entertainment
4,12-04-17,13-04-17,24.28,VUE BSL LTD,LONDON,24.28,GBP,1.0,general
5,12-04-17,13-04-17,93.92,B & M RETAIL,TOTTENHAM,93.92,GBP,1.0,shopping
6,15-04-17,17-04-17,58,WILKO.COM,0845 6080807,58,GBP,1.0,shopping
7,18-04-17,19-04-17,12,TESCO STORES 2296,COLNEY HATCH,12,GBP,1.0,shopping
8,18-04-17,19-04-17,11.5,ASDA SUPERSTORE,S'GATE CIRCUS,11.5,GBP,1.0,shopping
9,18-04-17,19-04-17,20.72,ASDA SUPERSTORE,S'GATE CIRCUS,20.72,GBP,1.0,shopping


# Pre-process data

In [96]:
# Retain only rows where target is not null
data_target = data[~data['target'].isnull()]

# Convert column names to lower case
data_target.columns = [str(x).lower().replace(' ', '_') for x in data_target.columns]

# Extract relevant columns
data_target = data_target[['mch.merchant_name', 'target']]

# Drop duplicate rows
print("len before drop duplicates = " + str(len(data_target)))
data_target.drop_duplicates(inplace = True)
print("len after drop duplicates = " + str(len(data_target)))

# Convert '.' to ' '
data_target['mch.merchant_name_split'] = data_target.apply(lambda row: row['mch.merchant_name'].replace('.', ' '), axis=1)

data_target.reset_index(inplace=True, drop=True)
data_target

len before drop duplicates = 100
len after drop duplicates = 58


Unnamed: 0,mch.merchant_name,target,mch.merchant_name_split
0,TESCO STORE 2296,shopping,TESCO STORE 2296
1,AMFBOWLING.CO.UK,entertainment,AMFBOWLING CO UK
2,WWW.GOJUMPIN.COM,kids,WWW GOJUMPIN COM
3,VUE BSL LTD,general,VUE BSL LTD
4,B & M RETAIL,shopping,B & M RETAIL
5,WILKO.COM,shopping,WILKO COM
6,TESCO STORES 2296,shopping,TESCO STORES 2296
7,ASDA SUPERSTORE,shopping,ASDA SUPERSTORE
8,HOMEBASE LTD 024,shopping,HOMEBASE LTD 024
9,AMAZON UK RETAIL AMAZO,shopping,AMAZON UK RETAIL AMAZO


# Tfidf

In [97]:
# Max_features for TfidfVectorizer
max_features = 25 

# Feature extraction
merchant_name_vec = TfidfVectorizer(max_features=max_features).fit_transform(data_target['mch.merchant_name_split'].values)
txtFeatures = pd.DataFrame(merchant_name_vec.toarray())
txtFeatures

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.733112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.680107,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.722339,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.691539,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.687768,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.725931
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.591248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.591248,0.5485,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,0.0,0.521554,0.0


In [98]:
# Loop through the features and add it to the dataframe
cols = txtFeatures.columns # RangeIndex(start=0, stop=25, step=1)
for i in range(max_features):
    data_target['merchant_name_tfidf_' + str(i)] = txtFeatures[cols[i]]
data_target    


Unnamed: 0,mch.merchant_name,target,mch.merchant_name_split,merchant_name_tfidf_0,merchant_name_tfidf_1,merchant_name_tfidf_2,merchant_name_tfidf_3,merchant_name_tfidf_4,merchant_name_tfidf_5,merchant_name_tfidf_6,...,merchant_name_tfidf_15,merchant_name_tfidf_16,merchant_name_tfidf_17,merchant_name_tfidf_18,merchant_name_tfidf_19,merchant_name_tfidf_20,merchant_name_tfidf_21,merchant_name_tfidf_22,merchant_name_tfidf_23,merchant_name_tfidf_24
0,TESCO STORE 2296,shopping,TESCO STORE 2296,0.733112,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.680107,0.0,0.0,0.0,0.0
1,AMFBOWLING.CO.UK,entertainment,AMFBOWLING CO UK,0.0,0.0,0.0,0.0,0.0,0.722339,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.691539,0.0
2,WWW.GOJUMPIN.COM,kids,WWW GOJUMPIN COM,0.0,0.0,0.0,0.0,0.0,0.0,0.687768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.725931
3,VUE BSL LTD,general,VUE BSL LTD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,B & M RETAIL,shopping,B & M RETAIL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,WILKO.COM,shopping,WILKO COM,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,TESCO STORES 2296,shopping,TESCO STORES 2296,0.591248,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.591248,0.5485,0.0,0.0,0.0,0.0
7,ASDA SUPERSTORE,shopping,ASDA SUPERSTORE,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,HOMEBASE LTD 024,shopping,HOMEBASE LTD 024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,AMAZON UK RETAIL AMAZO,shopping,AMAZON UK RETAIL AMAZO,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,...,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,0.0,0.521554,0.0


In [99]:
# Drop the item name and just keep the encoded features
data_target.drop('mch.merchant_name', axis=1, inplace=True)
data_target.drop('mch.merchant_name_split', axis=1, inplace=True)
data_target

Unnamed: 0,target,merchant_name_tfidf_0,merchant_name_tfidf_1,merchant_name_tfidf_2,merchant_name_tfidf_3,merchant_name_tfidf_4,merchant_name_tfidf_5,merchant_name_tfidf_6,merchant_name_tfidf_7,merchant_name_tfidf_8,...,merchant_name_tfidf_15,merchant_name_tfidf_16,merchant_name_tfidf_17,merchant_name_tfidf_18,merchant_name_tfidf_19,merchant_name_tfidf_20,merchant_name_tfidf_21,merchant_name_tfidf_22,merchant_name_tfidf_23,merchant_name_tfidf_24
0,shopping,0.733112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.680107,0.0,0.0,0.0,0.0
1,entertainment,0.0,0.0,0.0,0.0,0.0,0.722339,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.691539,0.0
2,kids,0.0,0.0,0.0,0.0,0.0,0.0,0.687768,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.725931
3,general,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,shopping,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,shopping,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,shopping,0.591248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.591248,0.5485,0.0,0.0,0.0,0.0
7,shopping,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,shopping,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,shopping,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,0.0,0.521554,0.0


# Build model

In [100]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [101]:
# Convert target to categorical variable
data_target['target'], target_uniques = pd.factorize(data_target['target']) 
print('target_uniques = ' + str(target_uniques))
data_target

target_uniques = Index(['shopping', 'entertainment', 'kids', 'general', 'food', 'travel',
       'health', 'foods', 'sports'],
      dtype='object')


Unnamed: 0,target,merchant_name_tfidf_0,merchant_name_tfidf_1,merchant_name_tfidf_2,merchant_name_tfidf_3,merchant_name_tfidf_4,merchant_name_tfidf_5,merchant_name_tfidf_6,merchant_name_tfidf_7,merchant_name_tfidf_8,...,merchant_name_tfidf_15,merchant_name_tfidf_16,merchant_name_tfidf_17,merchant_name_tfidf_18,merchant_name_tfidf_19,merchant_name_tfidf_20,merchant_name_tfidf_21,merchant_name_tfidf_22,merchant_name_tfidf_23,merchant_name_tfidf_24
0,0,0.733112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.680107,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.722339,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.691539,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.687768,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.725931
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0.591248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.591248,0.5485,0.0,0.0,0.0,0.0
7,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.603316,0.0,0.0,0.0,0.0,0.0,0.0,0.521554,0.0


In [102]:
# Features
features = []
for i in range(max_features):
    features.append('merchant_name_tfidf_' + str(i))

In [103]:
# train_test split
train, test = train_test_split(data_target, test_size = 0.3)
X_train = train[features]
X_sample = test[features]
y_train = train['target']
y_sample = test['target']
print("X_train.shape = " + str(X_train.shape))
print("y_train.shape = " + str(y_train.shape))
print("X_sample.shape = " + str(X_sample.shape))
print("y_sample.shape = " + str(y_sample.shape))

X_train.shape = (40, 25)
y_train.shape = (40,)
X_sample.shape = (18, 25)
y_sample.shape = (18,)


In [104]:
# Create a random forest classifier. By convention, clf means 'classifier'
clf = RandomForestClassifier(n_jobs=2)

# Train the classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [105]:
# Apply the classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(X_sample)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 3, 0, 2, 0])

In [106]:
target_uniques

Index(['shopping', 'entertainment', 'kids', 'general', 'food', 'travel',
       'health', 'foods', 'sports'],
      dtype='object')

# Evaluate classifier

In [108]:
# Create actual english names for each predicted class
preds = target_uniques[clf.predict(X_sample)]
preds

Index(['shopping', 'shopping', 'shopping', 'shopping', 'shopping', 'shopping',
       'shopping', 'shopping', 'shopping', 'kids', 'kids', 'shopping',
       'shopping', 'kids', 'general', 'shopping', 'kids', 'shopping'],
      dtype='object')

In [109]:
# Create confusion matrix
pd.crosstab(target_uniques[y_sample], preds, rownames=['Actual'], colnames=['Predicted'])

Predicted,general,kids,shopping
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
entertainment,1,0,0
food,0,0,3
general,0,0,2
shopping,0,4,8


In [110]:
print(classification_report(target_uniques[y_sample], preds))

               precision    recall  f1-score   support

entertainment       0.00      0.00      0.00         1
         food       0.00      0.00      0.00         3
      general       0.00      0.00      0.00         2
         kids       0.00      0.00      0.00         0
     shopping       0.62      0.67      0.64        12

     accuracy                           0.44        18
    macro avg       0.12      0.13      0.13        18
 weighted avg       0.41      0.44      0.43        18



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [111]:
print(accuracy_score(target_uniques[y_sample], preds))

0.4444444444444444
