In [98]:
import numpy as np
import pandas as pd
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import Binarizer

In [64]:
df = pd.read_csv('A1 - data_sample.csv')

In [65]:
nlp = spacy.load('en_core_web_sm')

In [66]:
import spacy
from spacy.lang.en.examples import sentences 

nlp = spacy.load("en_core_web_sm")
doc = nlp(sentences[0])
print(doc)
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)
    break

Apple is looking at buying U.K. startup for $1 billion
Apple is looking at buying U.K. startup for $1 billion
Apple PROPN nsubj


In [67]:
sentences[0]

'Apple is looking at buying U.K. startup for $1 billion'

In [68]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [69]:
def preprocess_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'])
    return text

In [70]:
df['clean_name'] = df['plaid_dirty_name'].apply(preprocess_text)

In [71]:
df.head()

Unnamed: 0,id,original_amount,txn_date,plaid_category,plaid_dirty_name,clean_name
0,1312,3.5,2019-11-20,['Food and Drink'],PepsiCo,pepsico
1,97452,5.09,2020-11-04,"['Travel', 'Gas Stations']",FOOD SMAR 2901 W ARMIT CHICAGO IL 11/04,food smar 2901 w armit chicago il 1104
2,39582,9.39,2019-05-01,"['Food and Drink', 'Restaurants', 'Fast Food']",Chick-fil-A,chickfila
3,97707,10.2,2020-09-14,"['Travel', 'Gas Stations']",FOOD SMAR 2901 W ARMIT CHICAGO IL 09/13,food smar 2901 w armit chicago il 0913
4,90662,53.4,2020-11-06,"['Shops', 'Pets']",CHEWY.COM POS FL US XX4330,chewycom pos fl us xx4330


In [72]:
def tokenize_and_pos(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]
    return tokens, pos_tags

In [73]:
df['tokens'], df['pos_tags'] = zip(*df['clean_name'].apply(tokenize_and_pos))

In [74]:
df.head()

Unnamed: 0,id,original_amount,txn_date,plaid_category,plaid_dirty_name,clean_name,tokens,pos_tags
0,1312,3.5,2019-11-20,['Food and Drink'],PepsiCo,pepsico,[pepsico],[PROPN]
1,97452,5.09,2020-11-04,"['Travel', 'Gas Stations']",FOOD SMAR 2901 W ARMIT CHICAGO IL 11/04,food smar 2901 w armit chicago il 1104,"[food, smar, 2901, w, armit, chicago, il, 1104]","[NOUN, PROPN, NUM, PROPN, PROPN, PROPN, PROPN,..."
2,39582,9.39,2019-05-01,"['Food and Drink', 'Restaurants', 'Fast Food']",Chick-fil-A,chickfila,[chickfila],[PROPN]
3,97707,10.2,2020-09-14,"['Travel', 'Gas Stations']",FOOD SMAR 2901 W ARMIT CHICAGO IL 09/13,food smar 2901 w armit chicago il 0913,"[food, smar, 2901, w, armit, chicago, il, 0913]","[NOUN, PROPN, NUM, PROPN, PROPN, PROPN, PROPN,..."
4,90662,53.4,2020-11-06,"['Shops', 'Pets']",CHEWY.COM POS FL US XX4330,chewycom pos fl us xx4330,"[chewycom, pos, fl, us, xx4330]","[PROPN, PROPN, ADP, PRON, PROPN]"


In [75]:
tfidf_vectorizer = TfidfVectorizer()

In [76]:
X = tfidf_vectorizer.fit_transform(df['clean_name'])

In [77]:
y = df['clean_name']

In [78]:
type(X), type(y)

(scipy.sparse._csr.csr_matrix, pandas.core.series.Series)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
X_test.shape

(729, 857)

In [83]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

In [84]:
y_pred = log_reg.predict(X_test)

In [85]:
accuracy = accuracy_score(y_test, y_pred)

In [86]:
accuracy

0.7462277091906722

In [87]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
                                                                      precision    recall  f1-score   support

                                                    aden food market       1.00      1.00      1.00        25
                                                       ally cashback       0.95      1.00      0.98        21
                       ally cashback future amount 19377  tran achdw       0.00      0.00      0.00         1
                                                             audible       1.00      1.00      1.00        21
                                                           chewy inc       0.67      1.00      0.80         4
                                                            chewycom       0.92      1.00      0.96        12
                                                    chewycom fl 0702       0.00      0.00      0.00         1
                                                            chickfil       1.00      1.00      1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
unique_merchants, counts = np.unique(y_pred, return_counts=True)
merchant_freq = dict(zip(unique_merchants, counts))

In [89]:
merchant_freq

{'aden food market': 25,
 'ally cashback': 22,
 'audible': 21,
 'chewy inc': 6,
 'chewycom': 13,
 'chickfil': 1,
 'chickfila': 143,
 'cumberland farms': 15,
 'dollar general': 20,
 'dollar tree': 15,
 'doordashchickfil wwwdoordash': 2,
 'duane reade': 34,
 'globalpok gold coins ta xbiex 0620': 1,
 'globalpok gold coins ta xbiex 0706': 1,
 'globalpok gold coins ta xbiex 1016': 3,
 'globalpok gold coins ta xbiex 1022': 3,
 'hovercom': 12,
 'jewelosco': 2,
 'marathon petro': 1,
 'marathon petro191684': 1,
 'metropolitan transportation authority': 19,
 'panda express': 13,
 'pepsico': 8,
 'point of sale debit ross store 57': 2,
 'pos debit food smar 2901 w armit chicago il 00nn09742': 25,
 'pos debit globalpok gold coin gpvgwco': 6,
 'pos debit globalpok gold coins ta xbiex': 21,
 'pos debit jewel osco 3474': 10,
 'pos debit nycdot parking meters': 40,
 'quiktrip': 12,
 'ross dress for less': 7,
 'sephora': 29,
 'sheetz': 11,
 'smoothie king': 1,
 'spothero 8443568054': 4,
 'spothero https

In [96]:
binarizer = Binarizer()
X_bin = binarizer.fit_transform(X)

<3644x857 sparse matrix of type '<class 'numpy.float64'>'
	with 10214 stored elements in Compressed Sparse Row format>

In [93]:
X_train_cnb, X_test_cnb, y_train_cnb, y_test_cnb = train_test_split(X_bin, y, test_size=0.2, random_state=42)

In [94]:
cnb = CategoricalNB()
cnb.fit(X_train_cnb, y_train_cnb)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [56]:
y_pred_cnb = cnb.predict([X_test])

ValueError: Found array with dim 3. CategoricalNB expected <= 2.

In [38]:
print('Total Unique Values:', len(merchant_freq))

Total Unique Values: 47
