# Importing Important Packages

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn import tree
from sklearn import preprocessing

In [13]:
DATA_PATH = '../data'
MODEL_PATH = '../src/models'

# Install Cleaned Dataset

In [14]:
df = pd.read_parquet(f'{DATA_PATH}/processed/data_clean.pqt')
df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,year,month,day
2,0,acc_0,TST CASA DEL RIO EXP FAIRLAWN OH,18.42,2022-09-26,FOOD_AND_BEVERAGES,2022,9,26
4,0,acc_0,BUFFALO WILD WINGS,26.47,2022-09-12,FOOD_AND_BEVERAGES,2022,9,12
6,0,acc_0,OCULUS CA,11.73,2022-04-18,GENERAL_MERCHANDISE,2022,4,18
7,0,acc_0,LOS GIRASOLES STOW OH,30.04,2022-03-09,FOOD_AND_BEVERAGES,2022,3,9
8,0,acc_0,BUZZIS LAUNDRY OH,4.16,2022-03-29,GENERAL_MERCHANDISE,2022,3,29


# Preprocess Data for Category Categorization

In [15]:
data = df[['category', 'memo', 'amount', 'year', 'month', 'day']].copy()
data.head()

Unnamed: 0,category,memo,amount,year,month,day
2,FOOD_AND_BEVERAGES,TST CASA DEL RIO EXP FAIRLAWN OH,18.42,2022,9,26
4,FOOD_AND_BEVERAGES,BUFFALO WILD WINGS,26.47,2022,9,12
6,GENERAL_MERCHANDISE,OCULUS CA,11.73,2022,4,18
7,FOOD_AND_BEVERAGES,LOS GIRASOLES STOW OH,30.04,2022,3,9
8,GENERAL_MERCHANDISE,BUZZIS LAUNDRY OH,4.16,2022,3,29


In [16]:
categories = data['category'].unique()
categories.sort()
cat_dict = dict(zip(categories, np.arange(len(categories))))
cat_dict

{'EDUCATION': 0,
 'FOOD_AND_BEVERAGES': 1,
 'GENERAL_MERCHANDISE': 2,
 'GROCERIES': 3,
 'MORTGAGE': 4,
 'OVERDRAFT': 5,
 'PETS': 6,
 'RENT': 7,
 'TRAVEL': 8}

In [17]:
data['category_label'] = data['category'].map(cat_dict)
data.head()

Unnamed: 0,category,memo,amount,year,month,day,category_label
2,FOOD_AND_BEVERAGES,TST CASA DEL RIO EXP FAIRLAWN OH,18.42,2022,9,26,1
4,FOOD_AND_BEVERAGES,BUFFALO WILD WINGS,26.47,2022,9,12,1
6,GENERAL_MERCHANDISE,OCULUS CA,11.73,2022,4,18,2
7,FOOD_AND_BEVERAGES,LOS GIRASOLES STOW OH,30.04,2022,3,9,1
8,GENERAL_MERCHANDISE,BUZZIS LAUNDRY OH,4.16,2022,3,29,2


In [18]:
X_train, X_test, y_train, y_test = train_test_split(data[['memo']], data['category_label'], test_size=0.2, random_state=707)

In [19]:
train_data = X_train.merge(y_train, left_index=True, right_index=True)

train_balanced = train_data.groupby('category_label').apply(lambda x: x.sample(50000, replace=True)).reset_index(drop=True)
train_balanced.head()

Unnamed: 0,memo,category_label
0,LOYOLA ATHLETICS IL,0
1,MSB PASCO COUNTY SCHOO LAND O LAKES FL USA,0
2,SCHOOL DISTRICT OF MA FL,0
3,LSA * LAW SCHOOL ADMSSN PA,0
4,SERRA CATHOLIC FACTS MARGUERITE ABDELMALEK,0


In [20]:
# X_train, X_valid, y_train, y_valid = train_test_split(train_balanced[['memo']], train_balanced['category_label'], test_size=0.2, random_state=707)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=707)

# Data Scaling Preprocessing and TF-IDF Vectorization

In [21]:
vectorizer = TfidfVectorizer()
vectorizer.fit(np.array(train_data['memo']))

In [22]:
X_train = vectorizer.transform(np.array(X_train['memo']))
y_train = np.array(y_train)

X_valid = vectorizer.transform(np.array(X_valid['memo']))
y_valid = np.array(y_valid)

X_test = vectorizer.transform(np.array(X_test['memo']))
y_test = np.array(y_test)

# Model 1: Logistic Regression

In [13]:
model1 = linear_model.LogisticRegression()
model1.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
def test_accuracy(model, X_train, y_train, X_valid, y_valid, X_test, y_test):
    print(f"Training Accuracy: {sum(model.predict(X_train) == y_train) / y_train.shape[0]}")
    print(f"Validation Accuracy: {sum(model.predict(X_valid) == y_valid) / y_valid.shape[0]}")
    print(f"Testing Accuracy: {sum(model.predict(X_test) == y_test) / y_test.shape[0]}")

In [15]:
test_accuracy(model1, X_train, y_train, X_valid, y_valid, X_test, y_test)

Training Accuracy: 0.976508381491829
Validation Accuracy: 0.9724493261829472
Testing Accuracy: 0.9728425395440333


# Model 2: Stochastic Gradient Descent

In [16]:
model2 = linear_model.SGDClassifier(loss='perceptron', shuffle=True, random_state=183)
model2.fit(X_train, y_train)

In [17]:
test_accuracy(model2, X_train, y_train, X_valid, y_valid, X_test, y_test)

Training Accuracy: 0.9873990585173562
Validation Accuracy: 0.978237885884047
Testing Accuracy: 0.978192896043109


# Model 3: Decision Tree Classifier

In [18]:
model3 = tree.DecisionTreeClassifier(
    max_depth=int(np.sqrt(X_train.shape[0])),
    min_samples_leaf=int(X_train.shape[0] ** (1/4))
)
model3.fit(X_train, y_train)

In [19]:
test_accuracy(model3, X_train, y_train, X_valid, y_valid, X_test, y_test)

Training Accuracy: 0.9543299590493322
Validation Accuracy: 0.9505915333942487
Testing Accuracy: 0.9484980347581815
