# Classifier model for personal spendings

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Set Env variables

In [2]:
from pathlib import Path
root_dir = Path('.')

In [3]:
test_size = 0.3

## Load dataset

In [4]:
from data_loader import load_treated_dataset
from feature_engineering import create_new_features

complete_dataset = load_treated_dataset(root_dir)
complete_dataset = create_new_features(complete_dataset)

## Split test and train

In [6]:
from training import split_train_test
train, test = split_train_test(complete_dataset, test_size=.3)

## Pre-process

In [77]:
from sklearn import preprocessing

In [78]:
NUMERICAL_FEATURE = ["Valor", "day", "month"]
CATEGORICAL_FEATURE = ['pix', 'uber', 'ifd', 'pag', 'pg', 'aplicação',
       'salário', 'light']
TEXT_FEATURE = "Descrição"
TARGET = "categoria"
FEATURES = [TEXT_FEATURE, *NUMERICAL_FEATURE, *CATEGORICAL_FEATURE]

### Standardize numerical columns

In [None]:
# scaler = preprocessing.StandardScaler().fit(pd.DataFrame(X_train["Valor"]))

# X_train_standardized = scaler.transform(pd.DataFrame(X_train["Valor"])) 

In [None]:
# X_test_standardized =  scaler.transform(pd.DataFrame(X_test["Valor"])) 

In [None]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    Normalizer,
)

def preprocess_number():
    return make_pipeline(
        # SimpleImputer(strategy="median"),
        StandardScaler(),
    )

def preprocess_categories():
    return make_pipeline(
       Normalizer(),
    )

def create_preprocessor():
    transformers = [
        ("num_preprocessor", preprocess_number(), [*NUMERICAL_FEATURE]),
        ("cat_preprocessor", preprocess_categories(), [*CATEGORICAL_FEATURE]),
    ]

    return ColumnTransformer(transformers=transformers, remainder="drop")

# numeric_transformer = create_preprocessor()
# numeric_transformer.set_output(transform="pandas")
# preprocessed_num_cat_features_df = numeric_transformer.fit_transform(
#     X_train[[*NUMERICAL_FEATURE, *CATEGORICAL_FEATURE]]
# )

## Tokenize description column

In [None]:
# tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(pd.DataFrame(X_train['Descrição'])) 


# # X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(pd.DataFrame(X_test['Descrição'])) 

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

MODEL_NAME = "distilbert-base-uncased"

def tokenized_pytorch_tensors(
        df: pd.DataFrame,
        column_list: list
    ) -> Dataset:

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    transformers_dataset = Dataset.from_pandas(df)

    def tokenize(model_inputs_batch: Dataset) -> Dataset:
        return tokenizer(
            model_inputs_batch[TEXT_FEATURE],
            padding=True,
            max_length=120,
            truncation=True,
        )

    tokenized_dataset = transformers_dataset.map(
        tokenize,
        batched=True,
        batch_size=128
    )

    tokenized_dataset.set_format(
        "torch",
        columns=column_list
    )
    
    columns_to_remove = set(tokenized_dataset.column_names) - set(column_list)

    tokenized_dataset = tokenized_dataset.remove_columns(list(columns_to_remove))

    return tokenized_dataset

print("Tokenize text in Dataset of Pytorch tensors")
X_train[TEXT_FEATURE] = X_train[TEXT_FEATURE].fillna("")
tokenized_df = tokenized_pytorch_tensors(
    X_train[[TEXT_FEATURE]],
    column_list=["input_ids", "attention_mask"]
)

Tokenize text in Dataset of Pytorch tensors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[TEXT_FEATURE] = X_train[TEXT_FEATURE].fillna("")
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 686/686 [00:00<00:00, 11879.58 examples/s]


In [None]:
import torch
from transformers import AutoModel

def hidden_state_from_text_inputs(df) -> pd.DataFrame:

    def extract_hidden_states(batch):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME)

        inputs = {
            k: v.to(device)
            for k, v in batch.items()
            if k in tokenizer.model_input_names
        }

        with torch.no_grad():
            last_hidden_state = model(**inputs).last_hidden_state
            # get the CLS token, which is the first one
            # [:, 0] gives us a row for each batch with the first column of 768 for each
            return {"cls_hidden_state": last_hidden_state[:, 0].cpu().numpy()}

    cls_dataset = df.map(extract_hidden_states, batched=True, batch_size=128)
    cls_dataset.set_format(type="pandas")

    return pd.DataFrame(
        cls_dataset["cls_hidden_state"].to_list(),
        columns=[f"feature_{n}" for n in range(1, 769)],
    )

print("Extract text feature hidden state")
hidden_states_df = hidden_state_from_text_inputs(tokenized_df)
print(f"Data with hidden state shape: {hidden_states_df.shape}") 

Extract text feature hidden state


Map:   0%|          | 0/686 [00:00<?, ? examples/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Map: 100%|██████████| 686/686 [00:20<00:00, 33.62 examples/s]

Data with hidden state shape: (686, 768)





In [None]:
X_train[TEXT_FEATURE] = X_train[TEXT_FEATURE].fillna("")
tokenized_df = tokenized_pytorch_tensors(
    X_train[[TEXT_FEATURE]],
    column_list=["input_ids", "attention_mask"]
)
print("Extract text feature hidden state")
hidden_states_df = hidden_state_from_text_inputs(tokenized_df)
print(f"Data with hidden state shape: {hidden_states_df.shape}") 

text_transformer = Pipeline(
    steps=[
        ("encoder", LabelEncoder()),
    ]
)

## Classification encoder

In [None]:
# preprocessing
TARGET_CATEGORIES = y_train.unique().tolist()
from sklearn.preprocessing import LabelEncoder
classification_encoder = LabelEncoder().fit(TARGET_CATEGORIES)
# y_train_encoded = classification_encoder.transform(y_train)

In [None]:
classification_transformer = Pipeline(
    steps=[
        ("encoder", LabelEncoder()),
    ]
)

### Group all data 

In [None]:
# print("Saving preprocessed features and targets")

# preprocessed_data = pd.concat(
#     [
#         preprocessed_num_cat_features_df.reset_index(drop=True),
#         hidden_states_df.reset_index(drop=True),
#         pd.DataFrame(y_train_encoded).reset_index(drop=True)
#     ],
#     axis=1
# )

# preprocessed_data.rename(columns={0: TARGET}, inplace=True)

Saving preprocessed features and targets


## Preprocessing Pipeline

In [164]:
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_selector as selector


X_y_train = X_train.merge(y_train, left_index=True, right_index=True)
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric_transformer", preprocess_number, NUMERICAL_FEATURE),
        ("text_transformer", text_transformer, TEXT_FEATURE),
        ("cat_preprocessor", preprocess_categories, CATEGORICAL_FEATURE),
        ("classification_transformer", classification_transformer, TARGET),
    ]
)
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
clf

NameError: name 'text_transformer' is not defined

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create the model
gbm = GradientBoostingClassifier(
    n_estimators=100,    # Number of boosting stages
    learning_rate=0.1,   # Step size shrinkage
    max_depth=3,         # Maximum depth of each tree
    random_state=42,
)

# Fit the model
gbm.fit(preprocessed_data[[i for i in preprocessed_data if i!=TARGET]].fillna(0), preprocessed_data[TARGET])

In [143]:
X_test[TEXT_FEATURE] = X_test[TEXT_FEATURE].fillna("")
tokenized_df = tokenized_pytorch_tensors(
    X_test[[TEXT_FEATURE]],
    column_list=["input_ids", "attention_mask"]
)
preprocessed_num_cat_features_df = column_transformer.fit_transform(
    X_test[[*NUMERICAL_FEATURE, *CATEGORICAL_FEATURE]]
)
hidden_states_df = hidden_state_from_text_inputs(tokenized_df)
y_test_encoded = classification_encoder.transform(y_test)

preprocessed_data = pd.concat(
    [
        preprocessed_num_cat_features_df.reset_index(drop=True),
        hidden_states_df.reset_index(drop=True),
        pd.DataFrame(y_test_encoded).reset_index(drop=True)
    ],
    axis=1
)

preprocessed_data.rename(columns={0: TARGET}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[TEXT_FEATURE] = X_test[TEXT_FEATURE].fillna("")
Map: 100%|██████████| 172/172 [00:00<00:00, 8309.38 examples/s]
Map: 100%|██████████| 172/172 [00:04<00:00, 42.24 examples/s]


In [146]:
# Predict
y_pred = gbm.predict(preprocessed_data[[i for i in preprocessed_data if i!=TARGET]])
y_test = preprocessed_data[TARGET]

In [147]:
# Evaluate
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6104651162790697


In [None]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix

# multilabel_confusion_matrix(y_test, y_pred)
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [148]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

## Train Model

In [None]:
from sklearn.linear_model import RidgeClassifier
X_train = preprocessed_data[[i for i in preprocessed_data if i!=TARGET]].fillna(0)
y_train = preprocessed_data[TARGET]


clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [None]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_test_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
#Pre-processing the new dataset
df_test['clean_text'] = df_test['text'].apply(lambda x: finalpreprocess(x)) #preprocess the data
X_test=df_test['clean_text'] 
#converting words to numerical data using tf-idf
X_vector=tfidf_vectorizer.transform(X_test)
#use the best model to predict 'target' value for the new dataset 
y_predict = lr_tfidf.predict(X_vector)      
y_prob = lr_tfidf.predict_proba(X_vector)[:,1]
df_test['predict_prob']= y_prob
df_test['target']= y_predict
final=df_test[['clean_text','target']].reset_index(drop=True)
print(final.head())

In [None]:
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

# Test