# Classifier model for personal spendings

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Set Env variables

In [2]:
from pathlib import Path
root_dir = Path('.')

In [3]:
test_size = 0.3

## Load dataset

In [4]:
from data_loader import load_treated_dataset

complete_dataset = load_treated_dataset(root_dir)

## Split test and train

In [5]:
from training import split_train_test
train, test = split_train_test(complete_dataset, test_size=test_size)

## Build Model Pipeline

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import LabelEncoder
from preprocess import get_preprocessing_transformer


clf = Pipeline(
    steps=[("preprocessor", get_preprocessing_transformer()), ("classifier", LogisticRegression())],
    verbose=False
)

### Encode y classifications

In [None]:
y_train =train['categoria']
y_enconder = LabelEncoder().fit(y_train)
y_train_encoded = y_enconder.transform(y_train)

### Fit Pipeline

In [None]:
X_train= train[[i for i in train.columns if i!='categoria']]

clf.fit(X_train, y_train_encoded)

# Validate Model

In [None]:
X_test = test[[i for i in train.columns if i!='categoria']]
y_test = test['categoria']
y_test_encoded = y_enconder.transform(y_test)
print("model score: %.3f" % clf.score(X_test, y_test_encoded))
clf

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create the model
gbm = GradientBoostingClassifier(
    n_estimators=100,    # Number of boosting stages
    learning_rate=0.1,   # Step size shrinkage
    max_depth=3,         # Maximum depth of each tree
    random_state=42,
)

# Fit the model
gbm.fit(preprocessed_data[[i for i in preprocessed_data if i!=TARGET]].fillna(0), preprocessed_data[TARGET])

In [143]:
X_test[TEXT_FEATURE] = X_test[TEXT_FEATURE].fillna("")
tokenized_df = tokenized_pytorch_tensors(
    X_test[[TEXT_FEATURE]],
    column_list=["input_ids", "attention_mask"]
)
preprocessed_num_cat_features_df = column_transformer.fit_transform(
    X_test[[*NUMERICAL_FEATURE, *CATEGORICAL_FEATURE]]
)
hidden_states_df = hidden_state_from_text_inputs(tokenized_df)
y_test_encoded = classification_encoder.transform(y_test)

preprocessed_data = pd.concat(
    [
        preprocessed_num_cat_features_df.reset_index(drop=True),
        hidden_states_df.reset_index(drop=True),
        pd.DataFrame(y_test_encoded).reset_index(drop=True)
    ],
    axis=1
)

preprocessed_data.rename(columns={0: TARGET}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[TEXT_FEATURE] = X_test[TEXT_FEATURE].fillna("")
Map: 100%|██████████| 172/172 [00:00<00:00, 8309.38 examples/s]
Map: 100%|██████████| 172/172 [00:04<00:00, 42.24 examples/s]


In [146]:
# Predict
y_pred = gbm.predict(preprocessed_data[[i for i in preprocessed_data if i!=TARGET]])
y_test = preprocessed_data[TARGET]

In [147]:
# Evaluate
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6104651162790697


In [None]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix

# multilabel_confusion_matrix(y_test, y_pred)
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [148]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

## Train Model

In [None]:
from sklearn.linear_model import RidgeClassifier
X_train = preprocessed_data[[i for i in preprocessed_data if i!=TARGET]].fillna(0)
y_train = preprocessed_data[TARGET]


clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [None]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_test_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
#Pre-processing the new dataset
df_test['clean_text'] = df_test['text'].apply(lambda x: finalpreprocess(x)) #preprocess the data
X_test=df_test['clean_text'] 
#converting words to numerical data using tf-idf
X_vector=tfidf_vectorizer.transform(X_test)
#use the best model to predict 'target' value for the new dataset 
y_predict = lr_tfidf.predict(X_vector)      
y_prob = lr_tfidf.predict_proba(X_vector)[:,1]
df_test['predict_prob']= y_prob
df_test['target']= y_predict
final=df_test[['clean_text','target']].reset_index(drop=True)
print(final.head())

In [None]:
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

# Test