In [2]:
import pandas as pd
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import confusion_matrix

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

## General reusable code: train, compare and save one-vs-others classifiers for each intent

In [4]:
# Train one binary "<intent> vs. others" text classifier per intent.
#
# For every intent the cell:
#   1. loads "<intent>.csv" as the positive class and every other file in
#      dir_path as the "others" class,
#   2. fits a TF-IDF vectorizer on the training split,
#   3. fits every estimator in `models` and reports its metrics,
#   4. saves the estimator with the highest F1 on the positive class, together
#      with the fitted vectorizer, using joblib.
#
# NOTE(review): the CSV files are assumed to contain a "text" column and an
# "intent" column whose value equals the file's intent name (it is used as
# `pos_label` below) -- confirm against the datasets.

intents = ["greeting", "live_agent", "coe"]

# Directory holding the per-intent CSV files.
dir_path = r'./datasets_post/'

# Output directories for the persisted estimators / vectorizers.
models_dir = './models/'
vectorizers_dir = './vectorizers/'

# Every regular file found in dir_path. os.listdir() returns entries in
# arbitrary order, so the names are sorted to keep the row order of the
# concatenated "others" frame (and therefore the seeded split) reproducible.
files = sorted(
    entry for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
)

# Candidate estimators, all with default hyper-parameters. The same instances
# are re-fitted for every intent.
models = [
    LogisticRegression(),
    MLPClassifier(),
    KNeighborsClassifier(),
    SVC(),
    GaussianProcessClassifier(),
    QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB()
]


def _clean_frame(frame):
    """Return `frame` without duplicate rows, rows containing missing values,
    or rows whose 'text' is an empty string, re-indexed from 0."""
    frame = frame.drop_duplicates().dropna()
    frame = frame[frame['text'] != '']
    return frame.reset_index(drop=True)


for intent in intents:
    print("\n\n\n\n\nINTENT:" + intent)

    current_intent_file = intent + ".csv"

    # Every other file in the directory contributes to the negative class.
    others_files = [
        os.path.join(dir_path, name)
        for name in files
        if name != current_intent_file
    ]

    # Positive class: rows of the current intent.
    intent_df = _clean_frame(
        pd.read_csv(os.path.join(dir_path, current_intent_file), skip_blank_lines=True, encoding="latin1")
    )

    # Negative class: rows of all other files, relabelled as "others".
    # The frames are collected first and concatenated once instead of
    # growing a DataFrame inside the loop.
    others_frames = []
    for file in others_files:
        other_df = pd.read_csv(file, skip_blank_lines=True, encoding="latin1")
        other_df["intent"] = "others"
        others_frames.append(other_df)

    if others_frames:
        others_df = pd.concat(others_frames, ignore_index=True)
    else:
        # pd.concat() raises on an empty list; fall back to an empty frame.
        others_df = pd.DataFrame(columns=["text", "intent"])
    others_df = _clean_frame(others_df)

    # Optional down-sampling of "others" to the intent's size (currently disabled):
    # others_df = others_df.sample(n=intent_df.shape[0], ignore_index=True)
    print(intent + " rows:", intent_df.shape[0], "\tothers rows:", others_df.shape[0])

    df = pd.concat([intent_df, others_df], ignore_index=True)
    print(df["intent"].value_counts())

    # Split into training and testing sets (80/20, fixed seed).
    x_train, x_test, y_train, y_test = train_test_split(df["text"], df["intent"], test_size=0.2, random_state=1)

    # The vocabulary is learned from the training split only; the test split
    # is merely transformed with it.
    intent_vectorizer = TfidfVectorizer()

    x_values_list = intent_vectorizer.fit_transform(x_train).toarray()
    x_train = pd.DataFrame(x_values_list, columns=intent_vectorizer.get_feature_names_out())

    x_test_list = intent_vectorizer.transform(x_test).toarray()
    x_test = pd.DataFrame(x_test_list, columns=intent_vectorizer.get_feature_names_out())

    # Start below any achievable F1 so that a model is always selected, even
    # when every candidate scores 0.
    intent_best_model = {'model': None, "f1": -1.0}

    for model in models:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        print('\n\n\n' + str(model))
        # Confusion matrix of the test predictions (printed below).
        cf_matrix = confusion_matrix(y_test, y_pred)

        print(classification_report(y_test, y_pred))
        # F1 of the positive (current intent) class drives model selection.
        f1 = f1_score(y_test, y_pred, pos_label=intent)
        print("f1:", f1)
        if f1 > intent_best_model['f1']:
            intent_best_model['model'] = model
            intent_best_model['f1'] = f1

        print("\nconfusion matrix\n", cf_matrix)

    best_model = intent_best_model['model']

    # File name parts: the estimator's class name and up to four digits of the
    # F1 score taken after the "0." prefix of its string form.
    # NOTE: the slice is kept for compatibility with existing file names; an
    # F1 of exactly 1.0 therefore yields the tag "0".
    best_model_name = type(best_model).__name__
    f1_tag = str(intent_best_model['f1'])[2:6]

    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(vectorizers_dir, exist_ok=True)

    # Persist the best-scoring estimator -- not the loop variable `model`,
    # which is simply the last estimator that was fitted.
    filename_model = models_dir + intent + '_' + best_model_name + "_" + f1_tag + ".joblib"
    joblib.dump(best_model, filename_model)

    filename_vectorizer = vectorizers_dir + intent + '_' + best_model_name + "_" + f1_tag + ".joblib"
    joblib.dump(intent_vectorizer, filename_vectorizer)

greeting rows: 534 	others rows: 5586
intent
others      5586
greeting     534
Name: count, dtype: int64



LogisticRegression()
              precision    recall  f1-score   support

    greeting       1.00      0.52      0.68       104
      others       0.96      1.00      0.98      1120

    accuracy                           0.96      1224
   macro avg       0.98      0.76      0.83      1224
weighted avg       0.96      0.96      0.95      1224

f1: 0.6835443037974683

confusion matrix
 [[  54   50]
 [   0 1120]]



MLPClassifier()
              precision    recall  f1-score   support

    greeting       0.99      1.00      1.00       104
      others       1.00      1.00      1.00      1120

    accuracy                           1.00      1224
   macro avg       1.00      1.00      1.00      1224
weighted avg       1.00      1.00      1.00      1224

f1: 0.9952153110047847

confusion matrix
 [[ 104    0]
 [   1 1119]]



KNeighborsClassifier()
              precision    recall 






QuadraticDiscriminantAnalysis()
              precision    recall  f1-score   support

    greeting       0.78      0.68      0.73       104
      others       0.97      0.98      0.98      1120

    accuracy                           0.96      1224
   macro avg       0.88      0.83      0.85      1224
weighted avg       0.95      0.96      0.96      1224

f1: 0.7282051282051282

confusion matrix
 [[  71   33]
 [  20 1100]]



DecisionTreeClassifier()
              precision    recall  f1-score   support

    greeting       1.00      0.62      0.76       104
      others       0.97      1.00      0.98      1120

    accuracy                           0.97      1224
   macro avg       0.98      0.81      0.87      1224
weighted avg       0.97      0.97      0.96      1224

f1: 0.7619047619047619

confusion matrix
 [[  64   40]
 [   0 1120]]



RandomForestClassifier()
              precision    recall  f1-score   support

    greeting       1.00      0.62      0.77       104
      ot






AdaBoostClassifier()
              precision    recall  f1-score   support

    greeting       0.76      0.95      0.85       104
      others       1.00      0.97      0.98      1120

    accuracy                           0.97      1224
   macro avg       0.88      0.96      0.91      1224
weighted avg       0.98      0.97      0.97      1224

f1: 0.8461538461538461

confusion matrix
 [[  99    5]
 [  31 1089]]



GaussianNB()
              precision    recall  f1-score   support

    greeting       0.97      0.98      0.98       104
      others       1.00      1.00      1.00      1120

    accuracy                           1.00      1224
   macro avg       0.98      0.99      0.99      1224
weighted avg       1.00      1.00      1.00      1224

f1: 0.9760765550239234

confusion matrix
 [[ 102    2]
 [   3 1117]]
live_agent rows: 628 	others rows: 5492
intent
others        5492
live_agent     628
Name: count, dtype: int64



LogisticRegression()
              precision    recall






QuadraticDiscriminantAnalysis()
              precision    recall  f1-score   support

  live_agent       0.65      1.00      0.79       131
      others       1.00      0.94      0.97      1093

    accuracy                           0.94      1224
   macro avg       0.82      0.97      0.88      1224
weighted avg       0.96      0.94      0.95      1224

f1: 0.7867867867867868

confusion matrix
 [[ 131    0]
 [  71 1022]]



DecisionTreeClassifier()
              precision    recall  f1-score   support

  live_agent       0.99      0.99      0.99       131
      others       1.00      1.00      1.00      1093

    accuracy                           1.00      1224
   macro avg       1.00      1.00      1.00      1224
weighted avg       1.00      1.00      1.00      1224

f1: 0.9923664122137404

confusion matrix
 [[ 130    1]
 [   1 1092]]



RandomForestClassifier()
              precision    recall  f1-score   support

  live_agent       0.98      0.99      0.98       131
      ot






AdaBoostClassifier()
              precision    recall  f1-score   support

  live_agent       0.99      0.99      0.99       131
      others       1.00      1.00      1.00      1093

    accuracy                           1.00      1224
   macro avg       1.00      1.00      1.00      1224
weighted avg       1.00      1.00      1.00      1224

f1: 0.9923664122137404

confusion matrix
 [[ 130    1]
 [   1 1092]]



GaussianNB()
              precision    recall  f1-score   support

  live_agent       0.73      0.97      0.83       131
      others       1.00      0.96      0.98      1093

    accuracy                           0.96      1224
   macro avg       0.86      0.96      0.90      1224
weighted avg       0.97      0.96      0.96      1224

f1: 0.8300653594771242

confusion matrix
 [[ 127    4]
 [  48 1045]]
coe rows: 413 	others rows: 5707
intent
others    5707
coe        413
Name: count, dtype: int64



LogisticRegression()
              precision    recall  f1-score   su






QuadraticDiscriminantAnalysis()
              precision    recall  f1-score   support

         coe       0.84      1.00      0.91        83
      others       1.00      0.99      0.99      1141

    accuracy                           0.99      1224
   macro avg       0.92      0.99      0.95      1224
weighted avg       0.99      0.99      0.99      1224

f1: 0.9120879120879121

confusion matrix
 [[  83    0]
 [  16 1125]]



DecisionTreeClassifier()
              precision    recall  f1-score   support

         coe       1.00      1.00      1.00        83
      others       1.00      1.00      1.00      1141

    accuracy                           1.00      1224
   macro avg       1.00      1.00      1.00      1224
weighted avg       1.00      1.00      1.00      1224

f1: 1.0

confusion matrix
 [[  83    0]
 [   0 1141]]



RandomForestClassifier()
              precision    recall  f1-score   support

         coe       1.00      1.00      1.00        83
      others       1.00






AdaBoostClassifier()
              precision    recall  f1-score   support

         coe       1.00      1.00      1.00        83
      others       1.00      1.00      1.00      1141

    accuracy                           1.00      1224
   macro avg       1.00      1.00      1.00      1224
weighted avg       1.00      1.00      1.00      1224

f1: 1.0

confusion matrix
 [[  83    0]
 [   0 1141]]



GaussianNB()
              precision    recall  f1-score   support

         coe       0.72      1.00      0.84        83
      others       1.00      0.97      0.99      1141

    accuracy                           0.97      1224
   macro avg       0.86      0.99      0.91      1224
weighted avg       0.98      0.97      0.98      1224

f1: 0.8383838383838383

confusion matrix
 [[  83    0]
 [  32 1109]]
