In [9]:
from datasets import load_dataset
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("Tobi-Bueck/customer-support-tickets")

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib 

In [11]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['subject', 'body', 'answer', 'type', 'queue', 'priority', 'language', 'version', 'tag_1', 'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6', 'tag_7', 'tag_8'],
        num_rows: 61765
    })
})


In [12]:
df = ds['train'].to_pandas()
df.info

<bound method DataFrame.info of                                                  subject  \
0                        Wesentlicher Sicherheitsvorfall   
1                                     Account Disruption   
2      Query About Smart Home System Integration Feat...   
3                      Inquiry Regarding Invoice Details   
4      Question About Marketing Agency Software Compa...   
...                                                  ...   
61760     Assistance Needed for IFTTT Docker Integration   
61761        Bitten um Unterstützung bei der Integration   
61762                                               None   
61763            Hilfe bei digitalen Strategie-Problemen   
61764  Optimierung Ihrer Datenanalyse-Plattform erlei...   

                                                    body  \
0      Sehr geehrtes Support-Team,\n\nich möchte eine...   
1      Dear Customer Support Team,\n\nI am writing to...   
2      Dear Customer Support Team,\n\nI hope this mes...   
3      

In [13]:
df_en = df[df['language'] == 'en'].copy()

In [14]:
df_en['queue'].value_counts()

queue
Technical Support                  8149
Product Support                    5305
Customer Service                   4269
IT Support                         3333
Billing and Payments               2897
Returns and Exchanges              1402
Service Outages and Maintenance    1106
Sales and Pre-Sales                 843
Human Resources                     553
General Inquiry                     404
Name: count, dtype: int64

In [15]:
top_label=df_en['queue'].value_counts().head(10).index.tolist()
# print(top_label)

df_en['label'] = np.where(df_en['queue'].isin(top_label), df_en['queue'], 'other')

In [16]:
df_en['label'].value_counts()

label
Technical Support                  8149
Product Support                    5305
Customer Service                   4269
IT Support                         3333
Billing and Payments               2897
Returns and Exchanges              1402
Service Outages and Maintenance    1106
Sales and Pre-Sales                 843
Human Resources                     553
General Inquiry                     404
Name: count, dtype: int64

In [17]:
print(len(df_en))

28261


In [18]:
train, temp = train_test_split(df_en, test_size=0.2, stratify=df_en['label'], random_state=42)
valid, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=42)

print(f"Train rows: {len(train)}")
print(f"Valid rows: {len(valid)}")
print(f"Test rows:  {len(test)}")

Train rows: 22608
Valid rows: 2826
Test rows:  2827


In [26]:
test['text'] = test['subject'].fillna('')+ '\n' + test['body'].fillna('')
valid['text'] = valid['subject'].fillna('')+ '\n' + valid['body'].fillna('')
train['text'] = train['subject'].fillna('')+ '\n' + train['body'].fillna('')

x_train = train['text']
y_train = train['label']

x_val = valid['text']
y_val = valid['label']

x_test = test['text']
y_test = test['label']

In [28]:
print(x_train)

44125    Inquiry on Integration Options for SaaS Projec...
7424     \nDear Customer Support, I have serious concer...
1175     Enhancement of Marketing Platforms\nCustomer S...
43876    Inquiry Regarding Digital Marketing Strategy C...
20481    Update User Interface\nRequest to update the u...
                               ...                        
15610    Support Inquiry for System Outage\nEncountered...
19565    Problem with Analytics\nThe analytics dashboar...
44659    Marketing Firm Faces Reduced Brand Engagement\...
12973    Query on Gaming Product Digital Promotion Tact...
9374     Assistance Needed with SaaS Downtime\nOur proj...
Name: text, Length: 22608, dtype: object


In [29]:
print("--- Random Samples ---")
print(train[['text', 'label']].sample(3))

short_count = (train['text'].str.len() < 20).sum()
print(f"\nNumber of very short/empty texts: {short_count}")

--- Random Samples ---
                                                    text                 label
47040  Support for Integration Failures\nHere is a co...       Product Support
13790  Concerns About Securing Medical Data in a Hosp...     Technical Support
25729  Strategies for Promoting Digital Gamepad ASUS ...  Billing and Payments

Number of very short/empty texts: 3


In [30]:
x_train.head(100)

44125    Inquiry on Integration Options for SaaS Projec...
7424     \nDear Customer Support, I have serious concer...
1175     Enhancement of Marketing Platforms\nCustomer S...
43876    Inquiry Regarding Digital Marketing Strategy C...
20481    Update User Interface\nRequest to update the u...
                               ...                        
21542    Trouble with Zoho Integration\nExperienced dif...
7489     Request for Assistance with Data Delay Issues\...
20688    Request for Notion Integration Support\nCould ...
23187    Security of Healthcare Data\nIs there any poss...
58217    Enhance Digital Growth with Microsoft Azure an...
Name: text, Length: 100, dtype: object

In [41]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(x_train)
vectorizer.get_feature_names_out()
print(X_train_tfidf.shape)

(22608, 5891)


In [None]:
# clf = LogisticRegression(class_weight='balanced',max_iter=1000, random_state=42)
# clf.fit(X_train_tfidf, y_train)
# x_lr = vectorizer.transform(x_val)
# y_pred_lr = clf.predict(x_lr)
# lr_acc = accuracy_score (y_val, y_pred_lr)
# lr_f1 = f1_score(y_val, y_pred_lr, average='macro')

# print(f"Validation Accuracy: {lr_acc:.4f}")
# print(f"Validation Macro F1: {lr_f1:.4f}")

# print("\nReport")
# print(classification_report(y_val, y_pred_lr))

In [None]:
svm = LinearSVC(class_weight='balanced', max_iter=100, random_state=42)
svm.fit(X_train_tfidf, y_train) 
x_svm = vectorizer.transform(x_val)
y_pred_svm = svm.predict(x_svm)
svm_acc = accuracy_score (y_val, y_pred_svm)
svm_f1 = f1_score(y_val, y_pred_svm, average='macro')
print(f"Validation Accuracy: {svm_acc:.4f}")
print(f"Validation Macro F1: {svm_f1:.4f}")

print("\nReport")
print(classification_report(y_val, y_pred_svm))

Validation Accuracy: 0.5223
Validation Macro F1: 0.5052

--- Detailed Report ---
                                 precision    recall  f1-score   support

           Billing and Payments       0.78      0.82      0.80       289
               Customer Service       0.52      0.44      0.48       427
                General Inquiry       0.26      0.49      0.34        41
                Human Resources       0.39      0.70      0.50        56
                     IT Support       0.45      0.48      0.46       333
                Product Support       0.52      0.44      0.47       530
          Returns and Exchanges       0.38      0.59      0.46       140
            Sales and Pre-Sales       0.30      0.55      0.39        85
Service Outages and Maintenance       0.51      0.79      0.62       110
              Technical Support       0.61      0.47      0.53       815

                       accuracy                           0.52      2826
                      macro avg       0.4

In [43]:
x_test_tfidf = vectorizer.transform(x_test)
y_test_pred = svm.predict(x_test_tfidf)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f"FINAL TEST ACCURACY: {test_accuracy:.4f}")
print(f"FINAL TEST MACRO F1: {test_f1:.4f}")

print("\n--- Final Test Classification Report ---")
print(classification_report(y_test, y_test_pred))

FINAL TEST ACCURACY: 0.5239
FINAL TEST MACRO F1: 0.5264

--- Final Test Classification Report ---
                                 precision    recall  f1-score   support

           Billing and Payments       0.76      0.83      0.80       290
               Customer Service       0.46      0.38      0.42       427
                General Inquiry       0.33      0.78      0.46        40
                Human Resources       0.43      0.80      0.56        55
                     IT Support       0.42      0.46      0.44       334
                Product Support       0.53      0.40      0.45       531
          Returns and Exchanges       0.38      0.59      0.47       140
            Sales and Pre-Sales       0.40      0.71      0.51        84
Service Outages and Maintenance       0.51      0.75      0.60       111
              Technical Support       0.61      0.50      0.55       815

                       accuracy                           0.52      2827
                      ma

In [58]:
joblib.dump(vectorizer,'../models/SVM/vectorizer.pkl')
joblib.dump(svm, '../models/SVM/svm.pkl')

['../models/SVM/svm.pkl']