In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("dataset-tickets-multi-lang-4-20k.csv")

# Check columns, missing values, and sample data
print(df.info())
print(df.head())
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   subject   18539 non-null  object
 1   body      19998 non-null  object
 2   answer    19996 non-null  object
 3   type      20000 non-null  object
 4   queue     20000 non-null  object
 5   priority  20000 non-null  object
 6   language  20000 non-null  object
 7   tag_1     20000 non-null  object
 8   tag_2     19954 non-null  object
 9   tag_3     19905 non-null  object
 10  tag_4     18461 non-null  object
 11  tag_5     13091 non-null  object
 12  tag_6     7351 non-null   object
 13  tag_7     3928 non-null   object
 14  tag_8     1907 non-null   object
dtypes: object(15)
memory usage: 2.3+ MB
None
                                             subject  \
0  Unvorhergesehener Absturz der Datenanalyse-Pla...   
1                           Customer Support Inquiry   
2                      Data Ana

In [6]:
import re
import spacy
import pandas as pd

# Load language models (English/German)
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")

def clean_text(text, language="en"):
    # Handle missing/NaN values
    if pd.isna(text):
        return ""
    
    # Convert to string if not already
    text = str(text)
    
    # Remove URLs, special characters, numbers
    text = re.sub(r'http\S+|www\S+|@\w+|[^a-zA-ZäöüßÄÖÜ ]', '', text)
    
    # Lemmatize and remove stopwords
    if language == "en":
        doc = nlp_en(text)
    else:
        doc = nlp_de(text)
    
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop]
    return " ".join(tokens)


In [7]:
# First, fill NaN values in body column
df['body'] = df['body'].fillna("")

# Now apply cleaning based on language column
df["cleaned_text"] = df.apply(lambda row: clean_text(row["body"], language=row["language"]), axis=1)

# Check the results
print("\nSample of cleaned texts:")
print(df[['body', 'cleaned_text']].head())

# Print statistics about cleaning
print("\nCleaning Statistics:")
print(f"Total rows: {len(df)}")
print(f"Rows with empty cleaned text: {(df['cleaned_text'] == '').sum()}")
print(f"Average cleaned text length: {df['cleaned_text'].str.len().mean():.2f} characters")


Sample of cleaned texts:
                                                body  \
0  Die Datenanalyse-Plattform brach unerwartet ab...   
1  Seeking information on digital strategies that...   
2  I am contacting you to request information on ...   
3  Ein Medien-Daten-Sperrverhalten trat aufgrund ...   
4  Dear Customer Support, I am reaching out to in...   

                                        cleaned_text  
0  datenanalyseplattform brechen unerwartet speic...  
1  seek information digital strategy aid brand gr...  
2  contact request information datum analytic too...  
3  mediendatensperrverhalten treten aufgrund uner...  
4  dear customer support reach inquire security p...  

Cleaning Statistics:
Total rows: 20000
Rows with empty cleaned text: 2
Average cleaned text length: 267.37 characters


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

# Combine tags into a list (ignore empty tags)
tags = df.apply(lambda row: [t for t in row[["tag_1", "tag_2","tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8"]].dropna()], axis=1)

# Binarize tags
mlb = MultiLabelBinarizer()
tag_matrix = pd.DataFrame(mlb.fit_transform(tags), columns=mlb.classes_)

# Merge with original data
df = pd.concat([df, tag_matrix], axis=1)

In [9]:
df["text"] = df["subject"] + " " + df["cleaned_text"]

In [10]:
# One-hot encode 'type' and 'queue'
df = pd.get_dummies(df, columns=["type", "queue"])

# Map priority to ordinal values
priority_map = {"low": 0, "medium": 1, "high": 2}
df["priority"] = df["priority"].map(priority_map)

In [12]:
df.shape

(20000, 1383)

In [18]:
display(df["text"])
display(df["cleaned_text"])

0        Unvorhergesehener Absturz der Datenanalyse-Pla...
1        Customer Support Inquiry seek information digi...
2        Data Analytics for Investment contact request ...
3        Krankenhaus-Dienstleistung-Problem mediendaten...
4        Security dear customer support reach inquire s...
                               ...                        
19995    Assistance Needed for IFTTT Docker Integration...
19996    Bitten um Unterstützung bei der Integration ge...
19997                                                  NaN
19998    Hilfe bei digitalen Strategie-Problemen qualit...
19999    Optimierung Ihrer Datenanalyse-Plattform erlei...
Name: text, Length: 20000, dtype: object

0        datenanalyseplattform brechen unerwartet speic...
1        seek information digital strategy aid brand gr...
2        contact request information datum analytic too...
3        mediendatensperrverhalten treten aufgrund uner...
4        dear customer support reach inquire security p...
                               ...                        
19995    face integration problem ifttt docker cause di...
19996    geehrt kundenservice integrationsunterstützung...
19997    hello customer support inquire billing option ...
19998    qualität digital strategiebearbeitungen negati...
19999    geehrt customer supportteam schreiben erkunden...
Name: cleaned_text, Length: 20000, dtype: object

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)
text_embeddings = tfidf.fit_transform(df["text"])
X_text = pd.DataFrame(text_embeddings.toarray(), columns=tfidf.get_feature_names_out())

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
maintenance_tags = ["Hardware", "Crash", "Outage", "Security"]
df["requires_maintenance"] = df[maintenance_tags].any(axis=1).astype(int)

In [None]:
X = pd.concat([X_text, df[["priority", "type_Incident", "type_Request", ...]]], axis=1)
y = df["requires_maintenance"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)