In [None]:
#! pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.3.1-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Using cached wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.4 kB)
Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl (30.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.4/30.4 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading smart_open-7.3.1-py3-none-any.whl (61 kB)
Using cached wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl (39

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import gensim.downloader as api
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv('customer_support_ticket.csv')
df.head()

Unnamed: 0,ticket_id,ticket_text,department
0,1,Cannot login to my account after password reset,technical
1,2,The website is loading very slowly on my browser,technical
2,3,The app crashes whenever I try to upload photos,technical
3,4,Videos are not playing properly on my device,technical
4,5,The checkout process gave me an error,technical


In [3]:
print(f"Dataset shape: {df.shape}")
print("\nClass distribution:")
print(df['department'].value_counts())
print("\nSample tickets per department:")
for dept in df['department'].unique():
    sample_idx = df[df['department'] == dept].index[0]
    print(f"\n{dept.upper()}: {df.loc[sample_idx, 'ticket_text']}")

Dataset shape: (90, 3)

Class distribution:
department
technical    30
account      30
billing      30
Name: count, dtype: int64

Sample tickets per department:

TECHNICAL: Cannot login to my account after password reset

ACCOUNT: How do I update my shipping address for my order?

BILLING: My payment was charged twice for one order


In [9]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


# Lemmatizer relies of part of speech to help
def get_wordnet_pos(treebank_tag):
  '''
  Translate nltk POS to wordnet tags
  '''
  if treebank_tag.startswith('J'):
      return wordnet.ADJ
  elif treebank_tag.startswith('V'):
      return wordnet.VERB
  elif treebank_tag.startswith('N'):
      return wordnet.NOUN
  elif treebank_tag.startswith('R'):
      return wordnet.ADV
  else:
      return wordnet.NOUN


def basic_preprocess(text):
   """Basic preprocessing function for text."""
   # Convert to lowercase
   text = text.lower()
  
   # Remove special characters and numbers
   text = re.sub(r'[^a-zA-Z\s]', '', text)
  
   # Return cleaned text
   return text


def advanced_preprocess(text):
   """Advanced preprocessing with tokenization, stopword removal, and lemmatization."""
   # Basic cleaning
   text = basic_preprocess(text)
  
   # Tokenize
   tokens = nltk.word_tokenize(text)
  
   # Tag with pos
   tokens_tagged = pos_tag(tokens)
   pos_tokens = [(word[0], get_wordnet_pos(word[1])) for word in tokens_tagged]
  
   # Remove stopwords and lemmatize
   cleaned_tokens = [lemmatizer.lemmatize(token[0], token[1]) for token in pos_tokens if token[0] not in stop_words and len(token[0]) > 1]
  
   # Return cleaned tokens
   return ' '.join(cleaned_tokens)


# Apply preprocessing to the dataset
df['cleaned_text'] = df['ticket_text'].apply(basic_preprocess)
df['lemmatized_text'] = df['ticket_text'].apply(advanced_preprocess)
df['tokens'] = df['lemmatized_text'].apply(lambda x: x.split())


# Show the preprocessing results for a sample ticket
sample_idx = 1
print(f"Original: {df.loc[sample_idx, 'ticket_text']}")
print(f"Cleaned: {df.loc[sample_idx, 'cleaned_text']}")
print(f"Lemmatized: {df.loc[sample_idx, 'lemmatized_text']}")

Original: The website is loading very slowly on my browser
Cleaned: the website is loading very slowly on my browser
Lemmatized: website load slowly browser


[nltk_data] Downloading package punkt to /Users/hank/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/hank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hank/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
   df['ticket_text'],
   df['department'],
   test_size=0.3,
   random_state=42,
   stratify=df['department']  # Ensure balanced classes in both sets
)


print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"Class distribution in training set: \n{y_train.value_counts()}")
print(f"Class distribution in testing set: \n{y_test.value_counts()}")

Training set size: 63
Testing set size: 27
Class distribution in training set: 
department
technical    21
account      21
billing      21
Name: count, dtype: int64
Class distribution in testing set: 
department
account      9
technical    9
billing      9
Name: count, dtype: int64


In [11]:
# Bag of Words vectorizer
count_vectorizer = CountVectorizer(
   preprocessor=advanced_preprocess,
   lowercase=False,  # Already done in preprocessing
   min_df=2,  # Ignore terms that appear in fewer than 2 documents
   max_df=0.95, # Ignore terms that appear in more than 95% of documents
   ngram_range=(1, 2) # Include both single words and pairs of consecutive words
)


# TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
   preprocessor=advanced_preprocess,
   lowercase=False,
   min_df=2,
   max_df=0.95,
   ngram_range=(1, 2)
)


# Apply vectorizers to training data
X_train_counts = count_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)


# Get feature information
count_features = count_vectorizer.get_feature_names_out()
tfidf_features = tfidf_vectorizer.get_feature_names_out()


print(f"Bag of Words features: {len(count_features)}")
print(f"TF-IDF features: {len(tfidf_features)}")
print(f"Sample BoW features: {count_features[:10]}")
print(f"Sample TF-IDF features (including bigrams): {[f for f in tfidf_features[:20] if ' ' in f][:5]}")

Bag of Words features: 57
TF-IDF features: 57
Sample BoW features: ['access' 'account' 'account password' 'account show' 'add' 'address'
 'app' 'billing' 'browser' 'change']
Sample TF-IDF features (including bigrams): ['account password', 'account show', 'get error']


In [14]:
# Train a Word2Vec model on our dataset
# Note: In a real-world scenario, you'd use a much larger corpus
# or pre-trained embeddings for better results
w2v_model = Word2Vec(
   df['tokens'],
   vector_size=100,  # Dimension of the embedding vectors
   window=5,  # Context window size
   min_count=1,  # Ignore words with fewer occurrences
   workers=4,  # Number of processors to use
   sg=1  # Skip-gram model (1) instead of CBOW (0)
)


# Function to create document vectors by averaging word vectors
def document_to_vector(tokens, model, vector_size=100):
   """Convert a document (list of tokens) to a vector using word embeddings."""
   # Initialize an empty vector
   doc_vector = np.zeros(vector_size)
  
   # Count valid tokens
   valid_token_count = 0
  
   # Sum up vectors for each token
   for token in tokens:
       if token in model.wv:
           doc_vector += model.wv[token]
           valid_token_count += 1
  
   # Average the vectors
   if valid_token_count > 0:
       doc_vector /= valid_token_count
      
   return doc_vector


def document_to_vector_pretrained(tokens, model, vector_size=300):
   """Convert a document (list of tokens) to a vector using word embeddings."""
   # Initialize an empty vector
   doc_vector = np.zeros(vector_size)
  
   # Count valid tokens
   valid_token_count = 0
  
   # Sum up vectors for each token
   for token in tokens:
       if token in model:
           doc_vector += model[token]
           valid_token_count += 1
  
   # Average the vectors
   if valid_token_count > 0:
       doc_vector /= valid_token_count
      
   return doc_vector


# Create document vectors for training and test sets
X_train_tokens = []
for text in X_train['ticket_text']:
   tokens = nltk.word_tokenize(text)
   for token in tokens:
      if token not in X_train_tokens:
         X_train_tokens.append(token)
X_test_tokens = []
for text in X_test['ticket_text']:
   tokens = nltk.word_tokenize(text)
   for token in tokens:
      if token not in X_train_tokens:
         X_train_tokens.append(token)
X_train_w2v = np.array([document_to_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v = np.array([document_to_vector(tokens, w2v_model) for tokens in X_test_tokens])


print(f"Word2Vec document vectors shape: {X_train_w2v.shape}")


# Alternatively, download and use pre-trained embeddings
# This takes more time but might give better results
try:
   # Attempt to download pre-trained embeddings (if internet is available)
   pretrained_model = api.load('word2vec-google-news-300')
   print("Pre-trained model loaded successfully.")
  
   # Create vectors using pre-trained embeddings
   X_train_pretrained = np.array([document_to_vector_pretrained(tokens, pretrained_model, 300)
                                   for tokens in X_train_tokens])
   X_test_pretrained = np.array([document_to_vector_pretrained(tokens, pretrained_model, 300)
                                  for tokens in X_test_tokens])
  
   print(f"Pre-trained document vectors shape: {X_train_pretrained.shape}")
   pretrained_available = True
except Exception as e:
   print(f"Pre-trained embeddings could not be loaded: {e}")
   pretrained_available = False

KeyError: 'ticket_text'