In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import re
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

from ydata_profiling import ProfileReport

%matplotlib inline

# Switch pandas' chained-assignment check (SettingWithCopyWarning) off ...
pd.options.mode.chained_assignment = None  # default='warn'

# your code that triggers the warning goes here
# NOTE(review): no code sits between the two option changes, so nothing is
# actually silenced; the assignment below leaves the warning enabled for
# every later cell.

# ... and back on again.
pd.options.mode.chained_assignment = 'warn'  # set it back to the default value

In [None]:
 
# Explicit dtypes for pd.read_csv, grouped by target dtype:
#   - free text, identifiers and timestamps are read as plain strings,
#   - low-cardinality fields are read as pandas categoricals,
#   - numeric measures are read as floats,
#   - the reassignment counter is read as an integer.
dtype_dict = {
    **dict.fromkeys(
        [
            "number", "closed", "case", "description", "due_date",
            "first_response_time", "opened", "comments", "cause", "close_notes",
        ],
        "str",
    ),
    **dict.fromkeys(
        [
            "case_type", "account", "contact", "created_by", "business_service",
            "business_service_activity", "assigned_to", "assignment_group",
            "auto_close", "impact", "priority", "urgency", "escalation",
            "case_cause", "resolution_code", "problem", "sla_has_breached",
            "created_by_group",
        ],
        "category",
    ),
    **dict.fromkeys(
        ["time_worked", "business_percentage", "duration", "openedToClosed"],
        "float",
    ),
    "reassignment_count": "int",
}

# Load the case export, applying the explicit column dtypes from dtype_dict.
df = pd.read_csv('Data/data.csv', dtype=dtype_dict)

# Replace missing text with empty strings in both text columns, then prepend
# the short description to the long description (separated by one space).
for text_column in ('short_description', 'description'):
    df[text_column] = df[text_column].fillna('')
short_text = df['short_description']
df['description'] = short_text.str.cat(df['description'], sep=" ")

# Quick overview: dimensions, column names and the frame itself.
print(df.shape)
print(df.columns)
display(df)

In [None]:
# profile = ProfileReport(df, title="Profiling Report")
# profile.to_notebook_iframe()
# profile.to_file(f"./html/Profiling Report Overall.html")

In [None]:
# Select the case number plus the free-text columns used for text mining:
# description (which already has the short description prepended), cause and
# close_notes.
# .copy() makes df_text an independent frame, so the column assignments in the
# following cells modify df_text itself and do not raise pandas'
# SettingWithCopyWarning about writing to a slice of df.
df_text = df[['number', 'description', 'cause', 'close_notes']].copy()

display(df_text)

In [None]:
def process_captions(data, column):
    """Normalise the text in ``data[column]`` in place and return the cleaned column.

    Every value is cleaned in a single pass: newlines and hyphens become
    spaces, the text is lower-cased, and each character that is neither
    whitespace nor a letter from ``a-z`` / ``äöü`` is replaced by a space
    (digits, punctuation and letters such as 'ß' or 'é' are therefore
    blanked out).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; the column is overwritten in place.
    column : str
        Name of the column to clean. Every value must be a ``str``; other
        values (e.g. NaN) raise ``AttributeError``.

    Returns
    -------
    pandas.Series
        ``data[column]`` after cleaning.
    """
    # Raw string literal: '\s' inside a normal string is an invalid escape
    # sequence and triggers a SyntaxWarning on current Python versions.
    disallowed = re.compile(r'[^a-zA-ZäÄöÖüÜ\s]')

    def _clean(text):
        text = text.replace('\n', ' ').replace('-', ' ').lower()
        return disallowed.sub(' ', text)

    # One traversal of the column instead of four consecutive apply() passes.
    data[column] = data[column].apply(_clean)
    return data[column]

# Free-text columns of df_text that get normalised.
text_columns = ['description', 'cause', 'close_notes']

# First make sure every value is a string (missing values become 'nan') ...
for text_column in text_columns:
    df_text[text_column] = df_text[text_column].astype(str).copy()

# ... then run the shared cleaning routine over each column.
for text_column in text_columns:
    df_text[text_column] = process_captions(df_text, text_column).copy()

display(df_text)

In [None]:
import requests

# Download the stopwords-iso German stop-word list (one word per line).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being used as stop words.
stopwords_response = requests.get(
    "https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt",
    timeout=30,
)
stopwords_response.raise_for_status()

# splitlines() + strip() drops the empty trailing entry and stray '\r' characters.
# NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords` import
# from the first cell for the rest of the notebook.
stopwords = [line.strip() for line in stopwords_response.text.splitlines() if line.strip()]

# Add common words to stopwords
# NOTE(review): with TfidfVectorizer's default settings tokens are single,
# lower-cased words, so the multi-word entries (e.g. "sehr geehrte") and the
# capitalised "Jahre" presumably never match a token - confirm and clean up.
stopwords.extend(["nan", "frau", "herr", "name", "ch", "bitte", "und", "die", "das", "ist", "zu", "den", "der", "es", "ein", "sie", 
                        "nicht", "von", "mit", "dem", "sich", "auf", "für", "an", "sind", "des", "wird", "dass", "im", "auch", "als", 
                        "an", "nach", "wie", "aber", "aus", "bei", "durch", "hat", "man", "noch", "einem", "über", "einer", "um", "am", 
                        "ohne", "zwischen", "so", "nur", "zum", "kann", "vor", "dieser", "bis", "habe", "wenn", "sein", "wird", "wurde", 
                        "können", "gegen", "dann", "müssen", "diese", "weil", "welche", "oder", "zwei", "eines", "mehr", "Jahre", "wieder", 
                        "keine", "hallo", "grüsse", "gruss", "vielen", "besten", "dank", "guten", "morgen", "tag", "freundlich", "liebe", 
                        "lieber", "sehr geehrte", "geehrter", "geehrte", "hi", "de", "guten tag", "beste", "herzliche", "liebe grüße", "vielen dank", 
                        "besten dank", "freundliche", "grüße", "danke", "siehe", "tel", "mail", "mehr", "à", "vous", "la", "le", "e", "en", "et", "mon", "je", 
                        "les", "pas", "que", "a", "c", "e", "compt", "est", "une", "il", "the", "at"])
from sklearn.feature_extraction.text import TfidfVectorizer

# Truncate each description at the first closing formula or quoted-mail marker.
# The markers are lower-case because process_captions has already lower-cased
# the text.
# NOTE(review): 'von:' can no longer match at this point - process_captions
# replaces every non-letter character (including ':') with a space. To strip
# quoted "Von: ..." mail headers, this cut has to run before that cleaning.
for marker in ('freundliche gr', 'beste gr', 'vielen dank', 'von:'):
    df_text['description'] = df_text['description'].apply(lambda x, m=marker: x.split(m)[0])

# Number of highest-weighted TF-IDF terms exported per text column.
TOP_TERMS_PER_COLUMN = 15

# Top terms per column. The frame is assembled after the loop so that a column
# with fewer than TOP_TERMS_PER_COLUMN distinct terms is padded with NaN
# instead of raising a length-mismatch error on assignment.
top_terms_by_column = {}

for column in df_text.columns[1:]:
    tfidf = TfidfVectorizer(stop_words=stopwords)
    text = tfidf.fit_transform(df_text[column])

    # Rank terms by their summed TF-IDF weight. The sum is taken on the sparse
    # matrix, so the full documents x vocabulary matrix is never densified.
    term_names = tfidf.get_feature_names_out()
    term_weights = np.asarray(text.sum(axis=0)).ravel()
    top_positions = np.argsort(-term_weights, kind='stable')[:TOP_TERMS_PER_COLUMN]

    # Prefix every term with its source column so the exported feature names
    # stay unique across the per-column files.
    top_50_columns = [column + "_" + term_names[i] for i in top_positions]
    top_terms_by_column[column] = pd.Series(top_50_columns, dtype=object)

    # Densify only the selected term columns.
    minimized_df = pd.DataFrame(text[:, top_positions].toarray(), columns=top_50_columns)

    # Put the case number first. Row i of the TF-IDF matrix corresponds to
    # row i of df_text, so the values are inserted by position, not by index.
    minimized_df.insert(0, 'number', df_text['number'].to_numpy())

    # save minimized_df to csv
    minimized_df.to_csv('Data/VectorizedText_' + column + '.csv', index=False)

# One column per text field, listing its top terms (variable name kept as-is;
# it holds TOP_TERMS_PER_COLUMN terms per column, not 50).
top_50_words = pd.DataFrame(top_terms_by_column)

In [None]:
# For topic modelling: reduce every text column to the words that belong to
# that column's highest-weighted TF-IDF terms and export the reduced text.

# Number of highest-weighted TF-IDF terms whose words are kept per column.
TOPIC_MODEL_TERMS_PER_COLUMN = 150

# Top terms per column; assembled into a frame after the loop so that columns
# with fewer distinct terms are padded with NaN instead of raising an error.
topic_terms_by_column = {}

for column in df_text.columns[1:]:
    tfidf = TfidfVectorizer(stop_words=stopwords)
    text = tfidf.fit_transform(df_text[column])

    # Rank terms by summed TF-IDF weight without densifying the sparse matrix.
    term_names = tfidf.get_feature_names_out()
    term_weights = np.asarray(text.sum(axis=0)).ravel()
    top_positions = np.argsort(-term_weights, kind='stable')[:TOPIC_MODEL_TERMS_PER_COLUMN]
    top_50_columns = [term_names[i] for i in top_positions]
    topic_terms_by_column[column] = pd.Series(top_50_columns, dtype=object)

    # Keep only the words that are among THIS column's top terms. (Filtering
    # every column against the 'description' vocabulary would strip cause and
    # close_notes down to words that happen to be frequent in descriptions.)
    # A set gives constant-time membership checks.
    kept_words = set(top_50_columns)
    df_text2 = df_text[['number', column]].copy()
    df_text2[column] = df_text2[column].apply(
        lambda x, keep=kept_words: ' '.join(word for word in x.split() if word in keep)
    )

    # save the reduced text to csv
    df_text2.to_csv('Data/TopicModel_VectorizedText_' + column + '.csv', index=False)

# One column per text field, listing its retained terms (variable name kept
# as-is; it holds TOPIC_MODEL_TERMS_PER_COLUMN terms per column, not 50).
top_50_words = pd.DataFrame(topic_terms_by_column)

    # count number of empty values
    
