In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import re
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

from ydata_profiling import ProfileReport

%matplotlib inline

# Template for temporarily muting pandas' chained-assignment warning.
# No code runs between the two calls below, so after this cell the option is
# back at "warn" and the warning stays active for the rest of the notebook.
pd.set_option("mode.chained_assignment", None)  # mute the warning

# (code that would trigger the warning belongs here)

pd.set_option("mode.chained_assignment", "warn")  # restore the default, 'warn'

In [None]:
 
# Explicit dtypes for the CSV import, grouped by target dtype so that each
# column name appears exactly once.
columns_by_dtype = {
    "str": [
        "number",
        "closed",
        "case",
        "description",
        "due_date",
        "first_response_time",
        "opened",
        "comments",
        "cause",
        "close_notes",
    ],
    "category": [
        "case_type",
        "account",
        "contact",
        "created_by",
        "business_service",
        "business_service_activity",
        "assigned_to",
        "assignment_group",
        "auto_close",
        "impact",
        "priority",
        "urgency",
        "escalation",
        "case_cause",
        "resolution_code",
        "problem",
        "sla_has_breached",
        "created_by_group",
    ],
    "float": [
        "time_worked",
        "business_percentage",
        "duration",
        "openedToClosed",
    ],
    "int": [
        "reassignment_count",
    ],
}

# Flatten the groups into the {column: dtype} mapping that read_csv expects.
dtype_dict = {
    column_name: dtype_name
    for dtype_name, column_names in columns_by_dtype.items()
    for column_name in column_names
}

# Load the CSV into a DataFrame using the dtypes declared above.
df = pd.read_csv('Data/data_new.csv', dtype=dtype_dict)

# Quick overview: dimensions, column labels and the frame itself.
print(df.shape)
print(df.columns)
display(df)

In [None]:
# profile = ProfileReport(df, title="Profiling Report")
# profile.to_notebook_iframe()
# profile.to_file(f"./html/Profiling Report Overall.html")

In [None]:
# Share of all rows (in percent) per business service, limited to the ten
# largest shares.
top_service_share = (df["business_service"].value_counts() / len(df) * 100).head(10)

ax = top_service_share.plot(kind="barh")

# Write the percentage value at the end of each bar.
for bar_position, share in enumerate(top_service_share):
    ax.text(share, bar_position, str(round(share, 2)) + '%')

ax.set_title("Top 10 Business Service")
ax.set_xlabel("Prozent")
ax.set_ylabel("Business Service")
plt.show()

In [None]:
# Select the free-text columns description, cause and close_notes.
# (The short description column "case" is NOT part of this selection.)
#
# .copy() makes df_text an independent DataFrame. Without it df_text is a
# slice of df, and assigning to its columns later on triggers pandas'
# SettingWithCopyWarning.
df_text = df[['description', 'cause', 'close_notes']].copy()

display(df_text)

In [None]:
# Matches every character that is neither a Unicode word character nor
# whitespace, plus "_" (which \w would otherwise keep). Raw string, so "\w" and
# "\s" reach the regex engine without an invalid-escape SyntaxWarning.
_NON_WORD_PATTERN = re.compile(r"[^\w\s]|_")


def _clean_caption(text):
    """Normalise a single text value; non-string values (e.g. NaN) pass through unchanged."""
    if not isinstance(text, str):
        return text
    # Newlines are whitespace and would survive the pattern, so they are
    # replaced explicitly. Hyphens are covered by the pattern itself.
    return _NON_WORD_PATTERN.sub(" ", text.replace("\n", " ").lower())


def process_captions(data, column):
    """Clean the text in ``data[column]`` in place and return the cleaned column.

    Each string value is lower-cased; newlines, hyphens, underscores and every
    other character that is not a letter, digit or whitespace are replaced by
    a single space each. Letters and digits are matched Unicode-aware, so
    non-ASCII letters such as the German umlauts are kept instead of being
    turned into spaces (which used to split words like "für" into "f r").

    Args:
        data: DataFrame holding the text column. It is modified in place.
        column: Label of the column to clean.

    Returns:
        The cleaned column, i.e. ``data[column]`` after the update.
    """
    data[column] = data[column].apply(_clean_caption)
    return data[column]

# Cast the text columns to str and clean them with process_captions.
# DataFrame.astype returns a new DataFrame, so the column assignments below
# operate on an independent frame rather than on a slice of df (this avoids
# SettingWithCopyWarning) and no extra .copy() calls are needed.
text_columns = ['description', 'cause', 'close_notes']
df_text = df_text.astype({name: str for name in text_columns})

for name in text_columns:
    df_text[name] = process_captions(df_text, name)

display(df_text)

In [None]:
import requests
from sklearn.feature_extraction.text import TfidfVectorizer

STOPWORDS_URL = "https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt"
# Number of highest-scoring terms kept per text column.
# NOTE(review): names and earlier comments in this cell said "top 50", but the
# code has always taken 25 - confirm which value is intended.
TOP_N_WORDS = 25

# Download the German stop-word list (one word per line). Fail loudly on HTTP
# errors instead of silently using an error page as the word list, and do not
# hang forever if the host is unreachable.
# NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords` import
# from the first cell.
response = requests.get(STOPWORDS_URL, timeout=30)
response.raise_for_status()
stopwords = [word.strip() for word in response.text.splitlines() if word.strip()]
# Add common words to the stop words
stopwords.extend(["nan", "frau", "herr", "ch", "bitte", "st"])

# Top terms per text column, collected first and combined into one DataFrame
# after the loop (columns with fewer terms are padded with NaN).
top_words_by_column = {}

for column in df_text.columns:
    tfidf = TfidfVectorizer(stop_words=stopwords)
    text = tfidf.fit_transform(df_text[column])

    # Prefix every term with the column name, e.g. "cause_<term>".
    feature_names = pd.Index([column + "_" + term for term in tfidf.get_feature_names_out()])

    # Sum the TF-IDF weights per term directly on the sparse matrix; converting
    # the full documents x vocabulary matrix to a dense array is not needed.
    column_sums = pd.Series(
        np.asarray(text.sum(axis=0)).ravel(), index=feature_names
    ).sort_values(ascending=False)

    # Names of the TOP_N_WORDS terms with the highest summed weight
    top_50_columns = column_sums.head(TOP_N_WORDS).index
    top_words_by_column[column] = pd.Series(list(top_50_columns))

    # Densify only the selected term columns, in ranking order
    top_positions = feature_names.get_indexer(top_50_columns)
    minimized_df = pd.DataFrame(text[:, top_positions].toarray(), columns=top_50_columns)

    # save minimized_df to csv
    minimized_df.to_csv('Data/VectorizedText_' + column + '.csv', index=False)

# One column per text column, one row per rank. The name `top_50_words` is kept
# for later cells even though it holds TOP_N_WORDS entries per column.
top_50_words = pd.DataFrame(top_words_by_column)