In [19]:
import pandas as pd
import numpy as np
import smogn
import re
import random
from smogn import smoter
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob
from textblob_nl import PatternTagger, PatternAnalyzer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
import optuna
import optuna.visualization as vis

In [1135]:
# Load the raw data: a semicolon-separated file whose first row holds the column names.
df = pd.read_csv('Data cleanedv2.csv', sep=';', header=0)

# Dataset cleaning and pre-processing for max. 4 chats

## Data cleaning and preprocessing

### Extracting Travel Month feature from Arrival

In [1144]:
# Parse the arrival date (day/month/year) once and reuse the parsed series.
arrival_dates = pd.to_datetime(df['Aankomst'], format='%d/%m/%Y')
df['Aankomst'] = arrival_dates

# The 'Travel Month' feature is the calendar month (1-12) of the arrival date.
df['Travel Month'] = arrival_dates.dt.month

### Dropping Features that are not used, Duplicates and Missing Values

In [1136]:
# Remove the columns that are not used in the remaining steps of this notebook.
df.drop(
    ['Received Messages', 'Aankomst', 'Vertrek', 'Boekingsnummer'],
    axis=1,
    inplace=True,
)

In [1137]:
def drop_na(dataset, column):
    """
    Removes, in place, every row of the dataset whose value in the given column is missing.

    Parameters:
    dataset: The DataFrame to modify; it is changed in place (df)
    column: The name of the column that is checked for missing values (str)

    Returns:
    None
    """
    columns_to_check = [column]
    dataset.dropna(axis=0, subset=columns_to_check, inplace=True)

# Discard rows that lack a grade or chat text.
for required_column in ('Grade', 'Gespreksdata'):
    drop_na(df, required_column)

In [None]:
# Remove rows that are exact duplicates of an earlier row (the first occurrence is kept).
df.drop_duplicates(keep='first', inplace=True)

### Cleaning Text Feature

In [1139]:
# Stopword set: the wordcloud defaults extended with additional lower-case words.
extra_stopwords = {
    "en", "is", "zijn", "was", "oke", "ok", "of", "dat", "voor",
    "ons", "naar", "maar", "dus", "die", "bij", "een", "hebben", "dan", "mee", "daar",
    "heb", "zou", "wat", "kan", "aan", "iets", "hier",
    "met", "moet", "gaan", "deze", "graag", "alle", "zeker", "maken", "nog", "hoor", "al", "zouden", "vanaf",
    "toch", "jij", "zitten", "waar", "meer", "gedaan", "ben", "geven", "even", "als", "alles", "doen", "via",
    "kunnen", "jullie", "onze", "door", "mag", "willen", "staan", "weet", "krijgt", "houden", "gaat",
    "geef", "zien", "daarna", "wij", "weten", "komen", "omdat", "mijn", "op", "de",
}
stopwords = set(STOPWORDS) | extra_stopwords

In [1140]:
def remove_stopwords(text):
    """
    Strips single-character tokens and stopwords from a chat text.

    Tokens are obtained by splitting on whitespace and are compared to the
    module-level `stopwords` set exactly as written (case-sensitive).

    Parameters:
    text: The chat text to be cleaned (str)

    Returns:
    The remaining tokens joined by single spaces (str)
    """
    kept_tokens = [
        token
        for token in text.split()
        if len(token) > 1 and token not in stopwords
    ]
    return ' '.join(kept_tokens)

df['Gespreksdata'] = df['Gespreksdata'].map(remove_stopwords)

In [1141]:
def crop_messages(text, max_messages=4):
    """
    Keeps only the first `max_messages` qualifying messages of a conversation.

    The text is split on message headers of the form
    'HH:MM:SS DD-MM-YYYY <digits>:'. Each piece between two headers (and any
    text before the first header) counts as one message. A message is
    discarded when it

    - starts with an upper-case '<NAME> AAN ' marker, or
    - contains fewer than two whitespace-separated tokens.

    Parameters:
    text: The chat text to be cropped (str)
    max_messages: Maximum number of qualifying messages to keep (int, default 4)

    Returns:
    The kept messages concatenated in their original order, without the
    message headers (str)
    """

    # Header that precedes every message: time, date and a numeric id followed by a colon
    header_pattern = r'\d{2}:\d{2}:\d{2} \d{2}-\d{2}-\d{4} \d+:'
    messages = re.split(header_pattern, text)

    # Drop '<NAME> AAN ' marker messages (upper-case only, so this must run
    # before the text is lower-cased) and messages with fewer than two tokens
    filtered_messages = [
        message
        for message in messages
        if len(message.split()) > 1 and not re.match(r'^[A-Z\s]+AAN\s', message)
    ]

    # Keep only the first `max_messages` messages
    return ''.join(filtered_messages[:max_messages])

# Crop every conversation to its first messages.
df['Gespreksdata'] = df['Gespreksdata'].map(crop_messages)

In [None]:
def clean_text(text):
    """
    Normalises a chat text to lower-case words separated by single spaces.

    Every character that is neither alphabetic nor whitespace is replaced by a
    space, alphabetic characters are lower-cased, and runs of whitespace are
    collapsed (leading and trailing whitespace is removed).

    Parameters:
    text: The chat text to be cleaned (str)

    Returns:
    Cleaned text (str)
    """

    normalised_chars = []
    for char in text:
        keep = char.isalpha() or char.isspace()
        normalised_chars.append(char.lower() if keep else ' ')

    # split() without arguments collapses any run of whitespace
    words = ''.join(normalised_chars).split()
    return ' '.join(words)

# Normalise the cropped chat text (letters only, lower case, single spaces).
df['Gespreksdata'] = df['Gespreksdata'].map(clean_text)

### Extracting Sentiment Score from Chat Message feature

In [1145]:
def _analyze_sentiment(text):
    """
    Runs the textblob_nl pattern analyzer once on the text.

    Parameters:
    text: The text to analyse (str)

    Returns:
    Tuple with the first two elements of the analyzer result: the value used as
    sentiment score and the value used as subjectivity score (tuple)
    """
    blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    result = blob.sentiment
    return result[0], result[1]

def compute_sentiment_score(text):
    """
    Returns the sentiment score (first element of the analyzer result) of the text.

    Parameters:
    text: The text to analyse (str)

    Returns:
    Sentiment score
    """
    return _analyze_sentiment(text)[0]

def compute_subjectivity_score(text):
    """
    Returns the subjectivity score (second element of the analyzer result) of the text.

    Parameters:
    text: The text to analyse (str)

    Returns:
    Subjectivity score
    """
    return _analyze_sentiment(text)[1]

# Analyse every text only once and derive both columns from that single pass,
# instead of building the tagger/analyzer and analysing each text twice.
sentiment_pairs = df['Gespreksdata'].apply(_analyze_sentiment)
df['sentiment_score'] = sentiment_pairs.apply(lambda pair: pair[0])
df['subjectivity_score'] = sentiment_pairs.apply(lambda pair: pair[1])

In [None]:
# The subjectivity score is not used further; only the sentiment score is kept.
df.drop('subjectivity_score', axis=1, inplace=True)

In [1146]:
# Store the 'Land' column with an integer dtype.
df['Land'] = df['Land'].astype('int')

### Rounding Grades

In [1147]:
# Round grades to the nearest half point; a grade of exactly 0.0 is mapped to 1.0.
# Note: Python's round() resolves exact .5 ties to the nearest even integer, so
# e.g. 6.25 -> 6.0 and 6.75 -> 7.0.
df['Rounded_Grade'] = df['Grade'].apply(lambda x: 1.0 if x == 0.0 else round(x * 2) / 2)

grade_counts = df['Rounded_Grade'].value_counts()

# Keep only rows whose rounded grade occurs more than once, then let the rounded
# grade replace the original 'Grade' column. The non-inplace chain produces a new
# DataFrame, so nothing is modified through a filtered slice of df
# (the in-place version triggers SettingWithCopyWarning).
repeated_grades = grade_counts[grade_counts > 1].index
df_filtered = (
    df[df['Rounded_Grade'].isin(repeated_grades)]
    .drop(columns=['Grade'])
    .rename(columns={'Rounded_Grade': 'Grade'})
)
df = df_filtered

In [1152]:
# Remove exact duplicate rows again, keeping the first occurrence
# (presumably to catch rows that became identical after the preprocessing above).
df.drop_duplicates(keep='first', inplace=True)

## Train-Test Split

In [1156]:
# Hold out 15% of the rows as test set, stratified on the (rounded) grade.
y = df['Grade']
train, test = train_test_split(
    df,
    stratify=y,
    test_size=0.15,
    random_state=42,
)

## Resampling

### Undersampling + Synonym oversampling

In [1159]:
# Select the training rows with a grade lower than 7. .copy() makes this an
# independent DataFrame, so the text replacement below does not write into a
# slice of `train` (which raises SettingWithCopyWarning).
subset_train = train[train['Grade'] < 7].copy()

# Load synonyms data: one word per row followed by up to five synonyms
synonyms_data = pd.read_csv(
    'synonyms.tsv',
    sep='\t',
    header=None,
    names=['Word', 'Synonym1', 'Synonym2', 'Synonym3', 'Synonym4', 'Synonym5'],
    usecols=[0, 1, 2, 3, 4, 5],
)

# Create a dictionary mapping each word to its list of synonyms
synonyms_dict = {}
for index, row in synonyms_data.iterrows():
    word = row['Word']
    # Take the synonym columns by position first, then drop the empty ones
    synonyms = row.iloc[1:].dropna().tolist()
    # Skip rows without a word or without any synonym; an empty list would
    # make the random choice below fail with an IndexError
    if pd.isna(word) or not synonyms:
        continue
    synonyms_dict[word] = synonyms

# Dedicated, seeded generator so the augmented texts are reproducible
synonym_rng = random.Random(42)

def _swap_synonyms(text):
    """Returns the text with every token that has synonyms replaced by a randomly chosen synonym."""
    return ' '.join(
        synonym_rng.choice(synonyms_dict[token]) if token in synonyms_dict else token
        for token in text.split()
    )

# Replace words with synonyms in the copied low-grade rows
subset_train['Gespreksdata'] = subset_train['Gespreksdata'].apply(_swap_synonyms)

# Add the synonym-substituted copies to the original training rows
train = pd.concat([subset_train, train], ignore_index=True)

In [1160]:
# RANDOM UNDERSAMPLING

# Binary helper label used only to drive the sampler: 1 for grades of 7 and
# above, 0 for grades below 7. It is kept as a separate Series so that `train`
# never carries a temporary column.
X = train
y = (train['Grade'] >= 7).astype(int).rename('above_7')

# A float sampling_strategy is the requested ratio of minority to majority
# samples after resampling (see the imbalanced-learn documentation).
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

X_resampled, y_resampled = rus.fit_resample(X, y)

# The resampled features already are the new training set; the helper label
# does not need to be merged back in and dropped again.
train = pd.DataFrame(X_resampled, columns=X.columns)

print(train['Grade'].value_counts())

Grade
8.0     1524
7.0     1201
9.0      995
6.0      870
10.0     803
5.0      450
1.0      318
4.0      242
3.0      182
2.0      156
6.5      128
8.5      114
7.5       61
9.5       26
5.5       12
2.5        4
Name: count, dtype: int64


## Save Train and Test set
The resampled training set and the (non-resampled) test set, both limited to a maximum of 4 chat messages per conversation, are used by all models in the max. 4 chats experiments.

In [1163]:
# Write both sets to CSV without the DataFrame index.
for frame, path in ((train, 'train4.csv'), (test, 'test4.csv')):
    frame.to_csv(path, index=False)