# Initialize

## Packages

In [None]:
# Standard library (os is used for paths and the working directory)
import os
import sys

# Pandas, numpy
import pandas as pd
import numpy as np

#Sklearn (Performance Metric Calculation)
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, cohen_kappa_score

#Random for setting seeds
import random

#Import pickle
import pickle

#Import widgets
import ipywidgets

#NLTK
!pip install nltk
import nltk

#HanTa (Hanover Tagger) (for lemmatization)
!pip install HanTa
from HanTa import HanoverTagger as ht

#Import catboost
!pip install catboost
from catboost.text_processing import Tokenizer, Dictionary
from catboost import CatBoostClassifier, Pool, metrics, cv

## Mounting

In [None]:
# Google-Drive Mounting

from google.colab import drive
drive.mount('/content/drive')

# Working Directory
os.chdir('drive/MyDrive')
!ls

# Read data

## Read: BBGCPure with Finetuning and without data augmentation

In [None]:
#Load the train / test / validation split of the BBGCPure dataset.
#Paths are relative to the current working directory.
#NOTE(review): pickle.load can execute code contained in the file - only load trusted files.
_raw_split_objects = []
for _relative_path in ('0_Ergebnisse/220402_0_bbgc_pure_train_fulldataset.pkl',
                       '0_Ergebnisse/220402_0_bbgc_pure_test_fulldataset.pkl',
                       '0_Ergebnisse/220402_0_bbgc_pure_valid_fulldataset.pkl'):
  #Path
  abspath = os.path.abspath(_relative_path)
  # pkl
  with open(abspath, 'rb') as pkl:
    _raw_split_objects.append(pickle.load(pkl))

#Same order as the paths above: train, test, validation
training_bbgcpure, testing_bbgcpure, validation_bbgcpure = _raw_split_objects

# Data preprocessing

## General function: Text preprocessing

In [None]:
#Align / correct accident descriptions
def correct_participant(df):
  """Unify participant references and line-break artefacts in ``Description``.

  Works on a copy; the passed frame is not modified. All replacements are
  case-insensitive and are applied in the order listed below. Missing
  descriptions (NaN) are left untouched by ``Series.str.replace``.

  Args:
    df: DataFrame with a text column ``Description``.

  Returns:
    A new DataFrame with the normalized ``Description`` column.
  """
  #Order matters: the longer "ON 01"/"ON01" forms must be handled before the bare "01".
  #NOTE(review): the bare "01"/"02" patterns also match inside longer numbers
  #(e.g. "2001") - confirm that this is acceptable for the data.
  replacements = [
      #Participant 1
      ("ON 01", "Beteiligter 1"),
      ("ON01", "Beteiligter 1"),
      ("01", "Beteiligter 1"),
      ("Teilnehmer 1", "Beteiligter 1"), #relevant for data augmentation
      #Participant 2
      ("ON 02", "Beteiligter 2"),
      ("ON02", "Beteiligter 2"),
      ("02", "Beteiligter 2"),
      ("Teilnehmer 2", "Beteiligter 2"), #relevant for data augmentation
      #Excel carriage-return artefact and line breaks
      ("_x000D_", " "),
      ("\n", " "),
  ]
  df_new = df.copy()
  for pattern, replacement in replacements:
    #regex = True is required explicitly: since pandas 2.0 the default is
    #regex = False, which does not accept case = False. None of the patterns
    #contains regex metacharacters, so they still match literally.
    df_new["Description"] = df_new["Description"].str.replace(
        pattern, replacement, case = False, regex = True)
  #Return
  return df_new

#Replace missing accident descriptions with a placeholder sentence
def correct_na(df):
  """Fill missing ``Description`` values in place and return the same frame.

  Args:
    df: DataFrame with a ``Description`` column; it is modified in place.

  Returns:
    The very same DataFrame object that was passed in.
  """
  placeholder = 'Keine Unfallbeschreibung vorhanden.'
  filled = df.Description.fillna(placeholder)
  df["Description"] = filled
  return df

#Split every description into a list of sentences
def split_paragraphs(data):
  """Replace each ``Description`` string by the list of its sentences.

  The frame is modified in place and returned.
  NOTE(review): ``sent_tokenize`` is called without a ``language`` argument;
  the descriptions look German - confirm that the default is intended.

  Args:
    data: DataFrame with a string column ``Description``.

  Returns:
    The same DataFrame with ``Description`` holding lists of sentences.
  """
  def _to_sentences(row):
    return nltk.tokenize.sent_tokenize(row["Description"])

  #Download the "punkt" resource needed by the sentence tokenizer
  nltk.download("punkt")
  #Row-wise split into list of sentences
  data["Description"] = data.apply(_to_sentences, axis = 1)
  return data


#Main function
def main_correct(df):
  """Run the text clean-up steps on the accident descriptions.

  First aligns the participant wording (on a copy of ``df``), then replaces
  missing descriptions. The sentence split (``split_paragraphs``) is
  currently disabled.

  Args:
    df: DataFrame with a ``Description`` column.

  Returns:
    The cleaned DataFrame.
  """
  return correct_na(correct_participant(df))


## General functions

In [None]:
#General functions

#Force pandas to show all columns (no truncation of wide frames in the output)
pd.set_option('display.max_columns', None)


#List with all 47 possible 3ATs
#The position of a code in this list is used as its class index (see recode),
#so the order must not be changed once data has been recoded.
classes_list = [201, 202, 203, 204, 209,
                211, 212, 213, 214, 215, 219,
                221, 222, 223, 224, 225, 229,
                231, 232, 233, 239,
                241, 242, 243, 244, 245, 249,
                251, 252, 259,
                261, 262, 269,
                271, 272, 273, 274, 275, 279,
                281, 282, 283, 284, 285, 286, 289,
                299]

#Recode
def recode(df, classes_list):
  """Replace each ``AccidentType`` code by its position in ``classes_list``.

  Works on a copy; the passed frame is not modified.

  Args:
    df: DataFrame with an ``AccidentType`` column.
    classes_list: Sequence of all valid codes; the index of a code in this
      sequence becomes its new label.

  Returns:
    A new DataFrame whose ``AccidentType`` column holds int64 class indices.

  Raises:
    ValueError: If a value (including a missing value) is not contained in
      ``classes_list``.
  """
  #Build the lookup once instead of calling list.index for every row.
  #setdefault keeps the first position of a duplicated code, like list.index.
  position = {}
  for idx, code in enumerate(classes_list):
    position.setdefault(code, idx)
  #Copy existing dataframe
  df_new = df.copy()
  mapped = df_new['AccidentType'].map(position)
  unknown = df_new.loc[mapped.isna(), 'AccidentType']
  if not unknown.empty:
    raise ValueError(
        "AccidentType values not contained in classes_list: "
        f"{sorted(set(map(str, unknown)))}")
  #Without unknown codes the column contains whole numbers only
  df_new['AccidentType'] = mapped.astype("int64")
  return df_new



#Find variables with missing values
def find_missingvalues(data):
  """Count the missing values per column.

  Args:
    data: DataFrame to inspect.

  Returns:
    Series indexed by column name with the number of missing values,
    restricted to the columns that have at least one. The result used to be
    computed and discarded; existing callers that ignore the return value
    are unaffected.
  """
  null_value_stats = data.isnull().sum(axis=0)
  return null_value_stats[null_value_stats != 0]

#Replace missing values with the string "-9999"
def fill_na(data, fillna):
  """Optionally replace all missing values by the marker string ``"-9999"``.

  Args:
    data: DataFrame to process; it is never modified.
    fillna: If truthy, missing values are filled; otherwise the data is
      returned unchanged (previously this case raised UnboundLocalError).

  Returns:
    A new DataFrame (a filled frame or a plain copy).
  """
  if not fillna:
    return data.copy()
  #The marker is deliberately kept as the string "-9999" (not a number),
  #exactly as before.
  return data.fillna("-9999")


#Remove columns that must not be used for the prediction
def delete_columns(data, twodigit, threedigit):
  """Return a copy of ``data`` without the bookkeeping / unused label columns.

  Args:
    data: DataFrame containing at least ``ID``, ``Year`` and ``Hour``.
    twodigit: If truthy, the column ``AccidentType2`` is removed as well.
    threedigit: If truthy, the column ``AccidentType`` is removed as well.

  Returns:
    A new DataFrame; ``data`` itself is left untouched.

  Raises:
    KeyError: If a column that should be removed does not exist.
  """
  #Always dropped: ID, Year (prediction should not depend on the year)
  #and Hour (redundant, due to variable HourMinute)
  obsolete = ['ID', 'Year', 'Hour']
  if twodigit:
    obsolete.append('AccidentType2')
  if threedigit:
    obsolete.append('AccidentType')
  #The Description column is intentionally kept
  return data.drop(columns = obsolete)



#Convert string objects to integers (i.e. categorical variables)
def convert_strings(data, cat_columns, num_cols = None):
  """Integer-encode the categorical columns and cast the numeric columns to int.

  NOTE(review): the integer codes are derived from the values present in
  ``data`` only. Encoding train / test / validation frames separately can give
  the same category different codes in different frames - confirm that all
  splits contain identical category sets.

  Args:
    data: DataFrame to convert; it is not modified.
    cat_columns: Names of the categorical columns.
    num_cols: Names of the columns to cast to ``int``. Defaults to the
      module-level ``num_columns`` list, which is what the function always
      used before this parameter existed.

  Returns:
    A new DataFrame with ``category`` dtype (of integer codes) for
    ``cat_columns`` and integer dtype for ``num_cols``.
  """
  if num_cols is None:
    #Backward compatible fallback to the notebook-level column list
    num_cols = num_columns
  #Copy
  df = data.copy()
  #Plain column assignment replaces the column including its dtype; assigning
  #through df.loc[:, cols] writes into the existing column on pandas >= 2.0
  #and silently keeps the old object dtype.
  for col in cat_columns:
    codes = pd.Categorical(df[col]).codes
    df[col] = pd.Series(codes, index = df.index).astype("category")
  #Convert the wrongly classified object columns to numerical ones
  for col in num_cols:
    df[col] = df[col].astype("int")
  #Return
  return df


#Classify all categorical variables as categorical (important for later prediction)
def classify_categorical(data, cat_columns):
  """Return a copy of ``data`` with ``cat_columns`` cast to ``category`` dtype.

  ``DataFrame.astype`` is used instead of assigning through ``df.loc``, because
  the latter keeps the previous dtype on pandas >= 2.0.

  Args:
    data: DataFrame to convert; it is not modified.
    cat_columns: Names of the columns to convert.

  Returns:
    A new DataFrame.

  Raises:
    KeyError: If a name in ``cat_columns`` is not a column of ``data``.
  """
  return data.astype({col: "category" for col in cat_columns})


#Numerical features must be treated as embedding
def convert_to_embedding(data):
  """Collapse all float32 columns into one list-valued ``Embedding`` column.

  Every float32 column is treated as one embedding dimension. Missing values
  are dropped per row, so a row with missing values gets a shorter list. Rows
  (or frames) without any float32 value get an empty list instead of raising.

  Args:
    data: DataFrame whose float32 columns hold the embedding; not modified.

  Returns:
    A new DataFrame with all non-float32 columns of ``data`` plus the column
    ``Embedding`` (list of Python floats per row).
  """
  float_part = data.select_dtypes(include = ['float32'])
  #Work on an explicit copy so the new column is not written into a derived frame
  df = data.select_dtypes(exclude = ['float32']).copy()
  if float_part.empty:
    #No float32 columns (or no rows): nothing to concatenate
    embedding = pd.Series([[] for _ in range(len(data))], index = data.index, dtype = object)
  else:
    #Same string round trip as before, so the resulting float values are unchanged
    joined = float_part.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
    #An all-missing row yields '', which must not be passed to float()
    embedding = joined.apply(lambda text: [float(item) for item in text.split(sep=",") if item])
  df['Embedding'] = embedding
  return df



#Main function
def main_prepare(data, fillna, twodigit, threedigit, cat_columns):
  """Run the complete tabular preparation pipeline on one dataset.

  Steps, in this order: missing-value check, text correction, label recoding
  against the module-level ``classes_list``, missing-value filling, encoding
  of categorical / numerical columns, embedding construction, column removal.

  Args:
    data: Raw DataFrame of one split.
    fillna: Passed to ``fill_na``.
    twodigit: Passed to ``delete_columns`` (drop ``AccidentType2``).
    threedigit: Passed to ``delete_columns`` (drop ``AccidentType``).
    cat_columns: Names of the categorical columns.

  Returns:
    The prepared DataFrame.
  """
  #Missing value check
  find_missingvalues(data)
  #Correct text (see general functions "Text preprocessing")
  corrected = main_correct(data)
  #Recode accidenttype / labels
  recoded = recode(corrected, classes_list)
  #Replace missing values
  filled = fill_na(recoded, fillna)
  #Convert strings to integers and classify them as category
  encoded = convert_strings(filled, cat_columns)
  #Convert BERT-columns (768 columns) to one embedding
  embedded = convert_to_embedding(encoded)
  #Delete unnecessary columns
  return delete_columns(embedded, twodigit, threedigit)


## Prepare: BBGCPure with Finetuning and without data augmentation

In [None]:
#Column groups
cat_columns = [
    "Month", "Weekday", "StreetClass", "RoadCondition", "LightCondition", "Weather",
    "Obstacle", "Urban", "Rural", "CyclePath", "Sidewalk",
    "TrafficLightOn", "TrafficLightOff", "Alcohol", "Drugs", "Medicines",
    "SpeedLimit", "HitAndRun", "CollisionType", "Cause",
    "Participant1", "InjuryP1", "CauseP1",
    "Participant2", "CauseP2", "InjuryP2",
]
num_columns = ["AgeP1", "AgeP2", "PropertyDamage", "TextLength"]
text_columns = ["Description"]
emb_columns = ["Embedding"]

#Raw splits in the order train, test, validation
_raw_splits = (training_bbgcpure, testing_bbgcpure, validation_bbgcpure)

#Datasets with only three digit accident types (twodigit is deleted), with replaced values
train_bbgcpure_three, test_bbgcpure_three, valid_bbgcpure_three = [
    main_prepare(raw_split, True, True, False, cat_columns) for raw_split in _raw_splits]

#Datasets with only two digit accident types (threedigit is deleted), with replaced values
train_bbgcpure_two, test_bbgcpure_two, valid_bbgcpure_two = [
    main_prepare(raw_split, True, False, True, cat_columns) for raw_split in _raw_splits]

#Head
train_bbgcpure_three.head()

In [None]:
#Separate features and label variable

#List with different subset of variables
cat_num_col = cat_columns + num_columns
cat_num_txt_col = cat_columns + num_columns + text_columns


def _feature_views(frame, label):
  """Return the five feature views and the label Series of one prepared split.

  Order of the result: all features, features without description, float32
  columns only, categorical + numerical columns, categorical + numerical +
  text columns, label.
  NOTE(review): convert_to_embedding removes the float32 columns, so the
  float32-only view is presumably empty - confirm.
  """
  return (
      frame.drop(columns = [label]),
      frame.drop(columns = [label, "Description"]),
      frame.select_dtypes(include=['float32']),
      frame.filter(cat_num_col),
      frame.filter(cat_num_txt_col),
      frame[label],
  )


#3-digit
(X_train_three_all, X_train_three, X_train_three_num,
 X_train_three_cat, X_train_three_cat_txt, Y_train_three) = _feature_views(train_bbgcpure_three, "AccidentType")

(X_test_three_all, X_test_three, X_test_three_num,
 X_test_three_cat, X_test_three_cat_txt, Y_test_three) = _feature_views(test_bbgcpure_three, "AccidentType")

(X_valid_three_all, X_valid_three, X_valid_three_num,
 X_valid_three_cat, X_valid_three_cat_txt, Y_valid_three) = _feature_views(valid_bbgcpure_three, "AccidentType")


#2-digit
(X_train_two_all, X_train_two, X_train_two_num,
 X_train_two_cat, X_train_two_cat_txt, Y_train_two) = _feature_views(train_bbgcpure_two, "AccidentType2")

(X_test_two_all, X_test_two, X_test_two_num,
 X_test_two_cat, X_test_two_cat_txt, Y_test_two) = _feature_views(test_bbgcpure_two, "AccidentType2")

(X_valid_two_all, X_valid_two, X_valid_two_num,
 X_valid_two_cat, X_valid_two_cat_txt, Y_valid_two) = _feature_views(valid_bbgcpure_two, "AccidentType2")

In [None]:
#Inspect dataset
#Head: first rows of the recoded 3-digit training labels
Y_train_three.head()
#Variable types (disabled; enable to list the dtype of every feature column)
#X_train_three.info(verbose = True)

In [None]:
#First rows of the 3-digit training features (label and description removed)
X_train_three.head()

## CatBoost: Text preprocessing

In [None]:
#Functions for Tokenizing, Dictionary, etc.


#Initialize Tokenizer
#https://catboost.ai/en/docs/concepts/python-reference_tokenizer
#The same settings are repeated in the "tokenizers" entry inside fit_model.
_tokenizer_options = {
    'separator_type': 'BySense',
    'lowercasing': True,
    'number_process_policy': "LeaveAsIs",
    'skip_empty': True,
    'token_types': ['Word', 'Number', 'SentenceBreak', 'ParagraphBreak'],
    'sub_tokens_policy': 'SeveralTokens',
}
tokenizer = Tokenizer(**_tokenizer_options)


def tokenize_data(df, tokenizer):
  """Tokenize the ``Description`` of every row.

  Args:
    df: DataFrame with a string column ``Description``; not modified.
    tokenizer: Object with a ``tokenize(text)`` method.

  Returns:
    A copy of ``df`` whose ``Description`` holds the tokenizer output per row.
  """
  def _tokens_of(row):
    return tokenizer.tokenize(row['Description'])

  tokenized = df.copy()
  tokenized['Description'] = tokenized.apply(_tokens_of, axis = 1)
  return tokenized


#Already loaded HanTa taggers, keyed by model file name
_HANTA_TAGGERS = {}


def _get_tagger(model_name = 'morphmodel_ger.pgz'):
  #Return the tagger for model_name, loading the model only on first use
  if model_name not in _HANTA_TAGGERS:
    _HANTA_TAGGERS[model_name] = ht.HanoverTagger(model_name)
  return _HANTA_TAGGERS[model_name]


def lemmatize_sentence(tokenlist):
  """Lemmatize a list of tokens with the German HanTa model.

  The tagger is created once and reused: previously the model file was loaded
  again on every call, i.e. once per row in ``lemmatize_data``.

  Args:
    tokenlist: Iterable of word tokens.

  Returns:
    List with the lemma of every token (the POS tag returned by the tagger
    is discarded).
  """
  tagger = _get_tagger()
  #analyze(..., taglevel = 1) returns a (lemma, POS) pair; keep the lemma only
  return [tagger.analyze(word, taglevel = 1)[0] for word in tokenlist]


def lemmatize_data(df):
  """Lemmatize the tokenized ``Description`` of every row.

  Uses HanTa (Hanover Tagger), see
  https://textmining.wp.hs-hannover.de/Preprocessing.html and
  https://doi.org/10.25968/opus-1527 (published in 2019).

  Args:
    df: DataFrame whose ``Description`` holds a list of tokens; not modified.

  Returns:
    A copy of ``df`` whose ``Description`` holds the list of lemmas per row.
  """
  def _lemmas_of(row):
    return lemmatize_sentence(row["Description"])

  lemmatized = df.copy()
  lemmatized["Description"] = lemmatized.apply(_lemmas_of, axis = 1)
  return lemmatized


#Undo the tokenization: join the tokens of a description back into one string
def jointostring(df):
  """Join the token list in ``Description`` with single spaces.

  Args:
    df: DataFrame whose ``Description`` holds lists of strings; not modified.

  Returns:
    A copy of ``df`` with ``Description`` as one string per row.
  """
  def _joined(row):
    return ' '.join(row["Description"])

  joined = df.copy()
  joined["Description"] = joined.apply(_joined, axis = 1)
  return joined


def main_textpreprocessing(df, tokenizer, token, lemma, join):
  """Apply the selected text preprocessing steps to ``Description``.

  The steps run in the fixed order tokenize -> lemmatize -> join. Joining
  turns the tokens back into a string, because CatBoost does not accept
  tokenized data as input (its own tokenizer is configured in the model);
  the joined text is then lemmatized.

  Args:
    df: DataFrame with a ``Description`` column.
    tokenizer: Tokenizer passed to ``tokenize_data``.
    token: Whether to tokenize.
    lemma: Whether to lemmatize (expects tokenized input).
    join: Whether to join the tokens back into one string.

  Returns:
    The processed DataFrame (``df`` itself if no step is enabled).
  """
  steps = (
      (token, lambda frame: tokenize_data(frame, tokenizer)),
      (lemma, lemmatize_data),
      (join, jointostring),
  )
  for enabled, step in steps:
    if enabled:
      df = step(df)
  return df


In [None]:
#Preprocess data for prediction: 3-digit-accident type (3AT)
#The text is tokenized, lemmatized (only possible on tokens) and joined back into a
#string, because the actual tokenizer must be configured directly in CatBoost.
#Tokenizing here is therefore only a detour to be able to lemmatize.

#Order: training, test, validation data
X_train_three_all_pre, X_test_three_all_pre, X_valid_three_all_pre = [
    main_textpreprocessing(features, tokenizer, token = True, lemma = True, join = True)
    for features in (X_train_three_all, X_test_three_all, X_valid_three_all)]

In [None]:
#Preprocess data for prediction: 2-digit-accident type (2AT)
#The text is tokenized, lemmatized (only possible on tokens) and joined back into a
#string, because the actual tokenizer must be configured directly in CatBoost.
#Tokenizing here is therefore only a detour to be able to lemmatize.

#Order: training, test, validation data
X_train_two_all_pre, X_test_two_all_pre, X_valid_two_all_pre = [
    main_textpreprocessing(features, tokenizer, token = True, lemma = True, join = True)
    for features in (X_train_two_all, X_test_two_all, X_valid_two_all)]

In [None]:
#Save preprocessed data: 3-digit-accident type (3AT)
#One pickle file per object: features and labels of training, test and validation data
_three_digit_dumps = [
    ('0_Ergebnisse/Catboost/220624_X_train_three_all_pre.pkl', X_train_three_all_pre),
    ('0_Ergebnisse/Catboost/220624_Y_train_three.pkl', Y_train_three),
    ('0_Ergebnisse/Catboost/220624_X_test_three_all_pre.pkl', X_test_three_all_pre),
    ('0_Ergebnisse/Catboost/220624_Y_test_three.pkl', Y_test_three),
    ('0_Ergebnisse/Catboost/220624_X_valid_three_all_pre.pkl', X_valid_three_all_pre),
    ('0_Ergebnisse/Catboost/220624_Y_valid_three.pkl', Y_valid_three),
]
for _target, _payload in _three_digit_dumps:
  abspath = os.path.abspath(_target)
  with open(abspath, 'wb') as handle:
    pickle.dump(_payload, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Save preprocessed data: 2-digit-accident type (2AT)
#One pickle file per object: features and labels of training, test and validation data
_two_digit_dumps = [
    ('0_Ergebnisse/Catboost/220624_X_train_two_all_pre.pkl', X_train_two_all_pre),
    ('0_Ergebnisse/Catboost/220624_Y_train_two.pkl', Y_train_two),
    ('0_Ergebnisse/Catboost/220624_X_test_two_all_pre.pkl', X_test_two_all_pre),
    ('0_Ergebnisse/Catboost/220624_Y_test_two.pkl', Y_test_two),
    ('0_Ergebnisse/Catboost/220624_X_valid_two_all_pre.pkl', X_valid_two_all_pre),
    ('0_Ergebnisse/Catboost/220624_Y_valid_two.pkl', Y_valid_two),
]
for _target, _payload in _two_digit_dumps:
  abspath = os.path.abspath(_target)
  with open(abspath, 'wb') as handle:
    pickle.dump(_payload, handle, protocol = pickle.HIGHEST_PROTOCOL)

# CatBoost: Load preprocessed data

## Import

In [None]:
#Import preprocessed data: 3-digit accident type (3AT)
#NOTE(review): pickle.load can execute code contained in the file - only load trusted files.
_three_digit_objects = []
for _source in ('0_Ergebnisse/Catboost/220624_X_train_three_all_pre.pkl',
                '0_Ergebnisse/Catboost/220624_Y_train_three.pkl',
                '0_Ergebnisse/Catboost/220624_X_test_three_all_pre.pkl',
                '0_Ergebnisse/Catboost/220624_Y_test_three.pkl',
                '0_Ergebnisse/Catboost/220624_X_valid_three_all_pre.pkl',
                '0_Ergebnisse/Catboost/220624_Y_valid_three.pkl'):
  abspath = os.path.abspath(_source)
  with open(abspath, 'rb') as pkl:
    _three_digit_objects.append(pickle.load(pkl))

#Same order as the paths above: training, test, validation (features, labels)
(X_train_three_all_pre, Y_train_three,
 X_test_three_all_pre, Y_test_three,
 X_valid_three_all_pre, Y_valid_three) = _three_digit_objects

In [None]:
#Import preprocessed data: 2-digit accident type (2AT)
#NOTE(review): pickle.load can execute code contained in the file - only load trusted files.
_two_digit_objects = []
for _source in ('0_Ergebnisse/Catboost/220624_X_train_two_all_pre.pkl',
                '0_Ergebnisse/Catboost/220624_Y_train_two.pkl',
                '0_Ergebnisse/Catboost/220624_X_test_two_all_pre.pkl',
                '0_Ergebnisse/Catboost/220624_Y_test_two.pkl',
                '0_Ergebnisse/Catboost/220624_X_valid_two_all_pre.pkl',
                '0_Ergebnisse/Catboost/220624_Y_valid_two.pkl'):
  abspath = os.path.abspath(_source)
  with open(abspath, 'rb') as pkl:
    _two_digit_objects.append(pickle.load(pkl))

#Same order as the paths above: training, test, validation (features, labels)
(X_train_two_all_pre, Y_train_two,
 X_test_two_all_pre, Y_test_two,
 X_valid_two_all_pre, Y_valid_two) = _two_digit_objects

## Classes

In [None]:
#List with all possible 47 accident types
#Repeats the definition from the data preparation part of the notebook;
#presumably so that the notebook can be resumed from the saved pickles - keep both in sync.
#The position of a code in this list is its class index in the recoded labels.
classes_list = [201, 202, 203, 204, 209,
                211, 212, 213, 214, 215, 219,
                221, 222, 223, 224, 225, 229,
                231, 232, 233, 239,
                241, 242, 243, 244, 245, 249,
                251, 252, 259,
                261, 262, 269,
                271, 272, 273, 274, 275, 279,
                281, 282, 283, 284, 285, 286, 289,
                299]

#Column specifications
#cat_columns: categorical features, num_columns: numerical features,
#text_columns: free-text accident description, emb_columns: embedding column
cat_columns = ["Month", "Weekday", "StreetClass", "RoadCondition", "LightCondition", "Weather" , "Obstacle","Urban", "Rural", "CyclePath", "Sidewalk", "TrafficLightOn", "TrafficLightOff", "Alcohol", "Drugs", "Medicines", "SpeedLimit", "HitAndRun", "CollisionType","Cause", "Participant1", "InjuryP1", "CauseP1","Participant2", "CauseP2", "InjuryP2"]
num_columns = ["AgeP1", "AgeP2", "PropertyDamage", "TextLength"]
text_columns = ["Description"]
emb_columns = ["Embedding"]

## Class weights

In [None]:
#Function to get class weights

def get_weights(Y_data):
  """Compute 'balanced' class weights for the labels in ``Y_data``.

  Args:
    Y_data: Array-like of class labels.

  Returns:
    dict mapping every distinct label in ``Y_data`` to the weight returned by
    sklearn's ``compute_class_weight(class_weight = 'balanced', ...)``.
  """
  #Distinct labels, sorted (np.unique)
  labels = np.unique(Y_data)
  balanced = class_weight.compute_class_weight(class_weight = 'balanced', classes = labels, y = Y_data)
  #Pair every label with its weight
  return dict(zip(list(labels), balanced))

#2-digit (2AT)
Y_train_two_weights_balanced = get_weights(Y_train_two)
print(Y_train_two_weights_balanced)
#3-digit (3AT)
Y_train_three_weights_balanced = get_weights(Y_train_three)
print(Y_train_three_weights_balanced)

## Pooling

In [None]:
#Pool data: 3-digit-accident type (3AT)
def _build_pool(features, labels):
  """Wrap one split into a CatBoost Pool with categorical, text and embedding columns declared."""
  #Positions of those categorical columns that are present in this frame
  cat_positions = [features.columns.get_loc(c) for c in cat_columns if c in features]
  return Pool(features,
              label = labels,
              cat_features = cat_positions,
              text_features= ["Description"],
              embedding_features = ["Embedding"])


train_three_data = _build_pool(X_train_three_all_pre, Y_train_three)

test_three_data = _build_pool(X_test_three_all_pre, Y_test_three)

valid_three_data = _build_pool(X_valid_three_all_pre, Y_valid_three)


In [None]:
#Display the preprocessed 2-digit training features
X_train_two_all_pre

In [None]:
#Pool data: 2-digit-accident type (2AT)
def _build_pool(features, labels):
  """Wrap one split into a CatBoost Pool with categorical, text and embedding columns declared."""
  #Positions of those categorical columns that are present in this frame
  cat_positions = [features.columns.get_loc(c) for c in cat_columns if c in features]
  return Pool(features,
              label = labels,
              cat_features = cat_positions,
              text_features= ["Description"],
              embedding_features = ["Embedding"])


train_two_data = _build_pool(X_train_two_all_pre, Y_train_two)

test_two_data = _build_pool(X_test_two_all_pre, Y_test_two)

valid_two_data = _build_pool(X_valid_two_all_pre, Y_valid_two)

# Catboost: Experiments with flat model design

## Basic model configuration

In [None]:
#Basic model initialization

#https://catboost.ai/en/docs/references/text-processing__specification-example

#Training parameters: https://catboost.ai/en/docs/references/training-parameters/



def fit_model(train_pool, test_pool, **kwargs):
    """Train a CatBoost multi-class classifier with the notebook's base configuration.

    NOTE(review): ``test_pool`` is used as ``eval_set`` together with
    ``use_best_model=True`` and the overfitting detector, so it influences
    which iteration is kept. Metrics measured on that same pool are therefore
    optimistic; only a pool never passed here is a clean hold-out.

    Args:
        train_pool: CatBoost ``Pool`` with the training data.
        test_pool: CatBoost ``Pool`` used as evaluation set during training.
        **kwargs: Additional ``CatBoostClassifier`` parameters (e.g.
            ``iterations``, ``learning_rate``, ``task_type``,
            ``ignored_features``). A key that is also part of the base
            configuration now overrides it; previously this raised a
            TypeError for a duplicated keyword argument.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    params = dict(
        loss_function = "MultiClass",
        random_seed = 42,
        eval_metric='Accuracy',
        #################
        l2_leaf_reg = 3.0, #Default value is 3.0 and gives best values.
        depth = 5, #5 =84,0% / depth 10 is too slow and bad. / default value = 6
        one_hot_max_size = 0, #This means that no feature is one-hot-encoded -> default value is different from 0
        bootstrap_type = 'Bayesian', #Bernoulli, Poisson and NO worse than Bayesian, MVS not for Multiclass GPU -> default = Bayesian
        bagging_temperature = 0.7, #Just for Bayesian bootstrap, default value = 1
        sampling_frequency = 'PerTree', #Default level = PerTreeLevel
        sampling_unit = 'Object',
        grow_policy = 'SymmetricTree',
        has_time = False,
        leaf_estimation_method = 'Newton',
        #leaf_estimation_iterations = 5,
        fold_len_multiplier = 2,
        approx_on_full_history = False,
        auto_class_weights = 'None',
        boosting_type = 'Plain',
        boost_from_average = False, #Default value = False
        score_function = 'Cosine',
        ################
        ##Overfitting detection
        od_type='Iter',
        od_wait=500,
        ################
        tokenizers = [ {
            "tokenizer_id" : "Sense",
            "separator_type" : "BySense",
            "lowercasing" : "true",
            "number_process_policy": "LeaveAsIs",
            "skip_empty": 'true',
            'token_types':['Word', 'Number', 'SentenceBreak', 'ParagraphBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        }],

        dictionaries = [{
            "dictionary_id" : "BiGram",
            "gram_order" : "2",
            'min_token_occurence': '1',
            'max_dictionary_size': '150000'
        }, {
            "dictionary_id" : "Word",
            "gram_order" : "1",
            'min_token_occurence': '1',
            'max_dictionary_size': '150000'
        }],

        feature_calcers = [
                          'BoW:top_tokens_count=1000',
                          'BoW:dictionary_names=BiGram',
                          'NaiveBayes:top_tokens_count=1000',
                          'NaiveBayes:dictionary_names=Word',
                          'BM25:top_tokens_count=1000',
                          'BM25:dictionary_names=Word'
        ],
    )
    #Caller-supplied settings win over the base configuration above
    params.update(kwargs)
    model = CatBoostClassifier(**params)

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=50,
        plot=True,
        use_best_model=True)


In [None]:
#Function to get classification report with correct accident type labels

def get_report(true, pred, labellist=None):
  """Print and return a classification report, optionally with readable class names.

  Only the classes that occur in ``true`` are reported.

  Args:
    true: Array-like of true class indices.
    pred: Array-like of predicted class indices.
    labellist: Optional sequence that maps a class index (its position) to
      the label that should be shown in the report.

  Returns:
    Without ``labellist``: the report as DataFrame.
    With ``labellist``: a tuple ``(true_labels, report)`` where
    ``true_labels`` are the display labels of the classes present in ``true``.
  """
  #Classes that actually occur in the true labels (sorted)
  present = np.unique(true)
  if labellist is None:
    report = classification_report(true, pred, output_dict=True, labels = present)
    #Transform it to pandas
    report_pd = pd.DataFrame(report).transpose()
    print(report_pd)
    return report_pd
  #Get true class labels based on index
  label_series = pd.Series(labellist)
  true_labels = list(label_series[[int(x) for x in present]])
  #labels must be passed together with target_names: otherwise sklearn derives
  #the classes from true AND pred and raises as soon as pred contains a class
  #that is missing in true (number of classes != number of names).
  report_new = classification_report(true, pred, output_dict=True,
                                     labels = present, target_names = true_labels)
  #Transform to pandas again
  report_new_pd = pd.DataFrame(report_new).transpose()
  #Print report
  print(report_new_pd)
  #Return
  return true_labels, report_new_pd

## V1: Catboost cat/num

Catboost model trained in basic configuration just with categorical / numerical data, but without embeddings and without text data (accident description).

In [None]:
#Initialize model
#Training parameters: https://catboost.ai/en/docs/references/training-parameters/


model_v1 = fit_model(
    ################
    train_three_data, test_three_data,
    ignored_features = ["Embedding", "Description"],
    ###############
    iterations=1700,
    learning_rate = 0.1, #Learning rate is dependend on number of iterations for Multiclass and can be overwritten here.
    ###############
    task_type='GPU',
    )



In [None]:
#Save catboost model
model_v1.save_model('0_Ergebnisse/Catboost/models/V1_220627_model_0.6891241578', format = "cbm")

In [None]:
#Predictions: Test-Data

#Predict
v1_test_pred_class = model_v1.predict(test_three_data)
#Report
v1_test_labels, v1_test_report = get_report( Y_test_three, v1_test_pred_class, classes_list)
#Save
v1_test_report.to_pickle('0_Ergebnisse/Catboost/results/V1_220627_test_report_0.6891241578')

In [None]:
#Predictions: Valid-Data

#Predict
v1_valid_pred_class = model_v1.predict(valid_three_data)
#Report
v1_valid_labels, v1_valid_report = get_report(Y_valid_three, v1_valid_pred_class, classes_list)


In [None]:
#Save report
v1_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V1_220627_valid_report_0.635688')

In [None]:
#Save prediction results
v1_results = pd.concat([pd.DataFrame(Y_test_three), pd.DataFrame(v1_test_pred_class),pd.DataFrame(Y_valid_three), pd.DataFrame(v1_valid_pred_class)], axis = 1)
v1_results.columns =[ "Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v1_results.to_pickle('0_Ergebnisse/Catboost/results/V1_220627_all_results')

## V2: Catboost text

Catboost model trained in basic configuration just with text (accident description) data, but without embeddings and without categorical / numerical data.

In [None]:
#Initialize model

model_v2 = fit_model(
    ################
    train_three_data, test_three_data,
    ignored_features = cat_columns + num_columns + emb_columns ,
    ###############
    iterations=1700,
    learning_rate = 0.1, #Learning rate is dependend on number of iterations for Multiclass and can be overwritten here.
    ###############
    task_type='GPU',
    )



In [None]:
#Save catboost model
model_v2.save_model('0_Ergebnisse/Catboost/models/V2_220627_model_0.8046198268', format = "cbm")

In [None]:
#Predictions: Test-Data

#Predict
v2_test_pred_class = model_v2.predict(test_three_data)
#Report
v2_test_labels, v2_test_report = get_report(Y_test_three, v2_test_pred_class, classes_list)
#Save
v2_test_report.to_pickle('0_Ergebnisse/Catboost/results/V2_220627_test_report_0.8046198268')

In [None]:
#Predictions: Valid-Data

#Predict
v2_valid_pred_class = model_v2.predict(valid_three_data)
#Report
v2_valid_labels, v2_valid_report = get_report(Y_valid_three, v2_valid_pred_class, classes_list)


In [None]:
#Save report
v2_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V2_220627_valid_report_0.790892')

In [None]:
#Save prediction results
v2_results = pd.concat([pd.DataFrame(Y_test_three), pd.DataFrame(v2_test_pred_class),pd.DataFrame(Y_valid_three), pd.DataFrame(v2_valid_pred_class)], axis = 1)
v2_results.columns =[ "Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v2_results.to_pickle('0_Ergebnisse/Catboost/results/V2_220627_all_results')

## V3a: Catboost cat/num/text without class_weights

Catboost model trained in basic configuration with text (description), categorical and numerical data, but without embeddings.

In [None]:
# V3a: fit on categorical, numerical and text columns; only the embedding columns are ignored.
model_v3 = fit_model(
    train_three_data,
    test_three_data,
    ignored_features=emb_columns,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
#Save catboost model
model_v3.save_model('0_Ergebnisse/Catboost/models/V3_220627_model_0.8392685274', format = "cbm")

In [None]:
# V3a on the test split: predict classes, build the report, persist the report.
v3_test_pred_class = model_v3.predict(test_three_data)

v3_test_labels, v3_test_report = get_report(
    Y_test_three,
    v3_test_pred_class,
    classes_list,
)

v3_test_report.to_pickle(
    '0_Ergebnisse/Catboost/results/V3_220627_test_report_0.8392685274'
)

In [None]:
# V3a on the validation split: predict classes and build the report.
v3_valid_pred_class = model_v3.predict(valid_three_data)

v3_valid_labels, v3_valid_report = get_report(
    Y_valid_three,
    v3_valid_pred_class,
    classes_list,
)

In [None]:
#Save report
v3_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V3_220627_valid_report_0.808550')

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v3_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v3_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v3_valid_pred_class),
    ],
    axis=1,
)
v3_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v3_results.to_pickle('0_Ergebnisse/Catboost/results/V3_220627_all_results')

## V3b: Catboost cat/num/text with class_weights

In [None]:
from sklearn.utils import class_weight

# 'balanced' per-class weights computed from the 3-digit training labels.
class_weights_three = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(Y_train_three),
    y=Y_train_three,
)

# Pair every distinct label with its weight; this dict is what the V3b model
# cell passes on as `class_weights`.
a = list(np.unique(Y_train_three))
b = class_weights_three
class_weights_three_catboost = dict(zip(a, b))

class_weights_three_catboost

In [None]:
from sklearn.utils import class_weight

def get_class_weights(Y_train):
  """Return 'balanced' class weights for Y_train as a {label: weight} dict.

  The weights are computed by sklearn's compute_class_weight over the distinct
  labels of Y_train; the dict layout is the one this notebook calls the
  catboost format.
  """
  balanced = class_weight.compute_class_weight(
      class_weight = 'balanced',
      classes = np.unique(Y_train),
      y = Y_train,
  )
  # np.unique returns the labels sorted, in the same order as the weights.
  return {label: weight for label, weight in zip(list(np.unique(Y_train)), balanced)}


In [None]:
# V3b: same inputs as V3a (embedding columns ignored), plus balanced class weights.
model_v3b = fit_model(
    train_three_data,
    test_three_data,
    ignored_features=emb_columns,
    class_weights=class_weights_three_catboost,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='GPU',
)


In [None]:
#Save catboost model
model_v3b.save_model('0_Ergebnisse/Catboost/models/V3b_220629_model_0.546679376', format = "cbm")

In [None]:
# V3b on the test split: predict classes, build the report, persist the report.
v3b_test_pred_class = model_v3b.predict(test_three_data)

v3b_test_labels, v3b_test_report = get_report(
    Y_test_three,
    v3b_test_pred_class,
    classes_list,
)

v3b_test_report.to_pickle(
    '0_Ergebnisse/Catboost/results/V3b_220629_test_report_0.546679376'
)

In [None]:
# V3b on the validation split: predict classes and build the report.
v3b_valid_pred_class = model_v3b.predict(valid_three_data)

v3b_valid_labels, v3b_valid_report = get_report(
    Y_valid_three,
    v3b_valid_pred_class,
    classes_list,
)

In [None]:
#Save report
v3b_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V3b_220629_valid_report_0.668216')

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v3b_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v3b_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v3b_valid_pred_class),
    ],
    axis=1,
)
v3b_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v3b_results.to_pickle('0_Ergebnisse/Catboost/results/V3b_220629_all_results')

## V4: Catboost cat/num with BERT embeddings

Catboost model trained in basic configuration with categorical and numerical data and with BERT embeddings, but without text (description) processed by Catboost.

In [None]:
# V4: fit on categorical, numerical and embedding columns; the text columns are ignored.
model_v4 = fit_model(
    train_three_data,
    test_three_data,
    ignored_features=text_columns,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='CPU',
)



In [None]:
#Save catboost model
model_v4.save_model('0_Ergebnisse/Catboost/models/V4_220627_model_0.7305101059', format = "cbm")

In [None]:
#Predictions: Test-Data

#Predict
v4_test_pred_class = model_v4.predict(test_three_data)
#Report
# get_report is called as (true labels, predicted labels, class list), the
# argument order used by every other experiment in this notebook.
v4_test_labels, v4_test_report = get_report(Y_test_three, v4_test_pred_class, classes_list)
#Save
v4_test_report.to_pickle('0_Ergebnisse/Catboost/results/V4_220627_test_report_0.7305101059')

In [None]:
#Predictions: Valid-Data

#Predict
v4_valid_pred_class = model_v4.predict(valid_three_data)
#Report
# get_report is called as (true labels, predicted labels, class list), the
# argument order used by every other experiment in this notebook.
v4_valid_labels, v4_valid_report = get_report(Y_valid_three, v4_valid_pred_class, classes_list)

In [None]:
#Save report
v4_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V4_220627_valid_report_0.663569')

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v4_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v4_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v4_valid_pred_class),
    ],
    axis=1,
)
v4_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v4_results.to_pickle('0_Ergebnisse/Catboost/results/V4_220627_all_results')

## V5: Catboost cat/num/text with BERT embeddings

Catboost model trained in basic configuration with categorical, numerical and text data and with BERT embeddings.

In [None]:
# V5: fit on every column group (categorical, numerical, text, embeddings) - nothing is ignored.
model_v5 = fit_model(
    train_three_data,
    test_three_data,
    ignored_features=[],
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='CPU',
)



In [None]:
#Save catboost model
model_v5.save_model('0_Ergebnisse/Catboost/models/V5_220627_model_0.8267564966', format = "cbm")

In [None]:
# V5 on the test split: predict classes, build the report, persist the report.
v5_test_pred_class = model_v5.predict(test_three_data)

v5_test_labels, v5_test_report = get_report(
    Y_test_three,
    v5_test_pred_class,
    classes_list,
)

v5_test_report.to_pickle(
    '0_Ergebnisse/Catboost/results/V5_220627_test_report_0.8267564966'
)

In [None]:
# V5 on the validation split: predict classes and build the report.
v5_valid_pred_class = model_v5.predict(valid_three_data)

v5_valid_labels, v5_valid_report = get_report(
    Y_valid_three,
    v5_valid_pred_class,
    classes_list,
)

In [None]:
#Save report
v5_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V5_220627_valid_report_0.774164')

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v5_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v5_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v5_valid_pred_class),
    ],
    axis=1,
)
v5_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v5_results.to_pickle('0_Ergebnisse/Catboost/results/V5_220627_all_results')

## V6: Catboost only BERT embeddings

Catboost model trained only with BERT embeddings.

In [None]:
# V6: fit on the embedding columns only - categorical, numerical and text columns are ignored.
model_v6 = fit_model(
    train_three_data,
    test_three_data,
    ignored_features=cat_columns + num_columns + text_columns,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='CPU',
)



In [None]:
#Save catboost model
model_v6.save_model('0_Ergebnisse/Catboost/models/V6_220909_model_0.7074109721', format = "cbm")

In [None]:
# V6 on the test split: predict classes, build the report, persist the report.
v6_test_pred_class = model_v6.predict(test_three_data)

v6_test_labels, v6_test_report = get_report(
    Y_test_three,
    v6_test_pred_class,
    classes_list,
)

v6_test_report.to_pickle(
    '0_Ergebnisse/Catboost/results/V6_220909_test_report_0.7074109721'
)

In [None]:
# V6 on the validation split: predict classes and build the report.
v6_valid_pred_class = model_v6.predict(valid_three_data)

v6_valid_labels, v6_valid_report = get_report(
    Y_valid_three,
    v6_valid_pred_class,
    classes_list,
)

In [None]:
#Save report
v6_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V6_220909_valid_report_0.682156')

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v6_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v6_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v6_valid_pred_class),
    ],
    axis=1,
)
v6_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v6_results.to_pickle('0_Ergebnisse/Catboost/results/V6_220909_all_results')

## V7: Catboost text and BERT embeddings

Catboost model trained with text and BERT embeddings.

In [None]:
# V7: fit on text and embedding columns; categorical and numerical columns are ignored.
model_v7 = fit_model(
    train_three_data,
    test_three_data,
    ignored_features=cat_columns + num_columns,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='CPU',
)



In [None]:
#Save catboost model
model_v7.save_model('0_Ergebnisse/Catboost/models/V7_220909_model_0.8113570741', format = "cbm")

In [None]:
# V7 on the test split: predict classes, build the report, persist the report.
v7_test_pred_class = model_v7.predict(test_three_data)

v7_test_labels, v7_test_report = get_report(
    Y_test_three,
    v7_test_pred_class,
    classes_list,
)

v7_test_report.to_pickle(
    '0_Ergebnisse/Catboost/results/V7_220909_test_report_0.8113570741'
)

In [None]:
# V7 on the validation split: predict classes and build the report.
v7_valid_pred_class = model_v7.predict(valid_three_data)

v7_valid_labels, v7_valid_report = get_report(
    Y_valid_three,
    v7_valid_pred_class,
    classes_list,
)

In [None]:
#Save report
v7_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V7_220909_valid_report_0.803903')

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v7_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v7_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v7_valid_pred_class),
    ],
    axis=1,
)
v7_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v7_results.to_pickle('0_Ergebnisse/Catboost/results/V7_220909_all_results')

## V8: Catboost with cat/num/text without class_weights, but with feature selection  

Catboost model trained after feature selection.




Create dataset just with cat/num features

In [None]:
# Pools for feature selection on the 3-digit accident type: the "Embedding" and
# "Description" columns are removed, every other column is kept.
X = X_train_three_all_pre.drop(columns=["Embedding", "Description"])
Xtest = X_test_three_all_pre.drop(columns=["Embedding", "Description"])
Xvalid = X_valid_three_all_pre.drop(columns=["Embedding", "Description"])

# Categorical features are passed as column positions; names from cat_columns
# that are not present in the frame are skipped.
train_three_data_selection = Pool(
    X,
    label=Y_train_three,
    cat_features=[X.columns.get_loc(name) for name in cat_columns if name in X.columns],
)

test_three_data_selection = Pool(
    Xtest,
    label=Y_test_three,
    cat_features=[Xtest.columns.get_loc(name) for name in cat_columns if name in Xtest.columns],
)

valid_three_data_selection = Pool(
    Xvalid,
    label=Y_valid_three,
    cat_features=[Xvalid.columns.get_loc(name) for name in cat_columns if name in Xvalid.columns],
)

Function to perform feature selection

In [None]:
from catboost import CatBoostClassifier, EShapCalcType, EFeaturesSelectionAlgorithm

#Reference: https://catboost.ai/en/docs/concepts/python-reference_catboost_select_features

def select_features(algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    """Run CatBoost feature selection with the given algorithm and return its summary.

    A fresh 2000-iteration GPU classifier (seed 0) selects among
    cat_columns + num_columns down to a single feature, using the module-level
    train_three_data_selection / test_three_data_selection pools. No final
    model is trained; the selection plot is shown.

    Args:
        algorithm: Elimination strategy handed to CatBoost.
        steps: Number of elimination steps.

    Returns:
        The summary returned by CatBoostClassifier.select_features.
    """
    print('Algorithm:', algorithm)

    selector = CatBoostClassifier(iterations=2000, random_seed=0, task_type="GPU")
    selection_options = dict(
        eval_set=test_three_data_selection,
        features_for_select=cat_columns + num_columns,
        num_features_to_select=1,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Approximate,  # alternative: EShapCalcType.Regular
        train_final_model=False,
        logging_level='Silent',
        plot=True,
    )
    summary = selector.select_features(train_three_data_selection, **selection_options)

    print('Selected features:', summary['selected_features_names'])
    return summary

Perform feature selection with the following parameters:


*   RecursiveByLossFunctionChange
*   Steps = 10
*   Features for select: cat_columns + num_columns
*   Num features to select = 1
*   Train final model = False
*   task type = GPU



In [None]:
#https://catboost.ai/en/docs/concepts/python-reference_catboost_select_features




test_cat_num_stepsten = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange, steps=10)


#Conclusion, which features should be contained:
#CollisionType
#CauseP1
#Participant2
#Sidewalk

In [None]:
#Print results
# A bare expression is only displayed when it is the last statement of a
# notebook cell; print() makes the summary visible here as well.
print(test_cat_num_stepsten)

#Save results
# os.path.abspath already returns a str, so no extra conversion is needed.
abspath = os.path.abspath('0_Ergebnisse/Catboost/220913_FS_LossChange_10steps_cat_num.pkl')
with open(abspath, 'wb') as handle:
  pickle.dump(test_cat_num_stepsten, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Feature Selection based on PredictionValueChange

test_cat_num_stepsten_predchange = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByPredictionValuesChange, steps=10)

In [None]:
# Show the PredictionValuesChange selection summary ...
print(test_cat_num_stepsten_predchange)

# ... and pickle it next to the other feature-selection results.
abspath = os.path.abspath('0_Ergebnisse/Catboost/220913_FS_PredChange_10steps_cat_num.pkl')
with open(abspath, 'wb') as handle:
  pickle.dump(
      test_cat_num_stepsten_predchange,
      handle,
      protocol = pickle.HIGHEST_PROTOCOL,
  )

{'selected_features': [], 'eliminated_features_names': ['SpeedLimit', 'Weather', 'Drugs', 'Medicines', 'Cause', 'Participant1', 'Alcohol', 'HitAndRun', 'Weekday', 'Month', 'InjuryP2', 'Rural', 'LightCondition', 'RoadCondition', 'InjuryP1', 'TrafficLightOn', 'Sidewalk', 'Obstacle', 'CyclePath', 'TextLength', 'AgeP1', 'CauseP2', 'Urban', 'AgeP2', 'Participant2', 'TrafficLightOff', 'StreetClass', 'PropertyDamage', 'CauseP1', 'CollisionType'], 'loss_graph': {'main_indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'removed_features_count': [0, 9, 15, 19, 22, 25, 26, 27, 28, 29, 30], 'loss_values': [0.8955511278731381, 0.8893087511900288, 0.8832701396597197, 0.9062516727219562, 0.8785402704931249, 0.9155735785195508, 0.9332283776693718, 0.9373888063938385, 0.9512595106339121, 1.2786623998275644, 2.1342190890985417]}, 'eliminated_features': [28, 7, 14, 15, 18, 19, 13, 16, 1, 0, 23, 3, 6, 5, 20, 11, 10, 8, 9, 33, 29, 24, 2, 30, 22, 12, 4, 31, 21, 17], 'selected_features_names': []}


In [None]:
#Feature Selection based on ShapValueChange

#Most important features
#CollisionType
#CauseP1
#Participant2
#CyclePath
#(TrafficLightOn)

test_cat_num_stepsten_shapchange = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=10)

In [None]:
# Show the ShapValues selection summary ...
print(test_cat_num_stepsten_shapchange)

# ... and pickle it next to the other feature-selection results.
abspath = os.path.abspath('0_Ergebnisse/Catboost/220913_FS_ShapChange_10steps_cat_num.pkl')
with open(abspath, 'wb') as handle:
  pickle.dump(
      test_cat_num_stepsten_shapchange,
      handle,
      protocol = pickle.HIGHEST_PROTOCOL,
  )

Test selected features on new model

In [None]:
# Pools for the 3-digit accident type restricted to the columns chosen after
# feature selection (loss-value-change run). Only the listed columns are kept.
#
# Variants tried, from the widest to the narrowest set:
#features = ["CollisionType", "CauseP1", "Participant2", "Sidewalk", "TrafficLightOn", "PropertyDamage", "TrafficLightOff","HitAndRun", "Description"]
#features = ["CollisionType", "CauseP1", "Participant2", "Sidewalk", "TrafficLightOn", "PropertyDamage", "TrafficLightOff", "Description"]
#features = ["CollisionType", "CauseP1", "Participant2", "Sidewalk", "TrafficLightOn", "PropertyDamage", "Description"]
#features = ["CollisionType", "CauseP1", "Participant2", "Sidewalk", "TrafficLightOn", "Description"]
#features = ["CollisionType", "CauseP1", "Participant2",  "Description"]
#features = ["CollisionType", "CauseP1", "Description"]
#features = ["CollisionType", "Description"]
# Active set:
features = ["CollisionType", "CauseP1", "Participant2", "Sidewalk", "Description"]

Xtrain_fs = X_train_three_all_pre[features]
Xtest_fs = X_test_three_all_pre[features]
Xvalid_fs = X_valid_three_all_pre[features]

# Categorical columns are passed by position (only those present in the
# reduced frame); "Description" is the single text feature.
train_three_data_fs = Pool(
    Xtrain_fs,
    label=Y_train_three,
    cat_features=[Xtrain_fs.columns.get_loc(name) for name in cat_columns if name in Xtrain_fs.columns],
    text_features=["Description"],
)

test_three_data_fs = Pool(
    Xtest_fs,
    label=Y_test_three,
    cat_features=[Xtest_fs.columns.get_loc(name) for name in cat_columns if name in Xtest_fs.columns],
    text_features=["Description"],
)

valid_three_data_fs = Pool(
    Xvalid_fs,
    label=Y_valid_three,
    cat_features=[Xvalid_fs.columns.get_loc(name) for name in cat_columns if name in Xvalid_fs.columns],
    text_features=["Description"],
)

In [None]:
# V3a configuration refitted on the feature-selected pools. No ignored_features
# are passed: the *_fs pools are built from the selected columns only.
model_v3a_fs = fit_model(
    train_three_data_fs,
    test_three_data_fs,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
#Save catboost model
model_v3a_fs.save_model('0_Ergebnisse/Catboost/models/V3afs_220914_model_0.8363811357', format = "cbm")

In [None]:
# V3a (feature-selected) on the test split: predict classes, build the report, persist the report.
v3afs_test_pred_class = model_v3a_fs.predict(test_three_data_fs)

v3afs_test_labels, v3afs_test_report = get_report(
    Y_test_three,
    v3afs_test_pred_class,
    classes_list,
)

v3afs_test_report.to_pickle(
    '0_Ergebnisse/Catboost/results/V3afs_220914_test_report_0.8363811357'
)

In [None]:
# V3a (feature-selected) on the validation split: predict classes and build the report.
v3afs_valid_pred_class = model_v3a_fs.predict(valid_three_data_fs)

v3afs_valid_labels, v3afs_valid_report = get_report(
    Y_valid_three,
    v3afs_valid_pred_class,
    classes_list,
)

In [None]:
#Save report
v3afs_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V3afs_220914_valid_report_0.799123')

In [None]:
print(v3afs_valid_report)

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v3afs_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v3afs_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v3afs_valid_pred_class),
    ],
    axis=1,
)
v3afs_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v3afs_results.to_pickle('0_Ergebnisse/Catboost/results/V3afs_220914_all_results')

In [None]:
print(v3afs_results)

Shap values

In [None]:
#Load existing model
#https://www.youtube.com/watch?v=ZkIxZ5xlMuI

model_v3 = CatBoostClassifier()

model_v3.load_model('0_Ergebnisse/Catboost/models/V3_220627_model_0.8392685274', format='cbm')

<catboost.core.CatBoostClassifier at 0x7fe72f2a86d0>

In [None]:
# Baseline model for the SHAP analysis, fitted on the feature-selection pools
# (train_three_data_selection / test_three_data_selection), i.e. without the
# "Embedding" and "Description" columns.
model_cat_features = fit_model(
    train_three_data_selection,
    test_three_data_selection,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='GPU',
)

In [None]:
!pip install shap

import shap
shap.initjs()

In [None]:
# Build a SHAP tree explainer for the model fitted on the embedding-free,
# description-free pools.
explainer = shap.TreeExplainer(model_cat_features)
# Earlier variant that also dropped the numerical columns:
#shap_values = explainer.shap_values(Pool(X_train_three_all_pre.drop(["Embedding", "Description"]+num_columns, axis = 1), Y_train_three,cat_features = cat_columns))
# SHAP values for the training frame X; here the categorical features are
# passed as cat_columns directly rather than as column positions.
# NOTE(review): `shap_values` is reassigned in the following cell from
# model_v3.get_feature_importance - confirm which result is meant to be kept.
shap_values = explainer.shap_values(Pool(X, Y_train_three, cat_features = cat_columns))

In [None]:
#Shap Values

shap_values = model_v3.get_feature_importance(train_three_data, type = "ShapValues")

In [None]:
shap_values

## V8b: Catboost with cat/num/text with class_weights, and selected features

Catboost model trained with selected features (see experiment V8) and class weights (comparable to V3b)


In [None]:
# Balanced class weights for the feature-selected experiment (V8b).
from sklearn.utils import class_weight

# 'balanced' per-class weights computed from the 3-digit training labels.
class_weights_three_fs = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(Y_train_three),
    y=Y_train_three,
)

# Pair every distinct label with its weight; this dict is what the V8b model
# cell passes on as `class_weights`.
a = list(np.unique(Y_train_three))
b = class_weights_three_fs
class_weights_three_catboost_fs = dict(zip(a, b))

class_weights_three_catboost_fs

In [None]:
# V8b: feature-selected pools plus balanced class weights. No ignored_features
# are passed: the *_fs pools are built from the selected columns only.
model_v8b = fit_model(
    train_three_data_fs,
    test_three_data_fs,
    class_weights=class_weights_three_catboost_fs,
    iterations=1700,
    # The learning rate depends on the number of iterations for multiclass
    # and can be overridden here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
#Save catboost model
model_v8b.save_model('0_Ergebnisse/Catboost/models/V8b_220914_model_0.5362665832', format = "cbm")

In [None]:
# V8b on the test split: predict classes, build the report, persist the report.
v8b_test_pred_class = model_v8b.predict(test_three_data_fs)

v8b_test_labels, v8b_test_report = get_report(
    Y_test_three,
    v8b_test_pred_class,
    classes_list,
)

v8b_test_report.to_pickle(
    '0_Ergebnisse/Catboost/results/V8b_220914_test_report_0.5362665832'
)

In [None]:
# V8b on the validation split: predict classes and build the report.
v8b_valid_pred_class = model_v8b.predict(valid_three_data_fs)

v8b_valid_labels, v8b_valid_report = get_report(
    Y_valid_three,
    v8b_valid_pred_class,
    classes_list,
)

In [None]:
#Save report
v8b_valid_report.to_pickle('0_Ergebnisse/Catboost/results/V8b_220914_valid_report_0.728589')

In [None]:
#Save prediction results
# pd.concat(axis=1) aligns rows by index label. The label objects may carry the
# index of the frame they were taken from, while the prediction frames
# (presumably built from numpy arrays) get a fresh 0..n-1 index, so the label
# index is reset first to pair rows strictly by position.
# NOTE: if the test and valid splits differ in length, the shorter pair of
# columns is padded with NaN.
v8b_results = pd.concat(
    [
        pd.DataFrame(Y_test_three).reset_index(drop=True),
        pd.DataFrame(v8b_test_pred_class),
        pd.DataFrame(Y_valid_three).reset_index(drop=True),
        pd.DataFrame(v8b_valid_pred_class),
    ],
    axis=1,
)
v8b_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

v8b_results.to_pickle('0_Ergebnisse/Catboost/results/V8b_220914_all_results')

# Catboost: Experiments with LCPN design



## Dataset preparation for hierarchical classification

Adjust here, if features should be selected or not:

In [None]:
# Switch for the hierarchical experiments: either restrict the frames to a
# list of selected columns or keep them unchanged.

# To train on the selected features only, use this list instead:
#features = ["CollisionType", "CauseP1", "Participant2", "Sidewalk", "Description"]
# Empty list: the *_all_pre frames are used as they are.
# NOTE(review): the original remark here read "all features except embeddings",
# but this cell does not drop any column itself - confirm against the pooling cell.
features = []

if features:
  # A selection was given: keep only those columns.
  Xtrain_fs = X_train_three_all_pre[features]
  Xtest_fs = X_test_three_all_pre[features]
  Xvalid_fs = X_valid_three_all_pre[features]
else:
  # No selection: pass the frames through untouched.
  Xtrain_fs = X_train_three_all_pre
  Xtest_fs = X_test_three_all_pre
  Xvalid_fs = X_valid_three_all_pre

In [None]:
Xvalid_fs

In [None]:


####
#Decode 2AT
# Encoded AccidentType labels that belong to each group 20..29 (presumably the
# 2-digit parent accident type of the 3-digit labels - confirm against the
# label encoding).
twenty = [0, 1, 2, 3, 4]
twentyone = [5, 6, 7, 8, 9, 10]
twentytwo = [11, 12, 13, 14, 15, 16]
twentythree = [17, 18, 19, 20]
twentyfour = [21, 22, 23, 24, 25, 26]
twentyfive = [27, 28, 29]
twentysix = [30, 31, 32]
twentyseven = [33, 34, 35, 36, 37, 38]
twentyeight = [39, 40, 41, 42, 43, 44, 45]
twentynine = [46]

#Combine X and Y for filtering (will be split again below)
# Xtrain_fs / Xtest_fs / Xvalid_fs are either the full or the feature-selected
# frames, depending on the `features` switch set earlier.
XY_train_three_all_pre = pd.concat([Xtrain_fs, Y_train_three], axis = 1)
XY_test_three_all_pre = pd.concat([Xtest_fs, Y_test_three], axis = 1)
XY_valid_three_all_pre = pd.concat([Xvalid_fs, Y_valid_three], axis = 1)


def _split_accident_group(xy, type_codes):
  """Return (XY, X, Y) for the rows of `xy` whose AccidentType is in `type_codes`.

  XY is the filtered frame, X the same rows without the AccidentType column
  and Y the AccidentType column of those rows.
  """
  group = xy[xy['AccidentType'].isin(type_codes)]
  return group, group.drop(["AccidentType"], axis = 1), group.AccidentType


#Filter for the corresponding accident types
#Category 20
XY_train_20, X_train_20, Y_train_20 = _split_accident_group(XY_train_three_all_pre, twenty)
XY_test_20, X_test_20, Y_test_20 = _split_accident_group(XY_test_three_all_pre, twenty)
XY_valid_20, X_valid_20, Y_valid_20 = _split_accident_group(XY_valid_three_all_pre, twenty)

#Category 21
XY_train_21, X_train_21, Y_train_21 = _split_accident_group(XY_train_three_all_pre, twentyone)
XY_test_21, X_test_21, Y_test_21 = _split_accident_group(XY_test_three_all_pre, twentyone)
XY_valid_21, X_valid_21, Y_valid_21 = _split_accident_group(XY_valid_three_all_pre, twentyone)

#Category 22
XY_train_22, X_train_22, Y_train_22 = _split_accident_group(XY_train_three_all_pre, twentytwo)
XY_test_22, X_test_22, Y_test_22 = _split_accident_group(XY_test_three_all_pre, twentytwo)
XY_valid_22, X_valid_22, Y_valid_22 = _split_accident_group(XY_valid_three_all_pre, twentytwo)

#Category 23
XY_train_23, X_train_23, Y_train_23 = _split_accident_group(XY_train_three_all_pre, twentythree)
XY_test_23, X_test_23, Y_test_23 = _split_accident_group(XY_test_three_all_pre, twentythree)
XY_valid_23, X_valid_23, Y_valid_23 = _split_accident_group(XY_valid_three_all_pre, twentythree)

#Category 24
XY_train_24, X_train_24, Y_train_24 = _split_accident_group(XY_train_three_all_pre, twentyfour)
XY_test_24, X_test_24, Y_test_24 = _split_accident_group(XY_test_three_all_pre, twentyfour)
XY_valid_24, X_valid_24, Y_valid_24 = _split_accident_group(XY_valid_three_all_pre, twentyfour)

#Category 25
XY_train_25, X_train_25, Y_train_25 = _split_accident_group(XY_train_three_all_pre, twentyfive)
XY_test_25, X_test_25, Y_test_25 = _split_accident_group(XY_test_three_all_pre, twentyfive)
XY_valid_25, X_valid_25, Y_valid_25 = _split_accident_group(XY_valid_three_all_pre, twentyfive)

#Category 26
XY_train_26, X_train_26, Y_train_26 = _split_accident_group(XY_train_three_all_pre, twentysix)
XY_test_26, X_test_26, Y_test_26 = _split_accident_group(XY_test_three_all_pre, twentysix)
XY_valid_26, X_valid_26, Y_valid_26 = _split_accident_group(XY_valid_three_all_pre, twentysix)

#Category 27
XY_train_27, X_train_27, Y_train_27 = _split_accident_group(XY_train_three_all_pre, twentyseven)
XY_test_27, X_test_27, Y_test_27 = _split_accident_group(XY_test_three_all_pre, twentyseven)
XY_valid_27, X_valid_27, Y_valid_27 = _split_accident_group(XY_valid_three_all_pre, twentyseven)

#Category 28
XY_train_28, X_train_28, Y_train_28 = _split_accident_group(XY_train_three_all_pre, twentyeight)
XY_test_28, X_test_28, Y_test_28 = _split_accident_group(XY_test_three_all_pre, twentyeight)
XY_valid_28, X_valid_28, Y_valid_28 = _split_accident_group(XY_valid_three_all_pre, twentyeight)

#Category 29
XY_train_29, X_train_29, Y_train_29 = _split_accident_group(XY_train_three_all_pre, twentynine)
XY_test_29, X_test_29, Y_test_29 = _split_accident_group(XY_test_three_all_pre, twentynine)
XY_valid_29, X_valid_29, Y_valid_29 = _split_accident_group(XY_valid_three_all_pre, twentynine)


In [None]:
X_valid_22

In [None]:
#Pooling

def _make_pool(X, Y, use_embedding = True):
  """Wrap one feature frame and its labels in a CatBoost Pool.

  X: feature frame; every name from cat_columns that is present in X is
     registered as categorical feature (by column position).
  Y: labels belonging to X.
  use_embedding: when True, "Embedding" is registered as embedding feature;
     pass False for feature selection runs (replaces commenting the
     argument out by hand in every single Pool call).

  "Description" is always registered as text feature.
  """
  pool_args = dict(
      label = Y,
      cat_features = [X.columns.get_loc(c) for c in cat_columns if c in X],
      text_features = ["Description"],
  )
  if use_embedding:
    pool_args["embedding_features"] = ["Embedding"]
  return Pool(X, **pool_args)


#Category 20
train_20 = _make_pool(X_train_20, Y_train_20)
test_20 = _make_pool(X_test_20, Y_test_20)
valid_20 = _make_pool(X_valid_20, Y_valid_20)

#Category 21
train_21 = _make_pool(X_train_21, Y_train_21)
test_21 = _make_pool(X_test_21, Y_test_21)
valid_21 = _make_pool(X_valid_21, Y_valid_21)

#Category 22
train_22 = _make_pool(X_train_22, Y_train_22)
test_22 = _make_pool(X_test_22, Y_test_22)
valid_22 = _make_pool(X_valid_22, Y_valid_22)

#Category 23
train_23 = _make_pool(X_train_23, Y_train_23)
test_23 = _make_pool(X_test_23, Y_test_23)
valid_23 = _make_pool(X_valid_23, Y_valid_23)

#Category 24
train_24 = _make_pool(X_train_24, Y_train_24)
test_24 = _make_pool(X_test_24, Y_test_24)
valid_24 = _make_pool(X_valid_24, Y_valid_24)

#Category 25
train_25 = _make_pool(X_train_25, Y_train_25)
test_25 = _make_pool(X_test_25, Y_test_25)
valid_25 = _make_pool(X_valid_25, Y_valid_25)

#Category 26
train_26 = _make_pool(X_train_26, Y_train_26)
test_26 = _make_pool(X_test_26, Y_test_26)
valid_26 = _make_pool(X_valid_26, Y_valid_26)

#Category 27
train_27 = _make_pool(X_train_27, Y_train_27)
test_27 = _make_pool(X_test_27, Y_test_27)
valid_27 = _make_pool(X_valid_27, Y_valid_27)

#Category 28
train_28 = _make_pool(X_train_28, Y_train_28)
test_28 = _make_pool(X_test_28, Y_test_28)
valid_28 = _make_pool(X_valid_28, Y_valid_28)

#Category 29
train_29 = _make_pool(X_train_29, Y_train_29)
test_29 = _make_pool(X_test_29, Y_test_29)
valid_29 = _make_pool(X_valid_29, Y_valid_29)

When using feature selection

In [None]:
#Prepare two digit data with selected features

#`features` is either a plain list of column names or the summary dict that
#select_features returns (the form that is pickled and re-loaded further down
#in this notebook). A dict cannot be used to index a DataFrame directly.
if isinstance(features, dict):
  #The selection itself runs without the text column, but the *_fs pools
  #register "Description" as text feature, so it is put back in front.
  fs_columns = ["Description"] + [c for c in features["selected_features_names"] if c != "Description"]
else:
  fs_columns = features

Xtrain_two_fs = X_train_two_all_pre[fs_columns]
Xtest_two_fs = X_test_two_all_pre[fs_columns]
Xvalid_two_fs = X_valid_two_all_pre[fs_columns]

In [None]:
#Display the reduced two-digit training frame as cell output
Xtrain_two_fs

## M1: Local classifier per parent node (LCPN)



### First parent node: Two-digit-model

#### Load data

Data without feature selection

In [None]:
#Two-digit accident type: pools built from the complete feature frames (no feature selection)
train_two_data = Pool(
    data = X_train_two_all_pre,
    label = Y_train_two,
    text_features = ["Description"],
    embedding_features = ["Embedding"],
    cat_features = [X_train_two_all_pre.columns.get_loc(col) for col in cat_columns if col in X_train_two_all_pre.columns],
)

test_two_data = Pool(
    data = X_test_two_all_pre,
    label = Y_test_two,
    text_features = ["Description"],
    embedding_features = ["Embedding"],
    cat_features = [X_test_two_all_pre.columns.get_loc(col) for col in cat_columns if col in X_test_two_all_pre.columns],
)

valid_two_data = Pool(
    data = X_valid_two_all_pre,
    label = Y_valid_two,
    text_features = ["Description"],
    embedding_features = ["Embedding"],
    cat_features = [X_valid_two_all_pre.columns.get_loc(col) for col in cat_columns if col in X_valid_two_all_pre.columns],
)

Data with feature selection

In [None]:
#Two-digit accident type: pools built from the feature-selected frames
#(text feature only; no embedding feature is registered here)
train_two_data_fs = Pool(
    data = Xtrain_two_fs,
    label = Y_train_two,
    text_features = ["Description"],
    cat_features = [Xtrain_two_fs.columns.get_loc(col) for col in cat_columns if col in Xtrain_two_fs.columns],
)

test_two_data_fs = Pool(
    data = Xtest_two_fs,
    label = Y_test_two,
    text_features = ["Description"],
    cat_features = [Xtest_two_fs.columns.get_loc(col) for col in cat_columns if col in Xtest_two_fs.columns],
)

valid_two_data_fs = Pool(
    data = Xvalid_two_fs,
    label = Y_valid_two,
    text_features = ["Description"],
    cat_features = [Xvalid_two_fs.columns.get_loc(col) for col in cat_columns if col in Xvalid_two_fs.columns],
)

#### Model 2X

In [None]:
#Basic model initialization


#Training parameters: https://catboost.ai/en/docs/references/training-parameters/




def fit_m1_two_model(train_pool, test_pool, **kwargs):
    """Build and fit the CatBoost classifier for the two-digit accident type.

    Parameters
    ----------
    train_pool
        Pool the model is fitted on.
    test_pool
        Pool handed to ``fit`` as ``eval_set``. Because ``use_best_model`` is
        enabled and the iteration-based overfitting detector is configured,
        this pool also decides which iteration is kept.
        NOTE(review): if this is the real test split, model selection sees
        test data - consider passing the validation pool instead.
    **kwargs
        Further ``CatBoostClassifier`` parameters (the notebook passes e.g.
        ``iterations``, ``learning_rate``, ``task_type``, ``class_weights``,
        ``ignored_features``). A keyword that is also one of the defaults
        below replaces that default; previously such a call failed with a
        duplicate-keyword ``TypeError``.

    Returns
    -------
    The result of ``model.fit(...)``; the notebook uses it as the fitted
    model (``save_model`` / ``predict``).
    """
    #Training parameters: https://catboost.ai/en/docs/references/training-parameters/
    params = dict(
        loss_function = "MultiClass",
        random_seed = 42,
        eval_metric = 'Accuracy',
        #################
        l2_leaf_reg = 3.0,
        ###START: MODIFIABLE###
        depth = 5,
        ###END: MODIFIABLE###
        one_hot_max_size = 0, #This means that no feature is one-hot-encoded -> default value is different from 0
        bootstrap_type = 'Bayesian',
        ###START: MODIFIABLE###
        bagging_temperature = 0.6,
        ###END: MODIFIABLE###
        sampling_frequency = 'PerTree', #Default level = PerTreeLevel
        sampling_unit = 'Object',
        grow_policy = 'SymmetricTree',
        has_time = False,
        leaf_estimation_method = 'Newton',
        fold_len_multiplier = 2,
        approx_on_full_history = False,
        auto_class_weights = 'None',
        boosting_type = 'Plain',
        boost_from_average = False,
        score_function = 'Cosine',
        ################
        ##Overfitting detection
        od_type = 'Iter',
        od_wait = 500,
        ################
        ##Text processing: tokenizer, dictionaries and feature calcers
        tokenizers = [{
            "tokenizer_id" : "Sense",
            "separator_type" : "BySense",
            "lowercasing" : "true",
            "number_process_policy": "LeaveAsIs",
            "skip_empty": 'true',
            'token_types':['Word', 'Number', 'SentenceBreak', 'ParagraphBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        }],
        dictionaries = [{
            "dictionary_id" : "BiGram",
            "gram_order" : "2",
            'min_token_occurence': '1',
            'max_dictionary_size': '150000'
        }, {
            "dictionary_id" : "Word",
            "gram_order" : "1",
            'min_token_occurence': '1',
            'max_dictionary_size': '150000'
        }],
        feature_calcers = [
            'BoW:top_tokens_count=1000',
            'BoW:dictionary_names=BiGram',
            'NaiveBayes:top_tokens_count=1000',
            'NaiveBayes:dictionary_names=Word',
            'BM25:top_tokens_count=1000',
            'BM25:dictionary_names=Word',
        ],
    )
    #Caller-supplied settings win over the defaults above
    params.update(kwargs)

    model = CatBoostClassifier(**params)

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=50,
        plot=True,
        use_best_model=True)


M1_two_model with all features

In [None]:
#Two-digit accident type: fit on the all-feature pools
#(the columns listed in emb_columns are ignored, class_weights is passed as an empty list)
model_m1_two = fit_m1_two_model(
    train_pool = train_two_data,
    test_pool = test_two_data,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = [],
    ignored_features = emb_columns,
)




In [None]:
#Write the fitted two-digit model to disk in CatBoost's "cbm" format
model_m1_two.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_two_220629_model_0.885466795',
    format = 'cbm',
)

M1_two_model with all features and with class weights (cw)

In [None]:
#Two-digit accident type: fit on the all-feature pools with class weights
#taken from get_weights(Y_train_two); the columns in emb_columns are ignored
model_m1_two_cw = fit_m1_two_model(
    train_pool = train_two_data,
    test_pool = test_two_data,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_two),
    ignored_features = emb_columns,
)




In [None]:
#Write the class-weighted two-digit model to disk in CatBoost's "cbm" format
model_m1_two_cw.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_two_cw_220916_model_0.6996135353',
    format = 'cbm',
)

M1_two_model with feature selection (fs)

In [None]:
#Select the most suitable features

from catboost import CatBoostClassifier, EShapCalcType, EFeaturesSelectionAlgorithm


def select_features_lcpn(featurenumber, Xtrain, Ytrain, Xtest, Ytest, algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    """Run CatBoost feature selection on the tabular columns of one node.

    Parameters
    ----------
    featurenumber
        Passed on as ``num_features_to_select``.
    Xtrain, Ytrain
        Training features / labels; Xtrain is not modified (``drop`` returns
        a new frame).
    Xtest, Ytest
        Features / labels used as ``eval_set`` during the selection.
    algorithm
        Selection algorithm handed to ``select_features``.
    steps
        Passed on as ``steps``.

    Returns
    -------
    The summary object returned by ``model.select_features``; its
    ``'selected_features_names'`` entry is printed.
    """
    print('Algorithm:', algorithm)

    #Text and embedding columns do not take part in the selection.
    #errors="ignore": the notebook later rebinds the X_* frames to subsets that
    #no longer contain "Embedding"; re-running the selection on such a frame
    #used to fail with a KeyError.
    non_tabular = ["Embedding", "Description"]
    Xtrain = Xtrain.drop(non_tabular, axis = 1, errors = "ignore")
    Xtest = Xtest.drop(non_tabular, axis = 1, errors = "ignore")

    #Only offer columns that really exist in the frame (same guard as for cat_features)
    candidates = [c for c in cat_columns + num_columns if c in Xtrain]

    train_pool = Pool(Xtrain, Ytrain, cat_features = [Xtrain.columns.get_loc(c) for c in cat_columns if c in Xtrain])
    eval_pool = Pool(Xtest, Ytest, cat_features = [Xtest.columns.get_loc(c) for c in cat_columns if c in Xtest])

    #Build model
    model = CatBoostClassifier(iterations=2000, random_seed=0,  task_type = "GPU")
    summary = model.select_features(
        #Training features
        train_pool,
        #Evaluation features
        eval_set=eval_pool,
        #Features to be selected
        features_for_select = candidates,
        #Number of features to be selected
        num_features_to_select= featurenumber,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Approximate, #alternative: EShapCalcType.Regular
        train_final_model = False, #only the selection summary is needed, no final model
        logging_level='Silent',
        plot=True,
    )
    print('Selected features:', summary['selected_features_names'])
    return summary




In [None]:
#Perform feature selection for the two-digit model (at most four features)
#Most important features noted for this run:
#CollisionType
#CauseP1
#Participant2
#Sidewalk

#The number of features (first parameter of select_features_lcpn) was missing,
#which shifted every positional argument and made the call fail.
#NOTE(review): the frames are the *_three_all_pre ones while the labels are the
#two-digit labels; the subsetting cell below uses X_train_two_all_pre - confirm
#that both frames hold the same rows/columns.
feat_m1_two = select_features_lcpn(
    4,
    X_train_three_all_pre, Y_train_two,
    X_test_three_all_pre, Y_test_two,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps=10)

In [None]:
#Reduce the two-digit frames to the text column plus the selected features and
#build the corresponding pools (rebinds train_two_data_fs / test_two_data_fs)
features_m1_two = ["Description", "CollisionType", "CauseP1", "Participant2", "Sidewalk"]

X_train_two = X_train_two_all_pre[features_m1_two]
X_test_two = X_test_two_all_pre[features_m1_two]

train_two_data_fs = Pool(
    data = X_train_two,
    label = Y_train_two,
    text_features = ["Description"],
    cat_features = [X_train_two.columns.get_loc(col) for col in cat_columns if col in X_train_two.columns],
)

test_two_data_fs = Pool(
    data = X_test_two,
    label = Y_test_two,
    text_features = ["Description"],
    cat_features = [X_test_two.columns.get_loc(col) for col in cat_columns if col in X_test_two.columns],
)

In [None]:
#Pickle the feature-selection summary of the two-digit model
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_two_220914_feature_selection')
with open(abspath, mode = 'wb') as handle:
  pickle.dump(feat_m1_two, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Read the pickled feature-selection summary of the two-digit model back in
#and show it as cell output
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_two_220914_feature_selection')
with open(abspath, mode = 'rb') as pkl:
  features = pickle.load(pkl)

features

In [None]:
#Two-digit accident type WITH feature selection: fit on the reduced pools
#(class_weights is passed as an empty list, no ignored features)
model_m1_two_fs = fit_m1_two_model(
    train_pool = train_two_data_fs,
    test_pool = test_two_data_fs,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = [],
)



In [None]:
#Write the feature-selected two-digit model to disk in CatBoost's "cbm" format
model_m1_two_fs.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_two_fs4_220916_model_0.8864292589',
    format = 'cbm',
)

In [None]:
#Predictions: Test-Data

#Predict
#model_m1_two_fs was fitted on the feature-selected pool, so it has to predict
#on the feature-selected test pool as well (test_two_data holds all features)
m1_two_test_pred_class = model_m1_two_fs.predict(test_two_data_fs)
#Report
m1_two_test_report = get_report(Y_test_two, m1_two_test_pred_class, labellist = None)
#Save
m1_two_test_report.to_pickle('0_Ergebnisse/Catboost/results/M1_two_fs_220914_test_report_0.8845043311')

In [None]:
#Predictions: Valid-Data

#Build the validation pool with the same columns the model was trained on
#(valid_two_data holds all features and does not match model_m1_two_fs)
X_valid_two = X_valid_two_all_pre[features_m1_two]
valid_two_data_fs = Pool(X_valid_two,
                  label = Y_valid_two,
                  cat_features = [X_valid_two.columns.get_loc(c) for c in cat_columns if c in X_valid_two],
                  text_features= ["Description"])

#Predict
m1_two_valid_pred_class = model_m1_two_fs.predict(valid_two_data_fs)
#Report
m1_two_valid_report = get_report(Y_valid_two, m1_two_valid_pred_class)

In [None]:
#Save the validation report of the feature-selected two-digit model as pickle
m1_two_valid_report.to_pickle('0_Ergebnisse/Catboost/results/M1_two_fs_220914_valid_report_0.841048')

In [None]:
#Save prediction results
#concat(axis = 1) aligns rows by index. The prediction arrays get a fresh
#0..n-1 index, so the label frames are reset to the same index; otherwise
#labels carrying their original row index would be paired with the wrong
#predictions (or padded with NaN).
m1_two_results = pd.concat(
    [
        pd.DataFrame(Y_test_two).reset_index(drop = True),
        pd.DataFrame(m1_two_test_pred_class),
        pd.DataFrame(Y_valid_two).reset_index(drop = True),
        pd.DataFrame(m1_two_valid_pred_class),
    ],
    axis = 1)
m1_two_results.columns =[ "Y_test", "Pred_test", "Y_valid", "Pred_valid"]

m1_two_results.to_pickle('0_Ergebnisse/Catboost/results/M1_two_fs_220914_all_results')

M1_two_model with feature selection and class weights

In [None]:
#Two-digit accident type WITH feature selection AND class weights
#NOTE(review): this rebinds model_m1_two, the name an earlier cell uses for the
#all-feature model without class weights - confirm the overwrite is intended
model_m1_two = fit_m1_two_model(
    train_pool = train_two_data_fs,
    test_pool = test_two_data_fs,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_two),
)

#Result: 14.09.2022 with feature selection and class weights
#bestTest = 0.691949208 | bestIteration = 109 | Shrink model to first iterations. 110 | approx. time for best model with GPU: 1.46s



In [None]:
#Write the feature-selected, class-weighted two-digit model to disk ("cbm" format)
model_m1_two.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_two_fs_cw_220914_model_0.691949208',
    format = 'cbm',
)

### Second parent node: Three-digit-models

#### Model 20

with all features and class weights

In [None]:
#Parent class 20: fit with class weights from get_weights(Y_train_20);
#the columns listed in emb_columns are ignored
model_m1_20 = fit_model(
    train_20,
    test_20,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_20),
    ignored_features = emb_columns,
)


In [None]:
#Write the class-weighted model for parent class 20 to disk ("cbm" format)
model_m1_20.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_20_cw_220916_model_0.7207721973',
    format = 'cbm',
)

With feature selection

In [None]:
#Feature selection for parent class 20 (at most four features)
#Features noted for this run: CollisionType, CauseP1, Participant2, CauseP2
feat_m1_20 = select_features_lcpn(
    featurenumber = 4,
    Xtrain = X_train_20,
    Ytrain = Y_train_20,
    Xtest = X_test_20,
    Ytest = Y_test_20,
    algorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps = 10,
)

In [None]:
#Reduce the class-20 frames to the text column plus the selected features and rebuild the pools
#(rebinds X_train_20 / X_test_20 and train_20 / test_20; no embedding feature is registered here)
features_m1_20 = ["Description", "CauseP1", "CollisionType", "CauseP2", "Participant2"]

X_train_20 = X_train_20[features_m1_20]
X_test_20 = X_test_20[features_m1_20]

train_20 = Pool(
    data = X_train_20,
    label = Y_train_20,
    text_features = ["Description"],
    cat_features = [X_train_20.columns.get_loc(col) for col in cat_columns if col in X_train_20.columns],
)

test_20 = Pool(
    data = X_test_20,
    label = Y_test_20,
    text_features = ["Description"],
    cat_features = [X_test_20.columns.get_loc(col) for col in cat_columns if col in X_test_20.columns],
)

In [None]:
#Pickle the feature-selection summary of parent class 20
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_20_220916_feature_selection')
with open(abspath, mode = 'wb') as handle:
  pickle.dump(feat_m1_20, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Read the pickled feature-selection summary of parent class 20 back in
#and show it as cell output
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_20_220916_feature_selection')
with open(abspath, mode = 'rb') as pkl:
  features = pickle.load(pkl)

features

In [None]:
#Parent class 20: fit on the current train_20 / test_20 pools
#(no class weights, no ignored features)
model_m1_20 = fit_model(
    train_20,
    test_20,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
)




In [None]:
#Write the feature-selected model for parent class 20 to disk ("cbm" format)
#(file name used by the previously active save line: M1_20_220628_model_0.96875)
model_m1_20.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_20_fs4_220916_model_0.96875',
    format = 'cbm',
)

In [None]:
#Restore a stored model for parent class 20 from disk
#NOTE(review): this reads the 220628 file, whereas the save cell right before
#it writes the fs4_220916 file - confirm which model is meant to be loaded
model_m1_20 = CatBoostClassifier()
model_m1_20.load_model(
    fname = '0_Ergebnisse/Catboost/models/M1_20_220628_model_0.96875',
    format = 'cbm',
)

with feature selection and class weights

In [None]:
#Parent class 20: fit on the current train_20 / test_20 pools with class
#weights from get_weights(Y_train_20) (no ignored features)
model_m1_20 = fit_model(
    train_20,
    test_20,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_20),
)




In [None]:
#Write the feature-selected, class-weighted model for parent class 20 to disk ("cbm" format)
model_m1_20.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_20_fs_cw_220914_model_0.7085018686',
    format = 'cbm',
)

#### Model 21

with feature selection

In [None]:
#Feature selection for parent class 21 (at most four features)
#Features noted for this run: Rural, Medicines, CauseP1, SpeedLimit
feat_m1_21 = select_features_lcpn(
    featurenumber = 4,
    Xtrain = X_train_21,
    Ytrain = Y_train_21,
    Xtest = X_test_21,
    Ytest = Y_test_21,
    algorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps = 10,
)

In [None]:
#Reduce the class-21 frames to the text column plus the selected features and rebuild the pools
#(rebinds X_train_21 / X_test_21 and train_21 / test_21; no embedding feature is registered here)
features_m1_21 = ["Description", "Rural", "Medicines", "CauseP1", "SpeedLimit"]

X_train_21 = X_train_21[features_m1_21]
X_test_21 = X_test_21[features_m1_21]

train_21 = Pool(
    data = X_train_21,
    label = Y_train_21,
    text_features = ["Description"],
    cat_features = [X_train_21.columns.get_loc(col) for col in cat_columns if col in X_train_21.columns],
)

test_21 = Pool(
    data = X_test_21,
    label = Y_test_21,
    text_features = ["Description"],
    cat_features = [X_test_21.columns.get_loc(col) for col in cat_columns if col in X_test_21.columns],
)

In [None]:
#Pickle the feature-selection summary of parent class 21
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_21_220916_feature_selection')
with open(abspath, mode = 'wb') as handle:
  pickle.dump(feat_m1_21, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Read the pickled feature-selection summary of parent class 21 back in
#and show it as cell output
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_21_220916_feature_selection')
with open(abspath, mode = 'rb') as pkl:
  features = pickle.load(pkl)

features

In [None]:
#Parent class 21: fit on the current train_21 / test_21 pools
#(no class weights, no ignored features)
model_m1_21 = fit_model(
    train_21,
    test_21,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
)



In [None]:
#Write the feature-selected model for parent class 21 to disk ("cbm" format)
#(file name used by the previously active save line: M1_21_220628_model_0.9878934625)
model_m1_21.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_21_fs4_220916_model_0.985472155',
    format = 'cbm',
)

with all features and class weights

In [None]:
#Parent class 21: fit with class weights from get_weights(Y_train_21);
#the columns listed in emb_columns are ignored
#NOTE(review): an earlier cell rebuilds train_21 / test_21 from the selected
#features only - confirm the all-feature pools are in scope before running this
model_m1_21 = fit_model(
    train_21,
    test_21,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_21),
    ignored_features = emb_columns,
)




In [None]:
#Write the class-weighted model for parent class 21 to disk ("cbm" format)
model_m1_21.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_21_cw_220916_model_0.7625820956',
    format = 'cbm',
)

with feature selection and class weights

In [None]:
#Parent class 21: fit on the current train_21 / test_21 pools with class
#weights from get_weights(Y_train_21) (no ignored features)
model_m1_21 = fit_model(
    train_21,
    test_21,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_21),
)



In [None]:
#Write the feature-selected, class-weighted model for parent class 21 to disk ("cbm" format)
model_m1_21.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_21_fs_cw_220914_model_0.7505045686',
    format = 'cbm',
)

#### Model 22

with feature selection

In [None]:
#Feature selection for parent class 22 (at most four features)
#Features noted for this run: CyclePath, Sidewalk, TrafficLightOff, CollisionType
feat_m1_22 = select_features_lcpn(
    featurenumber = 4,
    Xtrain = X_train_22,
    Ytrain = Y_train_22,
    Xtest = X_test_22,
    Ytest = Y_test_22,
    algorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps = 10,
)

In [None]:
#Reduce the class-22 frames to the text column plus the selected features and rebuild the pools
#(rebinds X_train_22 / X_test_22 and train_22 / test_22; no embedding feature is registered here)
features_m1_22 = ["Description", 'CyclePath', 'Sidewalk', 'TrafficLightOff', 'CollisionType']

X_train_22 = X_train_22[features_m1_22]
X_test_22 = X_test_22[features_m1_22]

train_22 = Pool(
    data = X_train_22,
    label = Y_train_22,
    text_features = ["Description"],
    cat_features = [X_train_22.columns.get_loc(col) for col in cat_columns if col in X_train_22.columns],
)

test_22 = Pool(
    data = X_test_22,
    label = Y_test_22,
    text_features = ["Description"],
    cat_features = [X_test_22.columns.get_loc(col) for col in cat_columns if col in X_test_22.columns],
)

In [None]:
#Pickle the feature-selection summary of parent class 22
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_22_220916_feature_selection')
with open(abspath, mode = 'wb') as handle:
  pickle.dump(feat_m1_22, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Read the pickled feature-selection summary of parent class 22 back in
#and show it as cell output
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_22_220916_feature_selection')
with open(abspath, mode = 'rb') as pkl:
  features = pickle.load(pkl)

features

In [None]:
#Parent class 22: fit on the current train_22 / test_22 pools
#(no class weights, no ignored features)
model_m1_22 = fit_model(
    train_22,
    test_22,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
)



In [None]:
#Write the feature-selected model for parent class 22 to disk ("cbm" format)
#(file name used by the previously active save line: M1_22_220628_model_0.7692307692)
model_m1_22.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_22_fs4_220916_model_0.7362637363',
    format = 'cbm',
)

with all features and class weights

In [None]:
#Parent class 22: fit with class weights from get_weights(Y_train_22);
#the columns listed in emb_columns are ignored
#NOTE(review): an earlier cell rebuilds train_22 / test_22 from the selected
#features only - confirm the all-feature pools are in scope before running this
model_m1_22 = fit_model(
    train_22,
    test_22,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_22),
    ignored_features = emb_columns,
)


In [None]:
#Write the class-weighted model for parent class 22 to disk ("cbm" format)
model_m1_22.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_22_cw_220916_model_0.8403373182',
    format = 'cbm',
)

with feature selection and class weights

In [None]:
#Parent class 22: fit on the current train_22 / test_22 pools with class
#weights from get_weights(Y_train_22) (no ignored features)
model_m1_22 = fit_model(
    train_22,
    test_22,
    task_type = 'GPU',
    iterations = 1700,
    learning_rate = 0.1, #For MultiClass the learning rate depends on the number of iterations; it is set explicitly here
    class_weights = get_weights(Y_train_22),
)



In [None]:
#Write the feature-selected, class-weighted model for parent class 22 to disk ("cbm" format)
model_m1_22.save_model(
    fname = '0_Ergebnisse/Catboost/models/M1_22_fs_cw_220914_model_0.8293699808',
    format = 'cbm',
)

#### Model 23

with feature selection

In [None]:
#Feature selection for parent class 23 (at most four features)
#Features noted for this run: StreetClass, CollisionType, CauseP1, Participant2
feat_m1_23 = select_features_lcpn(
    featurenumber = 4,
    Xtrain = X_train_23,
    Ytrain = Y_train_23,
    Xtest = X_test_23,
    Ytest = Y_test_23,
    algorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps = 10,
)

In [None]:
#Reduce the class-23 frames to the text column plus the selected features and rebuild the pools
#(rebinds X_train_23 / X_test_23 and train_23 / test_23; no embedding feature is registered here)
features_m1_23 = ["Description", 'StreetClass', 'CollisionType', 'CauseP1', 'Participant2']

X_train_23 = X_train_23[features_m1_23]
X_test_23 = X_test_23[features_m1_23]

train_23 = Pool(
    data = X_train_23,
    label = Y_train_23,
    text_features = ["Description"],
    cat_features = [X_train_23.columns.get_loc(col) for col in cat_columns if col in X_train_23.columns],
)

test_23 = Pool(
    data = X_test_23,
    label = Y_test_23,
    text_features = ["Description"],
    cat_features = [X_test_23.columns.get_loc(col) for col in cat_columns if col in X_test_23.columns],
)

In [None]:
#Pickle the feature-selection summary of parent class 23
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_23_220916_feature_selection')
with open(abspath, mode = 'wb') as handle:
  pickle.dump(feat_m1_23, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Read the pickled feature-selection summary of parent class 23 back in
#and show it as cell output
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_23_220916_feature_selection')
with open(abspath, mode = 'rb') as pkl:
  features = pickle.load(pkl)

features

In [None]:
# Model 23 - feature-selection variant (no class weights): fit on train_23 / test_23.
model_m1_23 = fit_model(
    train_23,
    test_23,
    # ignored_features is deliberately not passed in this variant
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

# Result: 28.06.2022
# bestTest = 0.9741935484 | bestIteration = 422 | Shrink model to first 423 iterations. | approx. time for best model with GPU: 4,83s

In [None]:
# Store the feature-selection model 23 in "cbm" format.
# Disabled earlier target: 0_Ergebnisse/Catboost/models/M1_23_220628_model_0.9741935484
model_m1_23.save_model(
    '0_Ergebnisse/Catboost/models/M1_23_fs4_220916_model_0.9741935484',
    format="cbm",
)

with all features and class weights

In [None]:
# Model 23 - "all features and class weights" variant.
# NOTE(review): the only train_23 / test_23 pools visible in this notebook section are the
# feature-selected ones built above - confirm they were rebuilt from the full data for this run.
model_m1_23 = fit_model(
    train_23,
    test_23,
    ignored_features=emb_columns,
    class_weights=get_weights(Y_train_23),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

In [None]:
# Store the "all features + class weights" model 23 in "cbm" format.
# NOTE(review): the file name carries the 'fs_cw' tag although this cell follows the
# all-features run; the matching cells of models 24-28 use 'cw'. Name left unchanged.
model_m1_23.save_model(
    '0_Ergebnisse/Catboost/models/M1_23_fs_cw_220914_model_0.8815909068',
    format="cbm",
)

with feature selection and class weights

In [None]:
# Model 23 - feature selection combined with class weights: fit on train_23 / test_23.
model_m1_23 = fit_model(
    train_23,
    test_23,
    # ignored_features is deliberately not passed in this variant
    class_weights=get_weights(Y_train_23),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)




In [None]:
# Store the "feature selection + class weights" model 23 in "cbm" format
model_m1_23.save_model(
    '0_Ergebnisse/Catboost/models/M1_23_fs_cw_220914_model_0.8977272705',
    format="cbm",
)

#### Model 24

with feature selection

In [None]:
# Feature selection for model 24: select a maximum of 4 features.
# Features listed for this model: StreetClass, CollisionType, Participant2, CauseP2
feat_m1_24 = select_features_lcpn(
    4,
    X_train_24, Y_train_24,
    X_test_24, Y_test_24,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps=10,
)

In [None]:
# Model 24: keep the text column plus the best features, then build the CatBoost pools.
features_m1_24 = ["Description", 'StreetClass', 'CollisionType', 'Participant2', 'CauseP2']

# NOTE: X_train_24 / X_test_24 are overwritten with the reduced frames.
X_train_24 = X_train_24[features_m1_24]
X_test_24 = X_test_24[features_m1_24]

train_24 = Pool(
    X_train_24,
    label=Y_train_24,
    # positional indices of the categorical columns still present after subsetting
    cat_features=[X_train_24.columns.get_loc(col) for col in cat_columns if col in X_train_24],
    text_features=["Description"],
)

test_24 = Pool(
    X_test_24,
    label=Y_test_24,
    cat_features=[X_test_24.columns.get_loc(col) for col in cat_columns if col in X_test_24],
    text_features=["Description"],
)

In [None]:
# Persist the feature-selection result of model 24 as a pickle file
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_24_220916_feature_selection')
with open(abspath, 'wb') as out_file:
  pickle.dump(feat_m1_24, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the stored feature-selection result of model 24 and display it
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_24_220916_feature_selection')
with open(abspath, 'rb') as in_file:
  features = pickle.load(in_file)

features

In [None]:
# Model 24 - feature-selection variant (no class weights): fit on train_24 / test_24.
model_m1_24 = fit_model(
    train_24,
    test_24,
    # ignored_features is deliberately not passed in this variant
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)




In [None]:
# Store the feature-selection model 24 in "cbm" format.
# Disabled earlier target: 0_Ergebnisse/Catboost/models/M1_24_220628_model_0.6891891892
# NOTE(review): the date stamp '220016' looks like a typo for '220916' (used by the other
# fs4 artifacts); left unchanged so the name keeps matching any file already written.
model_m1_24.save_model(
    '0_Ergebnisse/Catboost/models/M1_24_fs4_220016_model_0.7702702703',
    format="cbm",
)

with all features and class weights

In [None]:
# Model 24 - "all features and class weights" variant.
# NOTE(review): the only train_24 / test_24 pools visible in this notebook section are the
# feature-selected ones built above - confirm they were rebuilt from the full data for this run.
model_m1_24 = fit_model(
    train_24,
    test_24,
    ignored_features=emb_columns,
    class_weights=get_weights(Y_train_24),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

In [None]:
# Store the "all features + class weights" model 24 in "cbm" format
model_m1_24.save_model(
    '0_Ergebnisse/Catboost/models/M1_24_cw_220916_model_0.5860650638',
    format="cbm",
)

with feature selection and class weights

In [None]:
# Model 24 - feature selection combined with class weights: fit on train_24 / test_24.
model_m1_24 = fit_model(
    train_24,
    test_24,
    # ignored_features is deliberately not passed in this variant
    class_weights=get_weights(Y_train_24),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)




In [None]:
# Store the "feature selection + class weights" model 24 in "cbm" format
model_m1_24.save_model(
    '0_Ergebnisse/Catboost/models/M1_24_fs_cw_220914_model_0.6505734715',
    format="cbm",
)

#### Model 25

with feature selection

In [None]:
# Feature selection for model 25: select a maximum of 4 features.
# Features listed for this model: RoadCondition, CollisionType, SpeedLimit, TextLength
feat_m1_25 = select_features_lcpn(
    4,
    X_train_25, Y_train_25,
    X_test_25, Y_test_25,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps=10,
)

In [None]:
# Model 25: keep the text column plus the best features, then build the CatBoost pools.
features_m1_25 = ["Description", 'RoadCondition', 'CollisionType', 'SpeedLimit', 'TextLength']

# NOTE: X_train_25 / X_test_25 are overwritten with the reduced frames.
X_train_25 = X_train_25[features_m1_25]
X_test_25 = X_test_25[features_m1_25]

train_25 = Pool(
    X_train_25,
    label=Y_train_25,
    # positional indices of the categorical columns still present after subsetting
    cat_features=[X_train_25.columns.get_loc(col) for col in cat_columns if col in X_train_25],
    text_features=["Description"],
)

test_25 = Pool(
    X_test_25,
    label=Y_test_25,
    cat_features=[X_test_25.columns.get_loc(col) for col in cat_columns if col in X_test_25],
    text_features=["Description"],
)

In [None]:
# Persist the feature-selection result of model 25 as a pickle file
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_25_220916_feature_selection')
with open(abspath, 'wb') as out_file:
  pickle.dump(feat_m1_25, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the stored feature-selection result of model 25 and display it
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_25_220916_feature_selection')
with open(abspath, 'rb') as in_file:
  features = pickle.load(in_file)

features

In [None]:
# Model 25 - feature-selection variant (no class weights): fit on train_25 / test_25.
model_m1_25 = fit_model(
    train_25,
    test_25,
    # ignored_features is deliberately not passed in this variant
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
# Store the feature-selection model 25 in "cbm" format.
# Disabled earlier target: 0_Ergebnisse/Catboost/models/M1_25_220628_model_1.0
model_m1_25.save_model(
    '0_Ergebnisse/Catboost/models/M1_25_fs4_220916_model_0.5',
    format="cbm",
)

with all features and class weights

In [None]:
# Model 25 - "all features and class weights" variant.
# NOTE(review): the only train_25 / test_25 pools visible in this notebook section are the
# feature-selected ones built above - confirm they were rebuilt from the full data for this run.
model_m1_25 = fit_model(
    train_25,
    test_25,
    ignored_features=emb_columns,
    class_weights=get_weights(Y_train_25),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

In [None]:
# Store the "all features + class weights" model 25 in "cbm" format
model_m1_25.save_model(
    '0_Ergebnisse/Catboost/models/M1_25_cw_220916_model_1',
    format="cbm",
)

with feature selection and class weights

In [None]:
# Model 25 - feature selection combined with class weights: fit on train_25 / test_25.
model_m1_25 = fit_model(
    train_25,
    test_25,
    # ignored_features is deliberately not passed in this variant
    class_weights=get_weights(Y_train_25),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

# Result: (not recorded)

In [None]:
# Store the "feature selection + class weights" model 25 in "cbm" format
model_m1_25.save_model(
    '0_Ergebnisse/Catboost/models/M1_25_fs_cw_220914_model_1',
    format="cbm",
)

#### Model 26

with feature selection

In [None]:
# Feature selection for model 26: select a maximum of 4 features.
# Features listed for this model: Month, StreetClass, Weather, CauseP1
feat_m1_26 = select_features_lcpn(
    4,
    X_train_26, Y_train_26,
    X_test_26, Y_test_26,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps=10,
)

In [None]:
# Model 26: keep the text column plus the best features, then build the CatBoost pools.
features_m1_26 = ["Description", "Month", "StreetClass", "Weather", "CauseP1"]

# NOTE: X_train_26 / X_test_26 are overwritten with the reduced frames.
X_train_26 = X_train_26[features_m1_26]
X_test_26 = X_test_26[features_m1_26]

train_26 = Pool(
    X_train_26,
    label=Y_train_26,
    # positional indices of the categorical columns still present after subsetting
    cat_features=[X_train_26.columns.get_loc(col) for col in cat_columns if col in X_train_26],
    text_features=["Description"],
)

test_26 = Pool(
    X_test_26,
    label=Y_test_26,
    cat_features=[X_test_26.columns.get_loc(col) for col in cat_columns if col in X_test_26],
    text_features=["Description"],
)

In [None]:
# Persist the feature-selection result of model 26 as a pickle file
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_26_220916_feature_selection')
with open(abspath, 'wb') as out_file:
  pickle.dump(feat_m1_26, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the stored feature-selection result of model 26 and display it
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_26_220916_feature_selection')
with open(abspath, 'rb') as in_file:
  features = pickle.load(in_file)

features

In [None]:
# Model 26 - feature-selection variant (no class weights): fit on train_26 / test_26.
model_m1_26 = fit_model(
    train_26,
    test_26,
    # ignored_features is deliberately not passed in this variant
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
# Store the feature-selection model 26 in "cbm" format.
# Disabled earlier target: 0_Ergebnisse/Catboost/models/M1_26_220628_model_0.9473684211
model_m1_26.save_model(
    '0_Ergebnisse/Catboost/models/M1_26_fs4_220916_model_0.9473684211',
    format="cbm",
)

with all features and class weights

In [None]:
# Model 26 - "all features and class weights" variant.
# NOTE(review): the only train_26 / test_26 pools visible in this notebook section are the
# feature-selected ones built above - confirm they were rebuilt from the full data for this run.
model_m1_26 = fit_model(
    train_26,
    test_26,
    ignored_features=emb_columns,
    class_weights=get_weights(Y_train_26),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

In [None]:
# Store the "all features + class weights" model 26 in "cbm" format
model_m1_26.save_model(
    '0_Ergebnisse/Catboost/models/M1_26_cw_220916_model_0.7754318618',
    format="cbm",
)

with feature selection and class weights

In [None]:
# Model 26 - feature selection combined with class weights: fit on train_26 / test_26.
model_m1_26 = fit_model(
    train_26,
    test_26,
    # ignored_features is deliberately not passed in this variant
    class_weights=get_weights(Y_train_26),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

# Result: (not recorded)

In [None]:
# Store the "feature selection + class weights" model 26 in "cbm" format
model_m1_26.save_model(
    '0_Ergebnisse/Catboost/models/M1_26_fs_cw_220914_model_0.7428023032',
    format="cbm",
)

#### Model 27

with feature selection

In [None]:
# Feature selection for model 27: select a maximum of 4 features.
# Features listed for this model: CyclePath, Sidewalk, HitAndRun, CollisionType
feat_m1_27 = select_features_lcpn(
    4,
    X_train_27, Y_train_27,
    X_test_27, Y_test_27,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps=10,
)

In [None]:
# Model 27: keep the text column plus the best features, then build the CatBoost pools.
features_m1_27 = ["Description", 'CyclePath', 'Sidewalk', 'HitAndRun', 'CollisionType']

# NOTE: X_train_27 / X_test_27 are overwritten with the reduced frames.
X_train_27 = X_train_27[features_m1_27]
X_test_27 = X_test_27[features_m1_27]

train_27 = Pool(
    X_train_27,
    label=Y_train_27,
    # positional indices of the categorical columns still present after subsetting
    cat_features=[X_train_27.columns.get_loc(col) for col in cat_columns if col in X_train_27],
    text_features=["Description"],
)

test_27 = Pool(
    X_test_27,
    label=Y_test_27,
    cat_features=[X_test_27.columns.get_loc(col) for col in cat_columns if col in X_test_27],
    text_features=["Description"],
)

In [None]:
# Persist the feature-selection result of model 27 as a pickle file
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_27_220916_feature_selection')
with open(abspath, 'wb') as out_file:
  pickle.dump(feat_m1_27, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the stored feature-selection result of model 27 and display it
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_27_220916_feature_selection')
with open(abspath, 'rb') as in_file:
  features = pickle.load(in_file)

features

In [None]:
# Model 27 - feature-selection variant (no class weights): fit on train_27 / test_27.
model_m1_27 = fit_model(
    train_27,
    test_27,
    # ignored_features is deliberately not passed in this variant
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
# Store the feature-selection model 27 in "cbm" format.
# Disabled earlier target: 0_Ergebnisse/Catboost/models/M1_27_220628_model_1.0
model_m1_27.save_model(
    '0_Ergebnisse/Catboost/models/M1_27_fs4_220916_model_1.0',
    format="cbm",
)

with all features and class weights

In [None]:
# Model 27 - "all features and class weights" variant.
# NOTE(review): the only train_27 / test_27 pools visible in this notebook section are the
# feature-selected ones built above - confirm they were rebuilt from the full data for this run.
model_m1_27 = fit_model(
    train_27,
    test_27,
    ignored_features=emb_columns,
    class_weights=get_weights(Y_train_27),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

In [None]:
# Store the "all features + class weights" model 27 in "cbm" format
model_m1_27.save_model(
    '0_Ergebnisse/Catboost/models/M1_27_cw_220916_model_1',
    format="cbm",
)

with feature selection and class weights

In [None]:
# Model 27 - feature selection combined with class weights: fit on train_27 / test_27.
model_m1_27 = fit_model(
    train_27,
    test_27,
    # ignored_features is deliberately not passed in this variant
    class_weights=get_weights(Y_train_27),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)




In [None]:
# Store the "feature selection + class weights" model 27 in "cbm" format
model_m1_27.save_model(
    '0_Ergebnisse/Catboost/models/M1_27_fs_cw_220914_model_1',
    format="cbm",
)

#### Model 28

with feature selection

In [None]:
# Feature selection for model 28: select a maximum of 4 features.
# Features listed for this model: CollisionType, Participant2, InjuryP2, PropertyDamage
feat_m1_28 = select_features_lcpn(
    4,
    X_train_28, Y_train_28,
    X_test_28, Y_test_28,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps=10,
)

In [None]:
# Model 28: keep the text column plus the best features, then build the CatBoost pools.
features_m1_28 = ["Description", 'CollisionType', 'Participant2', 'InjuryP2', 'PropertyDamage']

# NOTE: X_train_28 / X_test_28 are overwritten with the reduced frames.
X_train_28 = X_train_28[features_m1_28]
X_test_28 = X_test_28[features_m1_28]

train_28 = Pool(
    X_train_28,
    label=Y_train_28,
    # positional indices of the categorical columns still present after subsetting
    cat_features=[X_train_28.columns.get_loc(col) for col in cat_columns if col in X_train_28],
    text_features=["Description"],
)

test_28 = Pool(
    X_test_28,
    label=Y_test_28,
    cat_features=[X_test_28.columns.get_loc(col) for col in cat_columns if col in X_test_28],
    text_features=["Description"],
)

In [None]:
# Persist the feature-selection result of model 28 as a pickle file
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_28_220916_feature_selection')
with open(abspath, 'wb') as out_file:
  pickle.dump(feat_m1_28, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the stored feature-selection result of model 28 and display it
abspath = os.path.abspath('0_Ergebnisse/Catboost/results/M1_28_220916_feature_selection')
with open(abspath, 'rb') as in_file:
  features = pickle.load(in_file)

features

In [None]:
# Model 28 - feature-selection variant (no class weights): fit on train_28 / test_28.
model_m1_28 = fit_model(
    train_28,
    test_28,
    # ignored_features is deliberately not passed in this variant
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
# Store the feature-selection model 28 in "cbm" format.
# Disabled earlier target: 0_Ergebnisse/Catboost/models/M1_28_220628_model_1.0
model_m1_28.save_model(
    '0_Ergebnisse/Catboost/models/M1_28_fs4_220916_model_1.0',
    format="cbm",
)

with all features and class weights

In [None]:
# Model 28 - "all features and class weights" variant.
# NOTE(review): the only train_28 / test_28 pools visible in this notebook section are the
# feature-selected ones built above - confirm they were rebuilt from the full data for this run.
model_m1_28 = fit_model(
    train_28,
    test_28,
    ignored_features=emb_columns,
    class_weights=get_weights(Y_train_28),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)

In [None]:
# Store the "all features + class weights" model 28 in "cbm" format
model_m1_28.save_model(
    '0_Ergebnisse/Catboost/models/M1_28_cw_220916_model_1',
    format="cbm",
)

with feature selection and class weights

In [None]:
# Model 28 - feature selection combined with class weights: fit on train_28 / test_28.
model_m1_28 = fit_model(
    train_28,
    test_28,
    # ignored_features is deliberately not passed in this variant
    class_weights=get_weights(Y_train_28),
    iterations=1700,
    # For MultiClass the learning rate depends on the number of iterations; it is overridden explicitly here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
# Store the "feature selection + class weights" model 28 in "cbm" format
model_m1_28.save_model(
    '0_Ergebnisse/Catboost/models/M1_28_fs_cw_220914_model_1',
    format="cbm",
)

#### Model 29

In [None]:
#No model is required for group 29, since it contains only the single value 299.

### LCPN: Prediction pipeline

#### Functions

In [None]:
def predict_lcpn(X_data, cat_columns, model_two, model_20, model_21, model_22, model_23, model_24, model_25, model_26, model_27, model_28):
  """Run the two-stage LCPN prediction and return the rows with their final labels.

  Stage 1 predicts the two-digit accident type for every row with ``model_two``.
  Stage 2 routes each row to the model of its predicted two-digit group (20-28)
  to obtain the final "AccidentType". Rows predicted as group 29 get the fixed
  label 46 (group 29 has no model of its own).

  Args:
    X_data: DataFrame holding at least the "Description" and "Embedding" columns
      plus whichever of ``cat_columns`` are present.
    cat_columns: Names of categorical columns; only those present in the frame
      are passed to the pool as categorical features.
    model_two: Fitted model predicting the two-digit accident type.
    model_20 ... model_28: Fitted models for the respective two-digit groups.

  Returns:
    DataFrame with the columns of ``X_data`` plus "ID", "AccidentTypeTwo" and
    "AccidentType", sorted ascending by "ID". Rows whose predicted two-digit
    type is not one of 20-29 are not part of the result.

  NOTE(review): the "ID" column is written into the caller's ``X_data`` (input is
  mutated). "ID" - and for stage 2 also "AccidentTypeTwo" - are part of the frames
  handed to the pools; confirm the models expect these columns.
  """

  #Assign ID (in place on the caller's frame, see note in the docstring)
  X_data.loc[:,"ID"] = X_data.index

  def _make_pool(frame):
    #Pool data for catboost algorithm
    return Pool(frame,
                cat_features = [frame.columns.get_loc(c) for c in cat_columns if c in frame],
                text_features = ["Description"],
                embedding_features = ["Embedding"] #comment out for feature selection
                )

  #Stage 1: predict two-digit accident type and attach it to a copy of the data
  X_new = X_data.copy()
  X_new.loc[:, "AccidentTypeTwo"] = model_two.predict(_make_pool(X_data))

  #Stage 2: one model per two-digit group
  group_models = {
      20: model_20, 21: model_21, 22: model_22,
      23: model_23, 24: model_24, 25: model_25,
      26: model_26, 27: model_27, 28: model_28,
  }

  parts = []
  for group, model in group_models.items():
    subset = X_new[X_new['AccidentTypeTwo'] == group]
    part = subset.copy()
    #Pooling / predicting is only feasible when there is at least one row
    if len(subset) != 0:
      part.loc[:, "AccidentType"] = model.predict(_make_pool(subset))
    parts.append(part)

  #Group 29 needs no model: every row receives the fixed label 46
  part_29 = X_new[X_new['AccidentTypeTwo'] == 29].copy()
  if len(part_29) != 0:
    part_29.loc[:, "AccidentType"] = 46
  parts.append(part_29)

  #Bind data frames together and restore the original row order via ID
  X_total = pd.concat(parts)
  X_total = X_total.sort_values(by = ['ID'], axis = 0, ascending = True)

  #Return
  return X_total






#### Load models

In [None]:
# Restore the saved two-digit model from its "cbm" file
model_m1_two = CatBoostClassifier()
model_m1_two.load_model(
    '0_Ergebnisse/Catboost/models/M1_two_220628_model_0.8835418672',
    format='cbm',
)

In [None]:
# Restore the saved model for group 20 from its "cbm" file
model_m1_20 = CatBoostClassifier()
model_m1_20.load_model(
    '0_Ergebnisse/Catboost/models/M1_20_220628_model_0.96875',
    format='cbm',
)

In [None]:
# Restore the saved model for group 21 from its "cbm" file
model_m1_21 = CatBoostClassifier()
model_m1_21.load_model(
    '0_Ergebnisse/Catboost/models/M1_21_220628_model_0.9878934625',
    format='cbm',
)

In [None]:
# Restore the saved model for group 22 from its "cbm" file
model_m1_22 = CatBoostClassifier()
model_m1_22.load_model(
    '0_Ergebnisse/Catboost/models/M1_22_220628_model_0.7692307692',
    format='cbm',
)

In [None]:
# Restore the saved model for group 23 from its "cbm" file
model_m1_23 = CatBoostClassifier()
model_m1_23.load_model(
    '0_Ergebnisse/Catboost/models/M1_23_220628_model_0.9741935484',
    format='cbm',
)

In [None]:
# Restore the saved model for group 24 from its "cbm" file
model_m1_24 = CatBoostClassifier()
model_m1_24.load_model(
    '0_Ergebnisse/Catboost/models/M1_24_220628_model_0.6891891892',
    format='cbm',
)

In [None]:
# Restore the saved model for group 25 from its "cbm" file
model_m1_25 = CatBoostClassifier()
model_m1_25.load_model(
    '0_Ergebnisse/Catboost/models/M1_25_220628_model_1.0',
    format='cbm',
)

In [None]:
# Restore the saved model for group 26 from its "cbm" file
model_m1_26 = CatBoostClassifier()
model_m1_26.load_model(
    '0_Ergebnisse/Catboost/models/M1_26_220628_model_0.9473684211',
    format='cbm',
)

In [None]:
# Restore the saved model for group 27 from its "cbm" file
model_m1_27 = CatBoostClassifier()
model_m1_27.load_model(
    '0_Ergebnisse/Catboost/models/M1_27_220628_model_1.0',
    format='cbm',
)

In [None]:
# Restore the saved model for group 28 from its "cbm" file
model_m1_28 = CatBoostClassifier()
model_m1_28.load_model(
    '0_Ergebnisse/Catboost/models/M1_28_220628_model_1.0',
    format='cbm',
)

### LCPN: Prediction

In [None]:
#Predictions: Test-Data

# Predict with the LCPN pipeline (variant without feature selection).
# NOTE(review): model_m1_two_cw is not loaded in the "Load models" section
# (which loads model_m1_two) - it must be defined elsewhere in the notebook.
m1_test = predict_lcpn(
    X_test_two_all_pre, cat_columns, model_m1_two_cw,
    model_m1_20, model_m1_21, model_m1_22, model_m1_23, model_m1_24,
    model_m1_25, model_m1_26, model_m1_27, model_m1_28,
)
# Variant with feature selection (disabled):
#m1_test = predict_lcpn(Xtest_two_fs, cat_columns, model_m1_two, model_m1_20, model_m1_21, model_m1_22, model_m1_23, model_m1_24, model_m1_25, model_m1_26, model_m1_27, model_m1_28)

# Report: compare true 3-digit labels with the predicted AccidentType
m1_test_report = get_report(Y_test_three, m1_test.AccidentType)

# Save the report. The file name previously ended with a stray trailing blank,
# which produced a file whose name ends in a space; the blank is removed here.
m1_test_report.to_pickle('0_Ergebnisse/Catboost/results/M1_cw_220917_test_report_0.814369')



In [None]:
#Predictions: Valid-Data

# Predict with the LCPN pipeline (variant without feature selection).
m1_valid = predict_lcpn(
    X_valid_two_all_pre, cat_columns, model_m1_two_cw,
    model_m1_20, model_m1_21, model_m1_22, model_m1_23, model_m1_24,
    model_m1_25, model_m1_26, model_m1_27, model_m1_28,
)
# Variant with feature selection (disabled):
#m1_valid = predict_lcpn(Xvalid_two_fs, cat_columns, model_m1_two, model_m1_20, model_m1_21, model_m1_22, model_m1_23, model_m1_24, model_m1_25, model_m1_26, model_m1_27, model_m1_28)

# Report: compare true 3-digit labels with the predicted AccidentType
# Disabled alternative that also passes classes_list and returns the labels:
#labels, m1_valid_report = get_report(Y_valid_three, m1_valid.AccidentType, classes_list)
m1_valid_report = get_report(Y_valid_three, m1_valid.AccidentType)


In [None]:
# Store the validation report as a pickle file
m1_valid_report.to_pickle(
    '0_Ergebnisse/Catboost/results/M1_cw_220917_valid_report_0.746629'
)

# Disabled alternative (feature-selection variant): dump the report with pickle.dump to
# '0_Ergebnisse/Catboost/results/M1_fs_220914_valid_report_0.781415.pkl'


In [None]:
# Attach the true 3-digit labels to copies of the prediction frames and store them.

# Test data
m1_test_results = m1_test.copy()
m1_test_results.loc[:, "Y_three"] = Y_test_three
m1_test_results.to_pickle(
    '0_Ergebnisse/Catboost/results/M1_cw_220917_test_results'
)

# Validation data
m1_valid_results = m1_valid.copy()
m1_valid_results.loc[:, "Y_three"] = Y_valid_three
m1_valid_results.to_pickle(
    '0_Ergebnisse/Catboost/results/M1_cw_220917_valid_results'
)



Oversampling: https://towardsdatascience.com/how-to-deal-with-imbalanced-multiclass-datasets-in-python-fe0bb3f2b669

# CatBoost: Experiments with frequent 3ATs

Reduction of the accident types to be predicted to those accident types for which sufficient samples are available and which can therefore be predicted with a high degree of probability.

## Import data

In [None]:
# Load the preprocessed data sets for the 3-digit accident type (3AT).
# Each pickle holds either a feature frame (X_*) or the matching labels (Y_*).

# --- Training data ---
abspath = os.path.abspath('0_Ergebnisse/Catboost/220624_X_train_three_all_pre.pkl')
with open(abspath, 'rb') as in_file:
  X_train_three_all_pre = pickle.load(in_file)

abspath = os.path.abspath('0_Ergebnisse/Catboost/220624_Y_train_three.pkl')
with open(abspath, 'rb') as in_file:
  Y_train_three = pickle.load(in_file)

# --- Test data ---
abspath = os.path.abspath('0_Ergebnisse/Catboost/220624_X_test_three_all_pre.pkl')
with open(abspath, 'rb') as in_file:
  X_test_three_all_pre = pickle.load(in_file)

abspath = os.path.abspath('0_Ergebnisse/Catboost/220624_Y_test_three.pkl')
with open(abspath, 'rb') as in_file:
  Y_test_three = pickle.load(in_file)

# --- Validation data ---
abspath = os.path.abspath('0_Ergebnisse/Catboost/220624_X_valid_three_all_pre.pkl')
with open(abspath, 'rb') as in_file:
  X_valid_three_all_pre = pickle.load(in_file)

abspath = os.path.abspath('0_Ergebnisse/Catboost/220624_Y_valid_three.pkl')
with open(abspath, 'rb') as in_file:
  Y_valid_three = pickle.load(in_file)

In [None]:
# Column specifications: column names grouped by the way they are fed to CatBoost

# Categorical columns
cat_columns = [
    "Month", "Weekday", "StreetClass", "RoadCondition", "LightCondition",
    "Weather", "Obstacle", "Urban", "Rural", "CyclePath", "Sidewalk",
    "TrafficLightOn", "TrafficLightOff", "Alcohol", "Drugs", "Medicines",
    "SpeedLimit", "HitAndRun", "CollisionType", "Cause",
    "Participant1", "InjuryP1", "CauseP1",
    "Participant2", "CauseP2", "InjuryP2",
]
# Numerical columns
num_columns = ["AgeP1", "AgeP2", "PropertyDamage", "TextLength"]
# Free-text column
text_columns = ["Description"]
# Embedding column
emb_columns = ["Embedding"]

## Aggregate Classes

Align Y-values

In [None]:
#Label rare accidents as rare -> two groups
def label_rare(Y_three):
  """Collapse encoded 3-digit accident-type labels into frequent (1) / rare (0).

  The input is copied first, so the caller's object is left untouched.
  Encoded labels 0..46 are rewritten; any other value is returned unchanged.

  Args:
    Y_three: pandas object holding the encoded 3-digit accident types.

  Returns:
    A copy of ``Y_three`` with 1 for frequent and 0 for rare accident types.
  """
  # Encoded labels treated as frequent. Per the original annotations they stand for
  # 201, 202, 204, 211, 212, 221, 222, 223, 224, 231, 232,
  # 241, 242, 243, 244, 245, 261, 271, 281 and 299.
  # Every other label in 0..46 counts as rare.
  frequent_codes = (0, 1, 3, 5, 6, 11, 12, 13, 14, 17, 18,
                    21, 22, 23, 24, 25, 30, 33, 39, 46)

  flags = Y_three.copy()

  # Ascending order matters: labels 0 and 1 are rewritten first, so the flags
  # written afterwards (for sources 2..46) are never picked up again as a source.
  for code in range(47):
    flags.replace(code, 1 if code in frequent_codes else 0, inplace = True)

  #Return
  return flags


Frequent / Rare

In [None]:
#Derive the binary frequent (1) / rare (0) target from the 3-digit accident type labels
#Order of the splits: train, test, valid
Y_train_three_freq, Y_test_three_freq, Y_valid_three_freq = (
    label_rare(labels) for labels in (Y_train_three, Y_test_three, Y_valid_three)
)

#Number of distinct target values in the train split
print(np.unique(Y_train_three_freq).size)
#The following classes should be predicted
print(np.unique(Y_valid_three_freq))

Data for subsequent model (after classification in rare / frequent)

In [None]:
#Rare accident types: encoded class ids that are treated as "rare"
rarelist = [2, 4, 7, 8, 9, 10, 15, 16, 19, 20, 26, 27, 28, 29, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45]

#Frequent accident types: encoded class ids that are kept as individual classes for the submodel
freqlist = [0, 1, 3, 5, 6, 11, 12, 13, 14, 17, 18, 21, 22, 23, 24, 25, 30, 33, 39, 46]


#Prepare dataset
def prepare_frequent_data(X, Y, freqlist):
  """Keep only the rows whose accident type is listed in freqlist.

  X and Y are joined column-wise (aligned on their index), filtered on the
  "AccidentType" column and split again.

  Returns:
    Tuple (features without "AccidentType", "AccidentType" labels) of the
    remaining rows.
  """
  #Features and labels side by side
  combined = pd.concat([X, Y], axis = 1)

  #Rows that belong to a frequent accident type
  is_frequent = combined["AccidentType"].isin(freqlist)
  frequent_rows = combined[is_frequent]

  #Separate features and labels again
  features = frequent_rows.drop(columns = ["AccidentType"])
  labels = frequent_rows["AccidentType"]
  return features, labels

#Restrict every split to the frequent accident types
X_train_three_onlyfreq, Y_train_three_onlyfreq = prepare_frequent_data(
    X = X_train_three_all_pre,
    Y = Y_train_three,
    freqlist = freqlist,
)

X_test_three_onlyfreq, Y_test_three_onlyfreq = prepare_frequent_data(
    X = X_test_three_all_pre,
    Y = Y_test_three,
    freqlist = freqlist,
)

X_valid_three_onlyfreq, Y_valid_three_onlyfreq = prepare_frequent_data(
    X = X_valid_three_all_pre,
    Y = Y_valid_three,
    freqlist = freqlist,
)



In [None]:
#Display the train features that were restricted to frequent accident types
X_train_three_onlyfreq

## Pooling

Frequent / Rare

In [None]:
#CatBoost pools for the frequent / rare classifier
#Features: full preprocessed frames; labels: frequent / rare flags
train_three_data_freq = Pool(
    data = X_train_three_all_pre,
    label = Y_train_three_freq,
    cat_features = [
        X_train_three_all_pre.columns.get_loc(col)
        for col in cat_columns
        if col in X_train_three_all_pre
    ],
    text_features = ["Description"],
    embedding_features = ["Embedding"],
)

test_three_data_freq = Pool(
    data = X_test_three_all_pre,
    label = Y_test_three_freq,
    cat_features = [
        X_test_three_all_pre.columns.get_loc(col)
        for col in cat_columns
        if col in X_test_three_all_pre
    ],
    text_features = ["Description"],
    embedding_features = ["Embedding"],
)

valid_three_data_freq = Pool(
    data = X_valid_three_all_pre,
    label = Y_valid_three_freq,
    cat_features = [
        X_valid_three_all_pre.columns.get_loc(col)
        for col in cat_columns
        if col in X_valid_three_all_pre
    ],
    text_features = ["Description"],
    embedding_features = ["Embedding"],
)

In [None]:
#Pool the data
#Pool data: 3-digit-accident type, restricted to the frequent accident types (submodel input)
#The positions of the categorical columns are looked up on the very frame that is pooled,
#so the indices stay valid even if the filtered frame and the full frame differ in layout
train_three_data_onlyfreq = Pool(X_train_three_onlyfreq,
                  label = Y_train_three_onlyfreq,
                  cat_features = [X_train_three_onlyfreq.columns.get_loc(c) for c in cat_columns if c in X_train_three_onlyfreq],
                  text_features= ["Description"],
                  embedding_features = ["Embedding"])


test_three_data_onlyfreq = Pool(X_test_three_onlyfreq,
                  label = Y_test_three_onlyfreq,
                  cat_features = [X_test_three_onlyfreq.columns.get_loc(c) for c in cat_columns if c in X_test_three_onlyfreq],
                  text_features= ["Description"],
                  embedding_features = ["Embedding"])


valid_three_data_onlyfreq = Pool(X_valid_three_onlyfreq,
                  label = Y_valid_three_onlyfreq,
                  cat_features = [X_valid_three_onlyfreq.columns.get_loc(c) for c in cat_columns if c in X_valid_three_onlyfreq],
                  text_features= ["Description"],
                  embedding_features = ["Embedding"])

## Model D1

Model Freq / Rare

without feature selection

In [None]:
#Fit the frequent / rare classifier on the full pools
#The embedding columns are handed over as ignored features
model_d1_freq = fit_model(
    train_three_data_freq,
    test_three_data_freq,
    ignored_features=emb_columns,
    iterations=1700,
    #Learning rate depends on the number of iterations for Multiclass and can be overwritten here.
    learning_rate=0.1,
    task_type='GPU',
)



In [None]:
#Save catboost model of the frequent / rare classifier in the "cbm" format
#NOTE(review): the numeric suffix of the file name presumably records a score of this run - confirm
model_d1_freq.save_model('0_Ergebnisse/Catboost/models/D1_freq_220918_model_0.9826756497', format = "cbm")

with feature selection

In [None]:
#Select the most suitable features

from catboost import CatBoostClassifier, EShapCalcType, EFeaturesSelectionAlgorithm


def select_features_freq(featurenumber, Xtrain, Ytrain, Xtest, Ytest, algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    """Run CatBoost feature selection and return its summary.

    The "Embedding", "Description" and "ID" columns are removed first (when
    present); the candidates handed to CatBoost are cat_columns + num_columns.

    Args:
        featurenumber: Number of features to select.
        Xtrain: Training features.
        Ytrain: Training labels.
        Xtest: Evaluation features.
        Ytest: Evaluation labels.
        algorithm: CatBoost feature selection algorithm.
        steps: Number of steps passed on to ``select_features``.

    Returns:
        The summary returned by ``select_features``; its
        'selected_features_names' entry is printed as well.
    """
    print('Algorithm:', algorithm)

    #Columns that must not take part in the selection
    excluded = ["Embedding", "Description", "ID"]
    Xtrain = Xtrain.drop(columns=excluded, errors='ignore')
    Xtest = Xtest.drop(columns=excluded, errors='ignore')

    #Training and evaluation pools
    train_pool = Pool(
        Xtrain,
        Ytrain,
        cat_features=[Xtrain.columns.get_loc(col) for col in cat_columns if col in Xtrain],
    )
    eval_pool = Pool(
        Xtest,
        Ytest,
        cat_features=[Xtest.columns.get_loc(col) for col in cat_columns if col in Xtest],
    )

    #Selection run; no final model is trained
    selector = CatBoostClassifier(iterations=2000, random_seed=0, task_type="GPU")
    summary = selector.select_features(
        train_pool,
        eval_set=eval_pool,
        features_for_select=cat_columns + num_columns,
        num_features_to_select=featurenumber,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Approximate,
        train_final_model=False,
        logging_level='Silent',
        plot=True,
    )

    print('Selected features:', summary['selected_features_names'])
    return summary


In [None]:
#Perform feature selection for the frequent / rare classifier (4 features, 10 steps)
#Most important features:
#Drugs
#Medicines
#CollisionType
#CauseP1
feat_d1_freq = select_features_freq(
    4,
    X_train_three_all_pre,
    Y_train_three_freq,
    X_test_three_all_pre,
    Y_test_three_freq,
    algorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps = 10,
)

In [None]:
#Text column plus the four selected features for the frequent / rare classifier
features_d1_freq = ["Description", 'Drugs', 'Medicines', 'CollisionType', 'CauseP1']

#Subsets of the full train / test frames
X_train_d1_freq = X_train_three_all_pre[features_d1_freq]
X_test_d1_freq = X_test_three_all_pre[features_d1_freq]

#Pools on the reduced frames (no embedding feature is registered here)
train_three_data_freq_fs = Pool(
    data = X_train_d1_freq,
    label = Y_train_three_freq,
    cat_features = [
        X_train_d1_freq.columns.get_loc(col)
        for col in cat_columns
        if col in X_train_d1_freq
    ],
    text_features = ["Description"],
)

test_three_data_freq_fs = Pool(
    data = X_test_d1_freq,
    label = Y_test_three_freq,
    cat_features = [
        X_test_d1_freq.columns.get_loc(col)
        for col in cat_columns
        if col in X_test_d1_freq
    ],
    text_features = ["Description"],
)

In [None]:
#Save feature results
#Pickles the feature selection summary of the frequent / rare classifier

abspath = os.path.abspath('0_Ergebnisse/Catboost/results/D1_freq_220918_feature_selection')
with open(str(abspath), 'wb') as handle:
  pickle.dump(feat_d1_freq, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Open feature results
#Reloads the pickled feature selection summary of the frequent / rare classifier and displays it

abspath = os.path.abspath('0_Ergebnisse/Catboost/results/D1_freq_220918_feature_selection')
with open(str(abspath), 'rb') as pkl:
  features = pickle.load(pkl)

features

In [None]:
#Fit the frequent / rare classifier on the pools that hold only the selected features
model_d1_freq_fs = fit_model(
    train_three_data_freq_fs,
    test_three_data_freq_fs,
    #ignored_features = emb_columns ,
    iterations=1700,
    #Learning rate depends on the number of iterations for Multiclass and can be overwritten here.
    learning_rate=0.1,
    task_type='GPU',
)


In [None]:
#Save catboost model (frequent / rare classifier with feature selection) in the "cbm" format
#NOTE(review): the numeric suffix is identical to the one used for the model without feature selection - confirm it belongs to this run
model_d1_freq_fs.save_model('0_Ergebnisse/Catboost/models/D1_freq_fs_220918_model_0.9826756497', format = "cbm")

## Submodel D1

Submodel for frequent accident types

without feature selection

In [None]:
#Initialize submodel WITHOUT feature selection (trained on the full frequent-only pools)

model_d1_sub = fit_model(
    ################
    train_three_data_onlyfreq, test_three_data_onlyfreq,
    #ignored_features = emb_columns ,
    ###############
    iterations=1700,
    learning_rate = 0.1, #Learning rate depends on the number of iterations for Multiclass and can be overwritten here.
    ###############
    task_type='GPU',
    )


In [None]:
#Save catboost model of the submodel for frequent accident types in the "cbm" format
model_d1_sub.save_model('0_Ergebnisse/Catboost/models/D1_sub_220918_model_0.8598039216', format = "cbm")

with feature selection

In [None]:
#Perform feature selection for the submodel of frequent accident types (4 features, 10 steps)
#Most important features:
#CyclePath
#CollisionType
#CauseP1
#Participant2
feat_d1_sub_freq = select_features_freq(
    4,
    X_train_three_onlyfreq,
    Y_train_three_onlyfreq,
    X_test_three_onlyfreq,
    Y_test_three_onlyfreq,
    algorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange,
    steps = 10,
)

In [None]:
#Text column plus the four selected features for the submodel of frequent accident types
features_d1_sub_freq = ["Description", 'CyclePath', 'CollisionType', 'CauseP1', 'Participant2' ]

#Subsets of the frequent-only train / test frames
X_train_d1_sub_freq = X_train_three_onlyfreq[features_d1_sub_freq]
X_test_d1_sub_freq = X_test_three_onlyfreq[features_d1_sub_freq]

#Pools on the reduced frames (no embedding feature is registered here)
train_three_data_onlyfreq_fs = Pool(
    data = X_train_d1_sub_freq,
    label = Y_train_three_onlyfreq,
    cat_features = [
        X_train_d1_sub_freq.columns.get_loc(col)
        for col in cat_columns
        if col in X_train_d1_sub_freq
    ],
    text_features = ["Description"],
)

test_three_data_onlyfreq_fs = Pool(
    data = X_test_d1_sub_freq,
    label = Y_test_three_onlyfreq,
    cat_features = [
        X_test_d1_sub_freq.columns.get_loc(col)
        for col in cat_columns
        if col in X_test_d1_sub_freq
    ],
    text_features = ["Description"],
)

In [None]:
#Save feature results
#Pickles the feature selection summary of the submodel for frequent accident types

abspath = os.path.abspath('0_Ergebnisse/Catboost/results/D1_sub_freq_220918_feature_selection')
with open(str(abspath), 'wb') as handle:
  pickle.dump(feat_d1_sub_freq, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Open feature results
#Reloads the pickled feature selection summary of the submodel and displays it

abspath = os.path.abspath('0_Ergebnisse/Catboost/results/D1_sub_freq_220918_feature_selection')
with open(str(abspath), 'rb') as pkl:
  features = pickle.load(pkl)

features

In [None]:
#Fit the submodel for frequent accident types WITH FEATURE SELECTION
#(pools hold only the text column and the selected features)
model_d1_sub_fs = fit_model(
    train_three_data_onlyfreq_fs,
    test_three_data_onlyfreq_fs,
    #ignored_features = emb_columns ,
    iterations=1700,
    #Learning rate depends on the number of iterations for Multiclass and can be overwritten here.
    learning_rate=0.1,
    task_type='GPU',
)




In [None]:
#Save catboost model of the feature-selected submodel in the "cbm" format
model_d1_sub_fs.save_model('0_Ergebnisse/Catboost/models/D1_sub_fs_220918_model_0.8558823529', format = "cbm")

In [None]:
#Predictions: Test-Data
#Uses model_d1_sub (the submodel trained on the full frequent-only pools), not the feature-selected one

#Predict
d1_sub_test_pred_class = model_d1_sub.predict(test_three_data_onlyfreq)
#Report
d1_sub_test_report = get_report(Y_test_three_onlyfreq, d1_sub_test_pred_class)


In [None]:
#Predictions: Valid-Data
#Uses model_d1_sub (the submodel trained on the full frequent-only pools), not the feature-selected one

#Predict
d1_sub_valid_pred_class = model_d1_sub.predict(valid_three_data_onlyfreq)
#Report
d1_sub_valid_report = get_report(Y_valid_three_onlyfreq, d1_sub_valid_pred_class)

## Pipeline D1 Prediction

In [None]:
def predict_frequent(X_data, cat_columns, model_d1_freq, model_d1_sub):
  """Two-stage prediction: frequent / rare first, then the accident type of frequent rows.

  Args:
    X_data: Feature frame. It must contain the "Description" and "Embedding"
      columns, which are registered as text / embedding features.
    cat_columns: Names of the categorical columns; names that are missing
      from the frame are skipped.
    model_d1_freq: Fitted classifier whose ``predict`` marks each row as
      frequent (1) or rare (0).
    model_d1_sub: Fitted classifier whose ``predict`` returns the accident
      type for the rows marked as frequent.

  Returns:
    A new frame with all rows of ``X_data`` plus the columns "ID" (original
    index), "Frequency" and "AccidentType", sorted by "ID" in ascending
    order. Rows marked as rare get the collective class 47.
  """
  #Work on a copy: the caller's frame must not silently gain an "ID" column
  X_data = X_data.copy()

  #Assign ID
  X_data.loc[:, "ID"] = X_data.index

  #Pool data for catboost algorithm
  X_data_pool = Pool(X_data,
                  cat_features = [X_data.columns.get_loc(c) for c in cat_columns if c in X_data],
                  text_features= ["Description"],
                  embedding_features = ["Embedding"] #comment out for feature selection
                  )

  #Predict rare / frequent accident types
  #1 = frequent
  #0 = rare
  class_frequent = model_d1_freq.predict(X_data_pool)

  #Combine prediction with original data (separate copy, the pooled frame stays untouched)
  X_new = X_data.copy()
  X_new.loc[:, "Frequency"] = class_frequent

  #Split by predicted frequency -> further predictions are only made for the frequent accident types
  #Copies, so that the column assignments below do not write into views of X_new
  X_freq_new = X_new[X_new["Frequency"] == 1].copy()
  X_rare_new = X_new[X_new["Frequency"] == 0].copy()

  #Pool the frequent rows for 3-digit-type prediction
  X_freq_pool = Pool(X_freq_new,
                    cat_features = [X_freq_new.columns.get_loc(c) for c in cat_columns if c in X_freq_new],
                    text_features= ["Description"],
                    embedding_features = ["Embedding"] #comment out for feature selection
                     )

  #Make 3-digit-type prediction and assign it
  class_at = model_d1_sub.predict(X_freq_pool)
  X_freq_new.loc[:, "AccidentType"] = class_at

  #All rare rows share the collective class 47
  X_rare_new.loc[:, "AccidentType"] = 47

  #Bind data frames together
  X_total = pd.concat([X_freq_new, X_rare_new])

  #Sort by index in ascending order
  X_total = X_total.sort_values(by = ["ID"], axis = 0, ascending = True)

  #Return
  return X_total




In [None]:
#Corrected list with 48 3-digit accident types (3ATs), including the unknown class (999, last entry)
#The position in the list is used as the encoded class id when reports are relabelled
classes_list_freq = [201, 202, 203, 204, 209,
                211, 212, 213, 214, 215, 219,
                221, 222, 223, 224, 225, 229,
                231, 232, 233, 239,
                241, 242, 243, 244, 245, 249,
                251, 252, 259,
                261, 262, 269,
                271, 272, 273, 274, 275, 279,
                281, 282, 283, 284, 285, 286, 289,
                299, 999]


#Reduced label list, kept for reference only (commented out, so the name is not defined at runtime)
#classes_list_freq_red = [201, 202, 204, 211, 212, 221, 222, 223, 224, 231, 232, 241, 242, 243, 244, 261, 271, 281, 299, 999 ]

In [None]:
#Function to get classification report with correct accident type labels

def get_report_freq(true, pred, labellist=None):
  """Build the classification report for the frequent / rare pipeline.

  Every accident type that label_rare marks as rare (0) is merged into the
  collective class 47 before true and predicted labels are compared.

  Args:
    true: True encoded accident type labels.
    pred: Frame with an "AccidentType" column that holds the predictions.
    labellist: Optional sequence that maps an encoded class id (its position)
      to the readable accident type label.

  Returns:
    Without labellist: the report as a DataFrame.
    With labellist: tuple of (true labels with rare types merged, readable
    labels of the reported classes, report DataFrame with readable labels).
  """
  #Copy
  Y_n = true.copy()

  #Prepare input data for the correct classification
  Y_rare = label_rare(Y_n)

  #Combine
  X_n = pd.DataFrame(Y_n.copy())
  X_n.loc[:, "Frequency"] = Y_rare
  X_n.loc[:, "AccidentType"] = Y_n

  #Rename rare accident types
  X_n.loc[X_n["Frequency"] == 0, "AccidentType"] = 47

  #Extract true label
  truelabel = X_n.AccidentType

  #Extract prediction
  #NOTE(review): the report compares both label vectors by position - confirm pred is in the same row order as true
  predlabel = pred.AccidentType

  #Classes that are reported: exactly those that occur in the true labels
  report_labels = np.unique(truelabel)

  #Get initial report
  report = classification_report(truelabel, predlabel, output_dict=True, labels = report_labels)
  #Transform it to pandas
  report_pd = pd.DataFrame(report).transpose()

  if labellist is None:
    print(report_pd)
    return report_pd

  #Class rows of the report = everything except the last three summary rows
  index_list = list(report_pd.index)
  index_list_short = [int(x) for x in index_list[:-3]]
  #Get true class labels based on index
  label_series = pd.Series(labellist)
  true_labels = list(label_series[index_list_short])
  #Get new classification report with correct labels
  #labels is passed again so that the class set matches true_labels even if a class
  #is predicted that never occurs in the true labels
  report_new = classification_report(truelabel, predlabel, output_dict=True, labels = report_labels, target_names = true_labels)
  #Transform to pandas again
  report_new_pd = pd.DataFrame(report_new).transpose()
  #Print report
  print(report_new_pd)
  #Return corrected Y labels, true labels (accident types), report
  return truelabel, true_labels, report_new_pd

## D1 Prediction

### Without Feature Selection

In [None]:
#Test Prediction
#Predict
d1_test = predict_frequent(X_test_three_all_pre, cat_columns, model_d1_freq, model_d1_sub)

#Report
#classes_list_freq_red only exists as commented-out code, and its 20 entries could not be
#indexed by encoded class ids up to 47 anyway, so the full label list is used.
#With a label list get_report_freq returns (true labels, label names, report).
d1_test_true, d1_test_labels, d1_test_report = get_report_freq(true = Y_test_three, pred = d1_test, labellist = classes_list_freq)

In [None]:
#Valid Prediction
#Predict
d1_valid = predict_frequent(X_valid_three_all_pre, cat_columns, model_d1_freq, model_d1_sub)

#Report
#No label list is passed, so get_report_freq returns only the report with encoded class ids
d1_valid_report = get_report_freq(true = Y_valid_three, pred = d1_valid)

### With Feature Selection

In [None]:
#Test prediction with the feature-selected models
#NOTE(review): predict_frequent pools every column incl. the embedding feature, and its inline
#comments say the embedding feature has to be commented out for feature selection - confirm
d1_test_fs = predict_frequent(
    X_test_three_all_pre,
    cat_columns,
    model_d1_freq = model_d1_freq_fs,
    model_d1_sub = model_d1_sub_fs,
)

#Report with readable accident type labels
Y_test_three_freqresult, d1_test_fs_labels, d1_test_fs_report = get_report_freq(
    true = Y_test_three,
    pred = d1_test_fs,
    labellist = classes_list_freq,
)

#Save
d1_test_fs_report.to_pickle('0_Ergebnisse/Catboost/results/D1_fs_221005_test_report_0.825458')

Validation data

In [None]:
#Valid prediction with the feature-selected models
#NOTE(review): predict_frequent pools every column incl. the embedding feature, and its inline
#comments say the embedding feature has to be commented out for feature selection - confirm
d1_valid_fs = predict_frequent(
    X_valid_three_all_pre,
    cat_columns,
    model_d1_freq = model_d1_freq_fs,
    model_d1_sub = model_d1_sub_fs,
)

#Report with readable accident type labels
Y_valid_three_freqresult, d1_valid_fs_labels, d1_valid_fs_report = get_report_freq(
    true = Y_valid_three,
    pred = d1_valid_fs,
    labellist = classes_list_freq,
)

In [None]:
#Save report of the validation data (feature-selected D1 pipeline) as pickle
d1_valid_fs_report.to_pickle('0_Ergebnisse/Catboost/results/D1_fs_221005_valid_report_0.808461')

In [None]:
#Collect true labels and predictions of test and valid data side by side
#The frames are joined column-wise on their index
result_parts = [
    pd.DataFrame(Y_test_three_freqresult),
    pd.DataFrame(d1_test_fs.AccidentType),
    pd.DataFrame(Y_valid_three_freqresult),
    pd.DataFrame(d1_valid_fs.AccidentType),
]
d1_fs_results = pd.concat(result_parts, axis = 1)
d1_fs_results.columns = ["Y_test", "Pred_test", "Y_valid", "Pred_valid"]

print(d1_fs_results)

#Save prediction results
d1_fs_results.to_pickle('0_Ergebnisse/Catboost/results/D1_fs_221005_all_results')

# Evaluation




### Overview over experiments

Overview aligned with written paper:

Paper | Colab

B1.4 | V3a

C1.1 | M1.1

C1.2 | M1.2

C1.3.1 | M1.3.1

C1.3.2 | M1.3.2

C1.4 | M1.4

C2.1 | V3b

C2.2 | V3afs

C2.3 | V3bfscw

D1   | D1fs



### Load classification reports

In [None]:
#Load classification reports for valid and test data from previous experiments

def _load_report(relpath):
  """Unpickle the classification report stored under the given relative path."""
  with open(os.path.abspath(relpath), 'rb') as report_file:
    return pickle.load(report_file)


###########################Variant comparison##############################################
#V1
v1_valid_report = _load_report('0_Ergebnisse/Catboost/results/V1_220627_valid_report_0.635688')
v1_test_report = _load_report('0_Ergebnisse/Catboost/results/V1_220627_test_report_0.6891241578')

#V2
v2_valid_report = _load_report('0_Ergebnisse/Catboost/results/V2_220627_valid_report_0.790892')
v2_test_report = _load_report('0_Ergebnisse/Catboost/results/V2_220627_test_report_0.8046198268')

#V3a
v3a_valid_report = _load_report('0_Ergebnisse/Catboost/results/V3_220627_valid_report_0.808550')
v3a_test_report = _load_report('0_Ergebnisse/Catboost/results/V3_220627_test_report_0.8392685274')

#V3b = C2.1: All features / class weights / global
v3b_valid_report = _load_report('0_Ergebnisse/Catboost/results/V3b_220629_valid_report_0.668216')
v3b_test_report = _load_report('0_Ergebnisse/Catboost/results/V3b_220629_test_report_0.546679376')

#V4
v4_valid_report = _load_report('0_Ergebnisse/Catboost/results/V4_220627_valid_report_0.663569')
v4_test_report = _load_report('0_Ergebnisse/Catboost/results/V4_220627_test_report_0.7305101059')

#V5
v5_valid_report = _load_report('0_Ergebnisse/Catboost/results/V5_220627_valid_report_0.774164')
v5_test_report = _load_report('0_Ergebnisse/Catboost/results/V5_220627_test_report_0.8267564966')

#V6 (Bert)
v6_valid_report = _load_report('0_Ergebnisse/Bert/results/V6_220702_valid_report_0.003')
v6_test_report = _load_report('0_Ergebnisse/Bert/results/V6_220703_test_report_0.002')

#V7 (Bert)
v7_valid_report = _load_report('0_Ergebnisse/Bert/results/V7_220702_valid_report_0.80')
v7_test_report = _load_report('0_Ergebnisse/Bert/results/V7_220703_test_report_0.81')

#V8 (Bert)
v8_valid_report = _load_report('0_Ergebnisse/Bert/results/V8_220702_valid_report_0.79')
v8_test_report = _load_report('0_Ergebnisse/Bert/results/V8_220703_test_report_0.80')

#V9 (V6 Catboost only BERT embeddings)
v9_valid_report = _load_report('0_Ergebnisse/Catboost/results/V6_220909_valid_report_0.682156')
v9_test_report = _load_report('0_Ergebnisse/Catboost/results/V6_220909_test_report_0.7074109721')

#V10 (V7 Catboost text and BERT embeddings)
v10_valid_report = _load_report('0_Ergebnisse/Catboost/results/V7_220909_valid_report_0.803903')
v10_test_report = _load_report('0_Ergebnisse/Catboost/results/V7_220909_test_report_0.8113570741')

#V3afs (V3 with just selected features)
v3afs_valid_report = _load_report('0_Ergebnisse/Catboost/results/V3afs_220914_valid_report_0.799123')
v3afs_test_report = _load_report('0_Ergebnisse/Catboost/results/V3afs_220914_test_report_0.8363811357')

#V3bfs (V3b with just selected features (fs) and class weights (cw))
v3bfscw_valid_report = _load_report('0_Ergebnisse/Catboost/results/V8b_220914_valid_report_0.728589')
v3bfscw_test_report = _load_report('0_Ergebnisse/Catboost/results/V8b_220914_test_report_0.5362665832')

###########################Model design##############################################

#M1.1 (LCPN) with all features
m1_valid_report = _load_report('0_Ergebnisse/Catboost/results/M1_220629_valid_report_0.799257')
m1_test_report = _load_report('0_Ergebnisse/Catboost/results/M1_220629_test_report_0.834456')

#M1.2 (LCPN) with all features and class weights (cw)
m1cw_valid_report = _load_report('0_Ergebnisse/Catboost/results/M1_cw_220917_valid_report_0.746629')
#NOTE(review): this path ends with a space - confirm that the file on disk is really named like that
m1cw_test_report = _load_report('0_Ergebnisse/Catboost/results/M1_cw_220917_test_report_0.814369 ')

#M1.3.1 (LCPN) with selected features (fs)
m1fs_valid_report = _load_report('0_Ergebnisse/Catboost/results/M1_fs_220914_valid_report_0.781415')
m1fs_test_report = _load_report('0_Ergebnisse/Catboost/results/M1_fs_220914_test_report_0.823512')

#M1.3.2 (LCPN) with four selected features (fs) per submodel
m1fs4_valid_report = _load_report('0_Ergebnisse/Catboost/results/M1_fs4_220916_valid_report_0.761246')
#NOTE(review): this path ends with a space - confirm that the file on disk is really named like that
m1fs4_test_report = _load_report('0_Ergebnisse/Catboost/results/M1_fs4_220916_test_report_0.817818 ')

#M1.4 (LCPN) with selected features (fs) and class weights (cw)
m1fscw_valid_report = _load_report('0_Ergebnisse/Catboost/results/M1_fs_cw_220914_valid_report_0.707274')
m1fscw_test_report = _load_report('0_Ergebnisse/Catboost/results/M1_fs_cw_220914_test_report_0.715872')

###########################Aggregated prediction#########################################
##D1 (Flat) with rare/freq accidents and then freq-model with features (fs)
d1fs_valid_report = _load_report('0_Ergebnisse/Catboost/results/D1_fs_221005_valid_report_0.808461')
d1fs_test_report = _load_report('0_Ergebnisse/Catboost/results/D1_fs_221005_test_report_0.825458')

############################Optimization####################################################
#O1
o1_test_report = _load_report('0_Ergebnisse/Catboost/results/O1_220703_test_report_0.8277189605')
o1_valid_report = _load_report('0_Ergebnisse/Catboost/results/O1_220703_valid_report_0.822491')

In [None]:
#Display the validation report of V3bfscw (selected features and class weights)
v3bfscw_valid_report

In [None]:

#Export the validation reports as CSV for plotting in R studio with ggplot2

#B1.4   f1-score valid: 0.792 / 17 predicted classes (V3a)
v3a_valid_report.to_csv('0_Ergebnisse/Overall/B1.4_V3a_220627_valid_report.csv')

#C1.3.1 f1-score valid: 0.781 / 18 predicted classes (M1fs)
m1fs_valid_report.to_csv('0_Ergebnisse/Overall/C1.3.1_M1_fs_220914_valid_report.csv')

#C2.1   f1-score valid: 0.706 / 23 predicted classes (V3b)
v3b_valid_report.to_csv('0_Ergebnisse/Overall/C2.1_V3b_220629_valid_report.csv')

#C2.2   f1-score valid: 0.799 / 17 predicted classes (v3afs)
v3afs_valid_report.to_csv('0_Ergebnisse/Overall/C2.2_V3afs_220914_valid_report.csv')

#C2.3   f1-score valid: 0.729 / 21 predicted classes (V3bfs)
v3bfscw_valid_report.to_csv('0_Ergebnisse/Overall/C2.3_V3bfscw_220914_valid_report.csv')

#D1fs   f1-score valid: 0.808461 / 18 predicted classes
d1fs_valid_report.to_csv('0_Ergebnisse/Overall/D1_D1fs_221005_valid_report.csv')





### Load prediction results

In [None]:
#Load prediction results of the single classifiers

def _load_results(relpath):
  """Unpickle the prediction results stored under the given relative path."""
  with open(os.path.abspath(relpath), 'rb') as results_file:
    return pickle.load(results_file)


###########################Variant comparison##############################################

#V1
v1_results = _load_results('0_Ergebnisse/Catboost/results/V1_220627_all_results')

#V2
v2_results = _load_results('0_Ergebnisse/Catboost/results/V2_220627_all_results')

#V3a
v3a_results = _load_results('0_Ergebnisse/Catboost/results/V3_220627_all_results')

#V3b = C2.1: All features / class weights / global
v3b_results = _load_results('0_Ergebnisse/Catboost/results/V3b_220629_all_results')

#V4
v4_results = _load_results('0_Ergebnisse/Catboost/results/V4_220627_all_results')

#V5
v5_results = _load_results('0_Ergebnisse/Catboost/results/V5_220627_all_results')

#V6 (Bert)
v6_results = _load_results('0_Ergebnisse/Bert/results/V6_220702_valid_preds_0.003')
v6_results_test = _load_results('0_Ergebnisse/Bert/results/V6_220703_test_preds_0.002')

#V7 (Bert)
v7_results = _load_results('0_Ergebnisse/Bert/results/V7_220702_valid_preds_0.80')
v7_results_test = _load_results('0_Ergebnisse/Bert/results/V7_220703_test_preds_0.81')

#V8 (Bert)
v8_results = _load_results('0_Ergebnisse/Bert/results/V8_220702_valid_preds_0.79')
v8_results_test = _load_results('0_Ergebnisse/Bert/results/V8_220703_test_preds_0.80')

#V9 (V6 Catboost only BERT embeddings)
v9_results = _load_results('0_Ergebnisse/Catboost/results/V6_220909_all_results')

#V10 (V7 Catboost text and BERT embeddings)
v10_results = _load_results('0_Ergebnisse/Catboost/results/V7_220909_all_results')

#V3afs (V3a with just selected features (fs))
v3afs_results = _load_results('0_Ergebnisse/Catboost/results/V3afs_220914_all_results')

#V3bfs (V3b with just selected features (fs) and class weights (cw))
v3bfscw_results = _load_results('0_Ergebnisse/Catboost/results/V8b_220914_all_results')

###########################Model design##############################################

#M1 (LCPN) with all features and without class weights
m1_test = _load_results('0_Ergebnisse/Catboost/results/M1_220629_test_results')
m1_valid = _load_results('0_Ergebnisse/Catboost/results/M1_220629_valid_results')

#M1.2 (LCPN) with all features and class weights (cw)
m1cw_test = _load_results('0_Ergebnisse/Catboost/results/M1_cw_220917_test_results')
m1cw_valid = _load_results('0_Ergebnisse/Catboost/results/M1_cw_220917_valid_results')

#M1.3.1 (LCPN) with selected features (fs)
m1fs_test = _load_results('0_Ergebnisse/Catboost/results/M1_fs_220914_test_results')
m1fs_valid = _load_results('0_Ergebnisse/Catboost/results/M1_fs_220914_valid_results')

#M1.3.2 (LCPN) with four selected features (fs) per submodel
m1fs4_test = _load_results('0_Ergebnisse/Catboost/results/M1_fs4_220916_test_results')
m1fs4_valid = _load_results('0_Ergebnisse/Catboost/results/M1_fs4_220916_valid_results')

#M1.4 (LCPN) with selected features (fs) and class weights (cw)
m1fscw_test = _load_results('0_Ergebnisse/Catboost/results/M1_fs_cw_220914_test_results')
m1fscw_valid = _load_results('0_Ergebnisse/Catboost/results/M1_fs_cw_220914_valid_results')

###########################Frequent/rare prediction#########################################

##D1 (Flat) with rare/freq accidents and then freq-model with features (fs)
d1fs_results = _load_results('0_Ergebnisse/Catboost/results/D1_fs_221005_all_results')

###########################Optimization###################################################

#O1
o1_results = _load_results('0_Ergebnisse/Catboost/results/O1_220703_all_results')


### Kappa Score




In [None]:
#Compute Kappa Score (Cohen's kappa between true and predicted labels)

#Validation data
v1_kappa_valid = cohen_kappa_score(v1_results.Y_valid, v1_results.Pred_valid)
v2_kappa_valid = cohen_kappa_score(v2_results.Y_valid, v2_results.Pred_valid)
v3a_kappa_valid = cohen_kappa_score(v3a_results.Y_valid, v3a_results.Pred_valid)
v3b_kappa_valid = cohen_kappa_score(v3b_results.Y_valid, v3b_results.Pred_valid)
v4_kappa_valid = cohen_kappa_score(v4_results.Y_valid, v4_results.Pred_valid)
v5_kappa_valid = cohen_kappa_score(v5_results.Y_valid, v5_results.Pred_valid)
v6_kappa_valid = cohen_kappa_score(v6_results.Y_valid, v6_results.Pred_valid)
v7_kappa_valid = cohen_kappa_score(v7_results.Y_valid, v7_results.Pred_valid)
v8_kappa_valid = cohen_kappa_score(v8_results.Y_valid, v8_results.Pred_valid)

v9_kappa_valid = cohen_kappa_score(v9_results.Y_valid, v9_results.Pred_valid)
v10_kappa_valid = cohen_kappa_score(v10_results.Y_valid, v10_results.Pred_valid)

v3afs_kappa_valid = cohen_kappa_score(v3afs_results.Y_valid, v3afs_results.Pred_valid)
#The V3bfs results are loaded under the name v3bfscw_results (there is no v3bfs_results)
v3bfs_kappa_valid = cohen_kappa_score(v3bfscw_results.Y_valid, v3bfscw_results.Pred_valid)

#M1 results use the columns Y_three / AccidentType instead of Y_valid / Pred_valid
m1_kappa_valid = cohen_kappa_score(m1_valid.Y_three, m1_valid.AccidentType)


o1_kappa_valid = cohen_kappa_score(o1_results.Y_valid, o1_results.Pred_valid )
###

#Overview table: one row per experiment
kappa_valid = [["V1", v1_kappa_valid], ["V2", v2_kappa_valid], ["V3a", v3a_kappa_valid], ["V3b", v3b_kappa_valid],  ["V4", v4_kappa_valid], ["V5", v5_kappa_valid], ["V6", v6_kappa_valid], ["V7", v7_kappa_valid], ["V8", v8_kappa_valid], ["V9", v9_kappa_valid], ["V10", v10_kappa_valid],["V3afs", v3afs_kappa_valid],["V3bfs", v3bfs_kappa_valid], ["M1", m1_kappa_valid], ["O1", o1_kappa_valid]]
kappa_valid_results = pd.DataFrame(kappa_valid, columns = ["Experiment", "Valid: Cohens_kappa"])

print(kappa_valid_results)

v3a_results[v3a_results['Y_test'].notna()].Y_test

#Test data
v1_kappa_test = cohen_kappa_score(v1_results[v1_results['Y_test'].notna()].Y_test, v1_results[v1_results['Pred_test'].notna()].Pred_test)
v2_kappa_test = cohen_kappa_score(v2_results[v2_results['Y_test'].notna()].Y_test, v2_results[v2_results['Pred_test'].notna()].Pred_test)
v3a_kappa_test = cohen_kappa_score(v3a_results[v3a_results['Y_test'].notna()].Y_test, v3a_results[v3a_results['Pred_test'].notna()].Pred_test)
v3b_kappa_test = cohen_kappa_score(v3b_results[v3b_results['Y_test'].notna()].Y_test, v3b_results[v3b_results['Pred_test'].notna()].Pred_test)
v4_kappa_test = cohen_kappa_score(v4_results[v4_results['Y_test'].notna()].Y_test, v4_results[v4_results['Pred_test'].notna()].Pred_test)
v5_kappa_test = cohen_kappa_score(v5_results[v5_results['Y_test'].notna()].Y_test, v5_results[v5_results['Pred_test'].notna()].Pred_test)
v6_kappa_test = cohen_kappa_score(v6_results_test[v6_results_test['Y_test'].notna()].Y_test, v6_results_test[v6_results_test['Pred_test'].notna()].Pred_test)
v7_kappa_test = cohen_kappa_score(v7_results_test[v7_results_test['Y_test'].notna()].Y_test, v7_results_test[v7_results_test['Pred_test'].notna()].Pred_test)
v8_kappa_test = cohen_kappa_score(v8_results_test[v8_results_test['Y_test'].notna()].Y_test, v8_results_test[v8_results_test['Pred_test'].notna()].Pred_test)

v3afs_kappa_test = cohen_kappa_score(v3afs_results[v3afs_results['Y_test'].notna()].Y_test, v3afs_results[v3afs_results['Pred_test'].notna()].Pred_test)
v3bfs_kappa_test = cohen_kappa_score(v3bfs_results[v3bfs_results['Y_test'].notna()].Y_test, v3bfs_results[v3bfs_results['Pred_test'].notna()].Pred_test)

v9_kappa_test = cohen_kappa_score(v9_results[v9_results['Y_test'].notna()].Y_test, v9_results[v9_results['Pred_test'].notna()].Pred_test)
v10_kappa_test = cohen_kappa_score(v10_results[v10_results['Y_test'].notna()].Y_test, v10_results[v10_results['Pred_test'].notna()].Pred_test)


m1_kappa_test = cohen_kappa_score(m1_test.Y_three, m1_test.AccidentType)


o1_kappa_test = cohen_kappa_score(o1_results[o1_results['Y_test'].notna()].Y_test, o1_results[o1_results['Pred_test'].notna()].Pred_test )
###

kappa_test = [["V1", v1_kappa_test], ["V2", v2_kappa_test], ["V3a", v3a_kappa_test], ["V3b", v3b_kappa_test],  ["V4", v4_kappa_test], ["V5", v5_kappa_test], ["V6", v6_kappa_test], ["V7", v7_kappa_test], ["V8", v8_kappa_test], ["V9", v9_kappa_test], ["V10", v10_kappa_test], ["V3afs", v3afs_kappa_test],["V3bfs", v3bfs_kappa_test], ["M1", m1_kappa_test], ["O1", o1_kappa_test]]
kappa_test_results = pd.DataFrame(kappa_test, columns = ["Experiment", "Test: Cohens_kappa"])

print(kappa_test_results)



### F1 Score

In [None]:
#Collect weighted-average F1, accuracy, precision and recall of every experiment

#Classification reports, listed in the same experiment order as the kappa tables
_valid_reports = [v1_valid_report, v2_valid_report, v3a_valid_report, v3b_valid_report, v4_valid_report, v5_valid_report, v6_valid_report, v7_valid_report, v8_valid_report, v9_valid_report, v10_valid_report, v3afs_valid_report, v3bfs_valid_report, m1_valid_report, o1_valid_report]
_test_reports = [v1_test_report, v2_test_report, v3a_test_report, v3b_test_report, v4_test_report, v5_test_report, v6_test_report, v7_test_report, v8_test_report, v9_test_report, v10_test_report, v3afs_test_report, v3bfs_test_report, m1_test_report, o1_test_report]

#Validation data (accuracy is read from the "support" cell of the "accuracy" row)
f1_valid = pd.DataFrame([rep.loc["weighted avg", "f1-score"] for rep in _valid_reports])
accuracy_valid = pd.DataFrame([rep.loc["accuracy", "support"] for rep in _valid_reports])
precision_valid = pd.DataFrame([rep.loc["weighted avg", "precision"] for rep in _valid_reports])
recall_valid = pd.DataFrame([rep.loc["weighted avg", "recall"] for rep in _valid_reports])

#Test data (same cells as for the validation reports)
f1_test = pd.DataFrame([rep.loc["weighted avg", "f1-score"] for rep in _test_reports])
accuracy_test = pd.DataFrame([rep.loc["accuracy", "support"] for rep in _test_reports])
precision_test = pd.DataFrame([rep.loc["weighted avg", "precision"] for rep in _test_reports])
recall_test = pd.DataFrame([rep.loc["weighted avg", "recall"] for rep in _test_reports])


#Combine with kappa results: start from the test kappa table and append one column per metric
results = kappa_test_results.copy()
_metric_columns = [
  ("Test: F1_weighted_avg", f1_test),
  ("Test: Accuracy", accuracy_test),
  ("Test: Precision", precision_test),
  ("Test: Recall", recall_test),
  ("Valid: Cohens_kappa", kappa_valid_results.iloc[:, 1]),
  ("Valid: F1_weighted_avg", f1_valid),
  ("Valid: Accuracy", accuracy_valid),
  ("Valid: Precision", precision_valid),
  ("Valid: Recall", recall_valid),
]
for _column, _values in _metric_columns:
  results.loc[:, _column] = _values

print(results)



In [None]:
#Save the combined metrics table twice: as pickle and as CSV (same base name)
results_v1_o1_all = results.copy()

results_v1_o1_all.to_pickle('0_Ergebnisse/Overall/V1_O1_220914_all_results')
results_v1_o1_all.to_csv('0_Ergebnisse/Overall/V1_O1_220914_all_results.csv')


### Number of predicted classes

In [None]:
#Function to get predicted classes and samples and the unpredicted ones.


def analyze_predictions(report, experiment, datatype):
  """Count the classes of a classification report that were (never) recognised.

  Parameters
  ----------
  report : pandas.DataFrame
      One row per class with at least the columns "recall" and "support".
      The last three rows are treated as summary rows and ignored
      (assumed to be accuracy / macro avg / weighted avg - confirm for new report layouts).
  experiment : label written to the "Experiment" column.
  datatype : label written to the "Data" column (e.g. "Valid" or "Test").

  Returns
  -------
  pandas.DataFrame
      A single row with the columns Experiment, Data, PredictedClasses
      (classes with support and recall != 0), UnpredictedClasses (classes with
      support but recall == 0) and UnpredictedSamples (summed support of the
      unpredicted classes). The input report is not modified.
  """
  #Drop last 3 rows (accuracy, etc.)
  per_class = report.iloc[:-3]

  #Classes without support do not exist in the dataset and are ignored in both counts
  has_support = per_class["support"] != 0
  zero = per_class[(per_class["recall"] == 0.0) & has_support]
  notzero = per_class[(per_class["recall"] != 0.0) & has_support]

  #Unpredicted samples: all samples of classes that were never predicted correctly
  unpredicted_samples = zero['support'].sum()

  #Unpredicted classes: classes which were never predicted correctly, although they existed
  unpredicted_classes_sum = len(zero)

  #Predicted classes: classes with at least one correctly predicted sample
  predicted_classes_sum = len(notzero)

  #Make data frame
  df = pd.DataFrame([[experiment, datatype, predicted_classes_sum, unpredicted_classes_sum, unpredicted_samples]], columns = ['Experiment','Data','PredictedClasses', 'UnpredictedClasses', 'UnpredictedSamples'])

  #Return
  return df




In [None]:
#Prediction analysis of the validation reports (one summary row per experiment)

v1_analyze_valid = analyze_predictions(v1_valid_report, "V1", "Valid")
v2_analyze_valid = analyze_predictions(v2_valid_report, "V2", "Valid")
v3a_analyze_valid = analyze_predictions(v3a_valid_report, "V3a", "Valid")
v3b_analyze_valid = analyze_predictions(v3b_valid_report, "V3b", "Valid")
v4_analyze_valid = analyze_predictions(v4_valid_report, "V4", "Valid")
v5_analyze_valid = analyze_predictions(v5_valid_report, "V5", "Valid")
v6_analyze_valid = analyze_predictions(v6_valid_report, "V6", "Valid")
v7_analyze_valid = analyze_predictions(v7_valid_report, "V7", "Valid")
v8_analyze_valid = analyze_predictions(v8_valid_report, "V8", "Valid")
v9_analyze_valid = analyze_predictions(v9_valid_report, "V9", "Valid")
v10_analyze_valid = analyze_predictions(v10_valid_report, "V10", "Valid")
v3afs_analyze_valid = analyze_predictions(v3afs_valid_report, "V3afs", "Valid") #with feature selection
v3bfs_analyze_valid = analyze_predictions(v3bfs_valid_report, "V3bfs", "Valid") #with feature selection
m1_analyze_valid = analyze_predictions(m1_valid_report, "M1fs", "Valid") #with feature selection
o1_analyze_valid = analyze_predictions(o1_valid_report, "O1", "Valid")

#Stack the one-row summaries in experiment order
_valid_rows = [
  v1_analyze_valid, v2_analyze_valid, v3a_analyze_valid, v3b_analyze_valid,
  v4_analyze_valid, v5_analyze_valid, v6_analyze_valid, v7_analyze_valid,
  v8_analyze_valid, v9_analyze_valid, v10_analyze_valid,
  v3afs_analyze_valid, v3bfs_analyze_valid, m1_analyze_valid, o1_analyze_valid,
]
report_analyze_valid = pd.concat(_valid_rows, axis = 0)
print(report_analyze_valid)


#Prediction analysis of the test reports (same experiments, same order)

v1_analyze_test = analyze_predictions(v1_test_report, "V1", "Test")
v2_analyze_test = analyze_predictions(v2_test_report, "V2", "Test")
v3a_analyze_test = analyze_predictions(v3a_test_report, "V3a", "Test")
v3b_analyze_test = analyze_predictions(v3b_test_report, "V3b", "Test")
v4_analyze_test = analyze_predictions(v4_test_report, "V4", "Test")
v5_analyze_test = analyze_predictions(v5_test_report, "V5", "Test")
v6_analyze_test = analyze_predictions(v6_test_report, "V6", "Test")
v7_analyze_test = analyze_predictions(v7_test_report, "V7", "Test")
v8_analyze_test = analyze_predictions(v8_test_report, "V8", "Test")
v9_analyze_test = analyze_predictions(v9_test_report, "V9", "Test")
v10_analyze_test = analyze_predictions(v10_test_report, "V10", "Test")
v3afs_analyze_test = analyze_predictions(v3afs_test_report, "V3afs", "Test") #with feature selection
v3bfs_analyze_test = analyze_predictions(v3bfs_test_report, "V3bfs", "Test") #with feature selection
m1_analyze_test = analyze_predictions(m1_test_report, "M1fs", "Test") #with feature selection
o1_analyze_test = analyze_predictions(o1_test_report, "O1", "Test")

_test_rows = [
  v1_analyze_test, v2_analyze_test, v3a_analyze_test, v3b_analyze_test,
  v4_analyze_test, v5_analyze_test, v6_analyze_test, v7_analyze_test,
  v8_analyze_test, v9_analyze_test, v10_analyze_test,
  v3afs_analyze_test, v3bfs_analyze_test, m1_analyze_test, o1_analyze_test,
]
report_analyze_test = pd.concat(_test_rows, axis = 0)
print(report_analyze_test)

#Bind both reports side by side; the first column of the validation table is left out
_valid_without_first_column = report_analyze_valid.iloc[:, 1:]
report_analyze = pd.concat([report_analyze_test, _valid_without_first_column], axis = 1)
report_analyze


In [None]:
#Save the combined test/validation prediction analysis as pickle and as CSV

report_analyze.to_pickle('0_Ergebnisse/Overall/V1_O1_220914_report_analyze')
report_analyze.to_csv('0_Ergebnisse/Overall/V1_O1_220914_report_analyze.csv')

In [None]:
#Display the v3bfscw validation report as the cell output
v3bfscw_valid_report

In [None]:
#Prediction analysis of the v3bfscw validation report, labelled as experiment "C2.3"
v3bfscw_analyze_valid = analyze_predictions(v3bfscw_valid_report, experiment = "C2.3", datatype = "Valid")

print(v3bfscw_analyze_valid)

In [None]:
#Prediction analysis of the v3bfscw test report, labelled as experiment "C2.3"
v3bfscw_analyze_test = analyze_predictions(v3bfscw_test_report, experiment = "C2.3", datatype = "Test")

print(v3bfscw_analyze_test)

### Confusion matrices

In [None]:
# Prepare data for export (matrices are plotted in R)

def dataexportvalid_cm(results, classlist):
  """Build the export table for a validation confusion matrix.

  Parameters
  ----------
  results : pandas.DataFrame
      Must contain the columns "Y_valid" (true label) and "Pred_valid"
      (predicted label); rows without a validation entry hold NaN.
  classlist : class names; a frame is built from it and its index (the position
      for a plain list) is matched against the integer labels.

  Returns
  -------
  pandas.DataFrame
      Columns Y_valid, TrueClasses, Pred_valid, PredClasses with one row per
      validation sample. Labels without an entry in classlist get NaN as class
      name. The table is also printed.
  """
  #Keep only rows where BOTH the true and the predicted label exist. The two columns
  #are paired by position below, so filtering them independently could shift them
  #against each other.
  paired = results['Y_valid'].notna() & results["Pred_valid"].notna()
  true = results.loc[paired, 'Y_valid'].astype(int)
  pred = results.loc[paired, "Pred_valid"].astype(int)

  #Map true accident types
  true_classlist = pd.DataFrame({"TrueClasses": classlist})
  true_classlist["Y_valid"] = true_classlist.index

  #Left merge keeps the sample order and resets the index to 0..n-1
  true_classes = pd.merge(true, true_classlist, on = "Y_valid", how = "left")

  #Map pred accident types
  pred_classlist = pd.DataFrame({"PredClasses": classlist})
  pred_classlist["Pred_valid"] = pred_classlist.index

  pred_classes = pd.merge(pred, pred_classlist, on = "Pred_valid", how = "left")

  #Combine to final dataframe
  export = pd.concat([true_classes, pred_classes], axis = 1)
  print(export)

  #Return
  return export


Create export data

In [None]:
#Export table (true vs. predicted class names) for the V3a validation predictions
v3a_valid_preds = dataexportvalid_cm(v3a_results, classes_list)

In [None]:
#Export table (true vs. predicted class names) for the V3b validation predictions
v3b_valid_preds = dataexportvalid_cm(v3b_results, classes_list)

In [None]:
#Export table for the D1 validation predictions; uses the separate class list classes_list_freq
d1fs_valid_preds = dataexportvalid_cm(d1fs_results, classes_list_freq)

Save export data

In [None]:
#Save the export tables as CSV for creating diagrams in R with ggplot2 (optional)

#B1.4 (V3a validation predictions)
v3a_valid_preds.to_csv('0_Ergebnisse/Overall/V3a_B1.4_220708_valid_preds.csv')

#C2.1 (V3b validation predictions)
v3b_valid_preds.to_csv('0_Ergebnisse/Overall/V3b_C2.1_220629_valid_preds.csv')


#D1 (D1fs validation predictions)
d1fs_valid_preds.to_csv('0_Ergebnisse/Overall/D1fs_D1_221005_valid_preds.csv')

### WordCloud

#### Overall word cloud

In [None]:
!pip install wordcloud

In [None]:
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt



In [None]:
#Stack the train, test and valid frames row-wise into one frame that no longer distinguishes between the splits
#NOTE(review): the original index values of the three frames are kept, so index labels may repeat
overall_data = pd.concat([train_bbgcpure_three, test_bbgcpure_three, valid_bbgcpure_three], axis = 0)


In [None]:
#Prepare data for wordcloud

text = overall_data.Description
#NOTE(review): the text below is built from the TRAINING descriptions only; `text`
#(all splits) is not used in this cell - confirm which of the two is intended
textnp = train_bbgcpure_three.Description.to_numpy()

#Join all descriptions into one string, separated by single spaces.
#np.array2string would return the array's printed representation, i.e. it adds
#brackets, quotes around every description and escape sequences, which end up
#glued to the first/last word of each description after splitting.
string = " ".join(map(str, textnp))
#Split string into single words
word_list = string.split()


In [None]:
#Words that carry no information for the word cloud
remove = "die der das und den dem des Die Der Den Dem Des Das ihrem seinem mehr es Es kam wird wurde ein "
remove_list = remove.split()

#Extend the STOPWORDS set imported from the wordcloud package in place
STOPWORDS.update(remove_list)

#Configure the cloud (at most 100 words), then generate it from the joined text
_cloud = WordCloud(
  relative_scaling = 0.5,
  colormap = "gray",
  background_color = "white",
  normalize_plurals = False,
  width = 1600,
  height = 800,
  max_words = 100,
)
wordcloud = _cloud.generate(string)

#Render the cloud without axes and without padding
plt.figure(figsize = (20, 10), facecolor = 'k')
plt.imshow(wordcloud, interpolation = "bilinear")
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
#plt.savefig('0_Ergebnisse/Overall/220713_Wordcloud_traindata.png')

In [None]:
#Get most frequent words from word list

#Count how often each token of word_list occurs
word_count = Counter(word_list)

#Keep the 100 most common tokens as (word, count) pairs and display them as the cell output
most_frequent = word_count.most_common(100)
most_frequent

#### Words: 201 | 221 | 222 | 231

201 = Label 0 |
221 = Label 11 |
222 = Label 12 |
231 = Label 17 |

In [None]:
#Prepare the word lists per accident type (training data only)

def _descriptions_of(label):
  #Descriptions of all training rows whose AccidentType equals the given label
  return train_bbgcpure_three[train_bbgcpure_three['AccidentType'] == label].Description


#Accident type 201 = label 0, 221 = label 11, 222 = label 12, 231 = label 17
text_train_201 = _descriptions_of(0)
text_train_222 = _descriptions_of(12)
text_train_221 = _descriptions_of(11)
text_train_231 = _descriptions_of(17)

#Get text
textnp_201 = text_train_201.to_numpy()
textnp_221 = text_train_221.to_numpy()
textnp_222 = text_train_222.to_numpy()
textnp_231 = text_train_231.to_numpy()

#Join the descriptions of each type into one string, separated by single spaces.
#np.array2string would return the array's printed representation (brackets, quotes
#around every description, escape sequences); those characters stick to the words
#after splitting, so e.g. a stopword at the start of a description is no longer matched.
string_201 = " ".join(map(str, textnp_201))
string_221 = " ".join(map(str, textnp_221))
string_222 = " ".join(map(str, textnp_222))
string_231 = " ".join(map(str, textnp_231))


#Split string into single words
wordlist_201 = string_201.split()
wordlist_221 = string_221.split()
wordlist_222 = string_222.split()
wordlist_231 = string_231.split()

In [None]:
#Remove Stopwords

#Space-separated stopwords; matching is case-sensitive, hence both spellings (e.g. "die" and "Die") are listed
stopwords = "die der das und den dem des Die Der Den Dem Des Das ihrem seinem mehr es Es kam wird wurde ein Beteiligter"
stopword_list = stopwords.split()


def remove_stopwords(wordlist, stopword_list):
  """Return a new list with all words of wordlist that are not stopwords.

  Parameters
  ----------
  wordlist : words to filter; the input is left unchanged.
  stopword_list : words to drop; matching is exact (case-sensitive).

  Returns
  -------
  list
      The remaining words in their original order, duplicates included.
  """
  #A set gives constant-time membership tests, so the list is filtered in a single
  #pass instead of rescanning it once per removed occurrence
  blocked = set(stopword_list)

  return [word for word in wordlist if word not in blocked]


#Stopword-free word lists, one per accident type
wordlistclean_201 = remove_stopwords(wordlist_201, stopword_list)
wordlistclean_221 = remove_stopwords(wordlist_221, stopword_list)
wordlistclean_222 = remove_stopwords(wordlist_222, stopword_list)
wordlistclean_231 = remove_stopwords(wordlist_231, stopword_list)

In [None]:
#Count how often every remaining word occurs, separately per accident type
counter_201 = Counter(wordlistclean_201)
counter_221 = Counter(wordlistclean_221)
counter_222 = Counter(wordlistclean_222)
counter_231 = Counter(wordlistclean_231)

#Show the counters in the order 201, 221, 222, 231
for _counter in (counter_201, counter_221, counter_222, counter_231):
  print(_counter)


In [None]:
#Extract the top words of every accident type

freq = 17 #Number of most frequent words

#Lists of (word, count) pairs, most frequent first
most_201 = counter_201.most_common(freq)
most_221 = counter_221.most_common(freq)
most_222 = counter_222.most_common(freq)
most_231 = counter_231.most_common(freq)

#Show the lists in the order 201, 221, 222, 231
for _top_words in (most_201, most_221, most_222, most_231):
  print(_top_words)

In [None]:
#Turn the (word, count) lists into data frames with type-specific column names

_records_to_frame = pd.DataFrame.from_records

mostpd_201 = _records_to_frame(most_201, columns = ['Word201', 'Freq201'])
mostpd_221 = _records_to_frame(most_221, columns = ['Word221', 'Freq221'])
mostpd_222 = _records_to_frame(most_222, columns = ['Word222', 'Freq222'])
mostpd_231 = _records_to_frame(most_231, columns = ['Word231', 'Freq231'])

#Place the four frames next to each other
_frames = [mostpd_201, mostpd_221, mostpd_222, mostpd_231]
most_all = pd.concat(_frames, axis = 1)
print(most_all)

In [None]:
#Save the side-by-side table of most frequent words as CSV
#NOTE(review): the file name hard-codes "17", which matches freq = 17 in the earlier cell - rename when freq changes
most_all.to_csv(('0_Ergebnisse/Overall/17MostFrequentWords_201_221_222_231_trainingdata.csv'))

In [None]:
#Most frequent words of every accident type as sets
freqlist_201 = set(mostpd_201.Word201)
freqlist_221 = set(mostpd_221.Word221)
freqlist_222 = set(mostpd_222.Word222)
freqlist_231 = set(mostpd_231.Word231)

#Words shared by two accident types (201/231, 222/231 and 221/222)
int201_231 = list(freqlist_201 & freqlist_231)
print(int201_231)
int222_231 = list(freqlist_222 & freqlist_231)
print(int222_231)
int221_222 = list(freqlist_221 & freqlist_222)
print(int221_222)

#Symmetric difference: words that appear in exactly one of the two sets
diff201_231 = freqlist_201.symmetric_difference(freqlist_231)
diff222_231 = freqlist_222.symmetric_difference(freqlist_231)
diff221_222 = freqlist_221.symmetric_difference(freqlist_222)
print("diff221_222")
print(diff221_222)