# Civil cases

In [1]:
import os
import datetime
import re
import operator

import numpy as np
import pandas as pd

from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.max_columns', 200)

### Reading & parsing CSV

In [3]:
data_dir = "data"
file_name = "Izskatitas_C1_Kat3_2017_lietas_07.csv"
data_source_path = os.path.join(data_dir, file_name)

In [4]:
df = pd.read_csv(data_source_path, 
                 parse_dates=["Ienākšanas datums", "Izskatīšanas datums"],
                 dtype={"Dzimšanas datums": str},
                 index_col="Lietas ID (caseID)")

In [5]:
def make_float(x):
    try:
        return float(x)
    except ValueError:
        return np.nan

def fix_century(x):
    if x > datetime.datetime(2000, 1, 1):
        return x - relativedelta(years=100)
    else:
        return x

In [6]:
df["Prasījuma apmērs"] = df.apply(lambda item: make_float(item["Prasījuma apmērs"]), axis=1)

df["Būtība"] = df["Būtība"].astype(str)
df["Tiesas ID (courtID)"] = df["Tiesas ID (courtID)"].astype(str)

df["Dzimšanas datums"] = pd.to_datetime(df["Dzimšanas datums"], format="%d%m%y", errors="coerce")
df["Dzimšanas datums"] = df.apply(lambda item: fix_century(item["Dzimšanas datums"]), axis=1)

### Overall stats

In [7]:
df.shape

(16455, 17)

In [8]:
df.head()

Unnamed: 0_level_0,Tiesas ID (courtID),Tiesa,Dzimšanas datums,Ienākšanas kārtība,Ienākšanas datums,Izskatīšanas datums,Būtība,Prasījuma apmērs,atb_fiz,atb_jur,pras_fiz,pras_jur,Nozīmētās sēdes,Notikušās sēdes,naudaS_sods,dzivo_arzemes,atbr_no_nodevas
Lietas ID (caseID),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
213477,40,Valkas rajona tiesa,1959-01-06,Pirmo reizi,2003-06-05,2017-01-23,parāda piedziņa,0.0,0,1,0,1,3,0,0,1,0
448971,32,Rīgas pilsētas Vidzemes priekšpilsētas tiesa,1967-07-05,Pirmo reizi,2005-11-29,2017-04-06,zaudējumu piedziņa,0.0,1,0,1,0,9,1,0,0,1
477470,279,Rīgas pilsētas Pārdaugavas tiesa,1951-05-01,Sakarā ar tiesu apvienošanu,2017-01-31,2017-10-02,parāda piedziņa,21414.34,2,0,1,0,5,3,0,0,0
575613,27,Liepājas tiesa,1963-04-03,Pirmo reizi,2007-06-04,2017-10-26,zaudējumu piedziņa,916.25,6,0,2,0,4,0,0,0,0
580131,32,Rīgas pilsētas Vidzemes priekšpilsētas tiesa,1974-01-25,Pēc piekritības no citas tiesas,2015-10-07,2017-11-13,parāda piedziņa,1588192.95,4,0,1,0,8,1,0,0,0


In [9]:
df.dtypes

Tiesas ID (courtID)            object
Tiesa                          object
Dzimšanas datums       datetime64[ns]
Ienākšanas kārtība             object
Ienākšanas datums      datetime64[ns]
Izskatīšanas datums    datetime64[ns]
Būtība                         object
Prasījuma apmērs              float64
atb_fiz                         int64
atb_jur                         int64
pras_fiz                        int64
pras_jur                        int64
Nozīmētās sēdes                 int64
Notikušās sēdes                 int64
naudaS_sods                     int64
dzivo_arzemes                   int64
atbr_no_nodevas                 int64
dtype: object

In [10]:
df.isnull().sum()

Tiesas ID (courtID)    0
Tiesa                  0
Dzimšanas datums       0
Ienākšanas kārtība     0
Ienākšanas datums      0
Izskatīšanas datums    0
Būtība                 0
Prasījuma apmērs       9
atb_fiz                0
atb_jur                0
pras_fiz               0
pras_jur               0
Nozīmētās sēdes        0
Notikušās sēdes        0
naudaS_sods            0
dzivo_arzemes          0
atbr_no_nodevas        0
dtype: int64

In [11]:
def categorical_stats(df):
    values_df = df.select_dtypes(include="object").nunique().to_frame(name="unique")
    values_df["values"] = values_df.apply(lambda item: df[item.name].unique().tolist(), axis=1)
    return values_df

In [12]:
categorical_stats(df)

Unnamed: 0,unique,values
Tiesas ID (courtID),30,"[40, 32, 279, 27, 25, 30, 26, 4, 5, 29, 34, 273, 19, 39, 23, 6, 16, 28, 1, 38, 10, 18, 2, 33, 8, 22, 7, 31, 37, 284]"
Tiesa,30,"[Valkas rajona tiesa, Rīgas pilsētas Vidzemes priekšpilsētas tiesa, Rīgas pilsētas Pārdaugavas tiesa, Liepājas tiesa..."
Ienākšanas kārtība,6,"[Pirmo reizi, Sakarā ar tiesu apvienošanu, Pēc piekritības no citas tiesas, Pēc lēmuma atcelšanas, CPL 32.1 panta kā..."
Būtība,1254,"[parāda piedziņa, zaudējumu piedziņa, par rokas naudas piedziņu, par parāda piedziņu, parāda piedziņa, prasības nodr..."


### Cleanup

In [13]:
df.dropna(subset=["Prasījuma apmērs"], inplace=True)
df.drop(labels=["Tiesa"], axis=1, inplace=True)

### Transformations

In [14]:
def escape_chars(text, chars, mask=" "):
    return text.translate({ord(c): mask for c in chars})


def escape_punctuation(text, mask=" "):
    punctuation = ".,;-()/'"
    return escape_chars(text, punctuation, mask=mask)


def escape_digits(text, mask=" "):
    digits = "0123456789"
    return escape_chars(text, digits, mask=mask)


def cleanup_text(x):
    x = x.lower()
    x = escape_punctuation(x)
    x = escape_digits(x)
    x = " ".join(x.split())
    return x

In [15]:
df["Būtība"] = df.apply(lambda item: cleanup_text(item["Būtība"]), axis=1)

In [16]:
categorical_stats(df)

Unnamed: 0,unique,values
Tiesas ID (courtID),30,"[40, 32, 279, 27, 25, 30, 26, 4, 5, 29, 34, 273, 19, 39, 23, 6, 16, 28, 1, 38, 10, 18, 2, 33, 8, 22, 7, 31, 37, 284]"
Ienākšanas kārtība,6,"[Pirmo reizi, Sakarā ar tiesu apvienošanu, Pēc piekritības no citas tiesas, Pēc lēmuma atcelšanas, CPL 32.1 panta kā..."
Būtība,1056,"[parāda piedziņa, zaudējumu piedziņa, par rokas naudas piedziņu, par parāda piedziņu, parāda piedziņa prasības nodro..."


In [17]:
# text cleanup reduces unique case definitions count from 1254 to 1056

In [18]:
df.rename({"atbr_no_nodevas": "atbr_no_nodevas_known"}, axis=1, inplace=True)

In [19]:
df["age"] = df.apply(lambda item: relativedelta(item["Ienākšanas datums"], item["Dzimšanas datums"]).years, axis=1)
df.drop(labels=["Dzimšanas datums"], axis=1, inplace=True)

In [20]:
df["creation_y"] = df.apply(lambda item: item["Ienākšanas datums"].year, axis=1)
df["creation_m"] = df.apply(lambda item: item["Ienākšanas datums"].month, axis=1)
df["creation_d"] = df.apply(lambda item: item["Ienākšanas datums"].day, axis=1)

df["review_y"] = df.apply(lambda item: item["Izskatīšanas datums"].year, axis=1)
df["review_m"] = df.apply(lambda item: item["Izskatīšanas datums"].month, axis=1)
df["review_d"] = df.apply(lambda item: item["Izskatīšanas datums"].day, axis=1)

In [21]:
def difference_in_days(d1, d2):
    rd = relativedelta(d1, d2)
    now = datetime.datetime.now()
    then = now - rd
    diff = now - then
    return diff.days

In [22]:
df["case_lifetime_days"] = df.apply(
    lambda item: difference_in_days(item["Izskatīšanas datums"], item["Ienākšanas datums"]), axis=1)

In [23]:
df.drop(labels=["Ienākšanas datums", "Izskatīšanas datums"], axis=1, inplace=True)

In [24]:
df.head()

Unnamed: 0_level_0,Tiesas ID (courtID),Ienākšanas kārtība,Būtība,Prasījuma apmērs,atb_fiz,atb_jur,pras_fiz,pras_jur,Nozīmētās sēdes,Notikušās sēdes,naudaS_sods,dzivo_arzemes,atbr_no_nodevas_known,age,creation_y,creation_m,creation_d,review_y,review_m,review_d,case_lifetime_days
Lietas ID (caseID),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
213477,40,Pirmo reizi,parāda piedziņa,0.0,0,1,0,1,3,0,0,1,0,44,2003,6,5,2017,1,23,4978
448971,32,Pirmo reizi,zaudējumu piedziņa,0.0,1,0,1,0,9,1,0,0,1,38,2005,11,29,2017,4,6,4146
477470,279,Sakarā ar tiesu apvienošanu,parāda piedziņa,21414.34,2,0,1,0,5,3,0,0,0,65,2017,1,31,2017,10,2,245
575613,27,Pirmo reizi,zaudējumu piedziņa,916.25,6,0,2,0,4,0,0,0,0,44,2007,6,4,2017,10,26,3795
580131,32,Pēc piekritības no citas tiesas,parāda piedziņa,1588192.95,4,0,1,0,8,1,0,0,0,41,2015,10,7,2017,11,13,767


In [25]:
print("dataset dimensions now: {}".format(df.shape))

dataset dimensions dimensions now: (16446, 21)


In [26]:
court_id_dummy = pd.get_dummies(df["Tiesas ID (courtID)"], prefix="courtid", prefix_sep="_")
entrance_dummy = pd.get_dummies(df["Ienākšanas kārtība"], prefix="entrance", prefix_sep="_")

In [27]:
df = pd.concat([df, court_id_dummy, entrance_dummy], axis=1)

In [28]:
df.drop(labels=["Tiesas ID (courtID)", "Ienākšanas kārtība"], axis=1, inplace=True)

In [29]:
df.head()

Unnamed: 0_level_0,Būtība,Prasījuma apmērs,atb_fiz,atb_jur,pras_fiz,pras_jur,Nozīmētās sēdes,Notikušās sēdes,naudaS_sods,dzivo_arzemes,atbr_no_nodevas_known,age,creation_y,creation_m,creation_d,review_y,review_m,review_d,case_lifetime_days,courtid_1,courtid_10,courtid_16,courtid_18,courtid_19,courtid_2,courtid_22,courtid_23,courtid_25,courtid_26,courtid_27,courtid_273,courtid_279,courtid_28,courtid_284,courtid_29,courtid_30,courtid_31,courtid_32,courtid_33,courtid_34,courtid_37,courtid_38,courtid_39,courtid_4,courtid_40,courtid_5,courtid_6,courtid_7,courtid_8,entrance_CPL 32.1 panta kārtībā (ātrākas izskatīšanas nodrošināšanai),entrance_Lietas atdalīšana,entrance_Pirmo reizi,entrance_Pēc lēmuma atcelšanas,entrance_Pēc piekritības no citas tiesas,entrance_Sakarā ar tiesu apvienošanu
Lietas ID (caseID),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
213477,parāda piedziņa,0.0,0,1,0,1,3,0,0,1,0,44,2003,6,5,2017,1,23,4978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
448971,zaudējumu piedziņa,0.0,1,0,1,0,9,1,0,0,1,38,2005,11,29,2017,4,6,4146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
477470,parāda piedziņa,21414.34,2,0,1,0,5,3,0,0,0,65,2017,1,31,2017,10,2,245,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
575613,zaudējumu piedziņa,916.25,6,0,2,0,4,0,0,0,0,44,2007,6,4,2017,10,26,3795,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
580131,parāda piedziņa,1588192.95,4,0,1,0,8,1,0,0,0,41,2015,10,7,2017,11,13,767,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [30]:
print("dataset dimensions now: {}".format(df.shape))

dataset dimensions dimensions now: (16446, 55)


### Last step of preprocessing, common for two further ML approaches

In [31]:
def list_unique_words(sentences):
    return set().union(*sentences)

In [32]:
unique_words = list_unique_words([x.split() for x in df["Būtība"].unique().tolist()])

In [33]:
print("unique words: {}".format(len(unique_words)))

unique words: 671


In [34]:
short_words = [x for x in unique_words if len(x) <= 3]

In [35]:
print("short words: {}".format(len(short_words)))
print(short_words)

short words: 28
['ls', 'm', 'arī', 'maz', 'tā', 'bez', 'bo', 'un', 'kā', 'deļ', 'k', 'to', 'nan', 'apr', 'par', 'vai', 'dēļ', 'no', 'kas', 'pār', 'pa', 'nar', 'uz', 'eur', 'tās', 'ar', 'lvl', 'uin']


In [36]:
meaningful_short_words = ["uin", "ls", "lvl", "eur"]

In [37]:
def escape_words(text, words):
    return " ".join([x for x in text.split() if x not in words])


def escape_short_words(text):
    return escape_words(text, set(short_words) - set(meaningful_short_words))

In [38]:
# escaping prepositions
df["Būtība"] = df.apply(lambda item: escape_short_words(item["Būtība"]), axis=1)

In [39]:
df.head()

Unnamed: 0_level_0,Būtība,Prasījuma apmērs,atb_fiz,atb_jur,pras_fiz,pras_jur,Nozīmētās sēdes,Notikušās sēdes,naudaS_sods,dzivo_arzemes,atbr_no_nodevas_known,age,creation_y,creation_m,creation_d,review_y,review_m,review_d,case_lifetime_days,courtid_1,courtid_10,courtid_16,courtid_18,courtid_19,courtid_2,courtid_22,courtid_23,courtid_25,courtid_26,courtid_27,courtid_273,courtid_279,courtid_28,courtid_284,courtid_29,courtid_30,courtid_31,courtid_32,courtid_33,courtid_34,courtid_37,courtid_38,courtid_39,courtid_4,courtid_40,courtid_5,courtid_6,courtid_7,courtid_8,entrance_CPL 32.1 panta kārtībā (ātrākas izskatīšanas nodrošināšanai),entrance_Lietas atdalīšana,entrance_Pirmo reizi,entrance_Pēc lēmuma atcelšanas,entrance_Pēc piekritības no citas tiesas,entrance_Sakarā ar tiesu apvienošanu
Lietas ID (caseID),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
213477,parāda piedziņa,0.0,0,1,0,1,3,0,0,1,0,44,2003,6,5,2017,1,23,4978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
448971,zaudējumu piedziņa,0.0,1,0,1,0,9,1,0,0,1,38,2005,11,29,2017,4,6,4146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
477470,parāda piedziņa,21414.34,2,0,1,0,5,3,0,0,0,65,2017,1,31,2017,10,2,245,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
575613,zaudējumu piedziņa,916.25,6,0,2,0,4,0,0,0,0,44,2007,6,4,2017,10,26,3795,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
580131,parāda piedziņa,1588192.95,4,0,1,0,8,1,0,0,0,41,2015,10,7,2017,11,13,767,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [40]:
categorical_stats(df)

Unnamed: 0,unique,values
Būtība,986,"[parāda piedziņa, zaudējumu piedziņa, rokas naudas piedziņu, parāda piedziņu, parāda piedziņa prasības nodrošinājums..."


In [41]:
# preposition escaping reduces unique definitions count to 986

In [42]:
endings = [
    "ajām$", "ajiem$", "ajā$", "iem$", 
    "as$", "ās$", "ai$", "es$", "am$", "ām$", "is$", "us$", 
    "a$", "ā$", "u$", "s$", "o$", "i$"
]


def stem_word(word):
    for ending in endings:
        stemmed = re.sub(ending, "", word)
        if stemmed != word:
            return stemmed
    return word


def stem_text(text):
    return " ".join([stem_word(x) for x in text.split()])

In [43]:
# home made stemming
df["Būtība"] = df.apply(lambda item: stem_text(item["Būtība"]), axis=1)

In [44]:
df.head()

Unnamed: 0_level_0,Būtība,Prasījuma apmērs,atb_fiz,atb_jur,pras_fiz,pras_jur,Nozīmētās sēdes,Notikušās sēdes,naudaS_sods,dzivo_arzemes,atbr_no_nodevas_known,age,creation_y,creation_m,creation_d,review_y,review_m,review_d,case_lifetime_days,courtid_1,courtid_10,courtid_16,courtid_18,courtid_19,courtid_2,courtid_22,courtid_23,courtid_25,courtid_26,courtid_27,courtid_273,courtid_279,courtid_28,courtid_284,courtid_29,courtid_30,courtid_31,courtid_32,courtid_33,courtid_34,courtid_37,courtid_38,courtid_39,courtid_4,courtid_40,courtid_5,courtid_6,courtid_7,courtid_8,entrance_CPL 32.1 panta kārtībā (ātrākas izskatīšanas nodrošināšanai),entrance_Lietas atdalīšana,entrance_Pirmo reizi,entrance_Pēc lēmuma atcelšanas,entrance_Pēc piekritības no citas tiesas,entrance_Sakarā ar tiesu apvienošanu
Lietas ID (caseID),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
213477,parād piedziņ,0.0,0,1,0,1,3,0,0,1,0,44,2003,6,5,2017,1,23,4978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
448971,zaudējum piedziņ,0.0,1,0,1,0,9,1,0,0,1,38,2005,11,29,2017,4,6,4146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
477470,parād piedziņ,21414.34,2,0,1,0,5,3,0,0,0,65,2017,1,31,2017,10,2,245,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
575613,zaudējum piedziņ,916.25,6,0,2,0,4,0,0,0,0,44,2007,6,4,2017,10,26,3795,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
580131,parād piedziņ,1588192.95,4,0,1,0,8,1,0,0,0,41,2015,10,7,2017,11,13,767,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [45]:
categorical_stats(df)

Unnamed: 0,unique,values
Būtība,836,"[parād piedziņ, zaudējum piedziņ, rok naud piedziņ, parād piedziņ prasīb nodrošinājum, mantisk tiesīb atzīšan spēk n..."


In [46]:
# stemming reduces unique definitions count to 836

In [47]:
unique_words = list_unique_words([x.split() for x in df["Būtība"].unique().tolist()])

In [48]:
print("unique words now: {}".format(len(unique_words)))

unique words now: 471


### Method 1. Case definition as categorical value

In [49]:
backup_df = df.copy()

In [50]:
case_def_dummy = pd.get_dummies(df["Būtība"], prefix="casedef", prefix_sep="_")
df = pd.concat([df, case_def_dummy], axis=1)
df.drop(labels=["Būtība"], axis=1, inplace=True)

In [51]:
df.shape

(16446, 890)

In [52]:
def set_target_last(df, target):
    cols = df.columns.values.tolist()[:]
    cols.remove(target);
    return df[cols + [target]]

In [53]:
df = set_target_last(df, target="case_lifetime_days")

In [54]:
X = df.values[:, :-1]
y = df.values[:, -1]

In [55]:
print(X.shape)
print(y.shape)

(16446, 889)
(16446,)


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [57]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(11512, 889)
(4934, 889)
(11512,)
(4934,)


In [58]:
rf = RandomForestRegressor(random_state=0)

In [59]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [60]:
def sum_values_by_prefix(d, prefix):
    total = sum([v for k, v in d.items() if k.startswith(prefix + "_")])
    clean_d = {k: v for k, v in d.items() if not k.startswith(prefix + "_")}
    clean_d[prefix] = total
    return clean_d

In [61]:
feature_names = df.columns.values.tolist()[:-1]
feature_importances = dict(zip(feature_names, rf.feature_importances_))

feature_importances = sum_values_by_prefix(feature_importances, prefix="courtid")
feature_importances = sum_values_by_prefix(feature_importances, prefix="entrance")
feature_importances = sum_values_by_prefix(feature_importances, prefix="casedef")

sorted_fe = sorted(feature_importances.items(), key=operator.itemgetter(1), reverse=True)

print("Feature importances:")

for name, value in sorted_fe:
    print("{:.4f}\t {}".format(value, name))

Feature importances:
0.8558	 creation_y
0.0606	 review_m
0.0591	 creation_m
0.0206	 Nozīmētās sēdes
0.0010	 creation_d
0.0009	 review_d
0.0007	 courtid
0.0003	 Prasījuma apmērs
0.0002	 Notikušās sēdes
0.0001	 age
0.0001	 pras_jur
0.0001	 pras_fiz
0.0001	 casedef
0.0001	 atb_jur
0.0001	 entrance
0.0000	 atb_fiz
0.0000	 dzivo_arzemes
0.0000	 atbr_no_nodevas_known
0.0000	 naudaS_sods
0.0000	 review_y


In [62]:
y_pred = rf.predict(X_test)

In [63]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mre = np.mean(np.absolute(y_test - y_pred) / y_test)

In [64]:
print("Errors")
print("MRE: {:.3f}".format(mre))
print("MAE: {:.3f}".format(mae))
print("MSE: {:.3f}".format(mse))

Errors
MRE: 0.021
MAE: 4.574
MSE: 173.063


#### Prediction example

In [65]:
n_ex = 10
pred_ex_df = pd.DataFrame(data=list(zip(y_test[:n_ex], y_pred[:n_ex])), columns=["Test", "Predicted"])

In [66]:
pred_ex_df

Unnamed: 0,Test,Predicted
0,70.0,70.3
1,132.0,133.5
2,107.0,105.9
3,120.0,121.7
4,416.0,424.0
5,58.0,59.0
6,67.0,66.6
7,139.0,138.5
8,140.0,139.7
9,99.0,98.2


### Method 2. NLP: case definition as bag of words

In [67]:
df = backup_df.copy()