In [None]:
# Libraries import
import pandas as pd
from io import StringIO
from scipy import stats
import numpy as np

While loading the data I ran into rows with too many commas. Therefore, for every row that does not split into exactly 13 values, I collapse doubled commas and strip the commas at the end of the line.

In [2]:
# Data load
# Some rows of the raw file carry surplus commas (doubled separators or
# trailing ones), so the file is normalised line by line before parsing.
EXPECTED_FIELD_COUNT = 13  # index column + 12 data columns

cleaned_lines = []
with open("modelowanie_pricing_EH.csv", "r") as f:
    for line in f:
        # Strip the line terminator up front so every stored line is
        # newline-free; the join below then never produces empty lines and
        # parsing no longer depends on read_csv skipping blank lines.
        line = line.rstrip("\n")
        if len(line.split(",")) != EXPECTED_FIELD_COUNT:
            # NOTE: collapsing ",," also removes a genuinely empty field on
            # such a row, which shifts the following values one column left.
            line = line.replace(",,", ",").rstrip(",")
        cleaned_lines.append(line)

# Convert cleaned data into a StringIO object for pandas
cleaned_csv = StringIO("\n".join(cleaned_lines))

# Read into pandas; the first CSV column becomes the row index
df_freq = pd.read_csv(cleaned_csv, index_col=0)

In [3]:
# Data overview: first rows of the loaded frame
df_freq.head()

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
0,1.0,1.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
1,3.0,1.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
2,5.0,1.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22
3,10.0,1.0,0.09,,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
4,11.0,1.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_freq.describe()

Unnamed: 0,IDpol,ClaimNb,Exposure,VehPower,VehAge,DrivAge,BonusMalus,Density
count,678013.0,678012.0,678012.0,678012.0,678012.0,678012.0,678012.0,678012.0
mean,2621857.0,0.053245,0.528749,6.454632,7.044251,45.499102,59.761506,1792.42028
std,1641783.0,0.240115,0.364441,2.050907,5.666225,14.137445,15.636669,3958.649096
min,1.0,0.0,0.002732,4.0,0.0,18.0,50.0,1.0
25%,1157951.0,0.0,0.18,5.0,2.0,34.0,50.0,92.0
50%,2272152.0,0.0,0.49,6.0,6.0,44.0,50.0,393.0
75%,4046274.0,0.0,0.99,7.0,11.0,55.0,64.0,1658.0
max,6114330.0,16.0,2.01,15.0,100.0,100.0,230.0,27000.0


The dataset contains 12 features, of which:
 - one is a row key (`IDpol`); it has no predictive value and will be removed
 - 7 are numerical
 - 4 are categorical

In [11]:
# Column names grouped by type; together the two lists cover the 11 columns
# that remain once the IDpol key is left out.
numerical = ["ClaimNb", "Exposure", "VehPower", "VehAge", "DrivAge", "BonusMalus", "Density"]
categorical = ["Area", "VehBrand", "VehGas", "Region"]

In [5]:
# Removing IDpol: it is only a row key and carries no predictive value.
# Re-assigning with errors="ignore" (instead of an in-place drop) keeps the
# cell idempotent, so running it a second time does not raise KeyError.
df_freq = df_freq.drop(columns="IDpol", errors="ignore")

In [6]:
# Null values overview: number of missing values per column
df_freq.isnull().sum()

ClaimNb       1
Exposure      1
Area          1
VehPower      1
VehAge        1
DrivAge       1
BonusMalus    1
VehBrand      1
VehGas        1
Density       1
Region        1
dtype: int64

In [7]:
# Show every row that has a missing value in at least one column
df_freq[df_freq.isnull().any(axis=1)]

Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
3,1.0,0.09,,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
567,,0.85,C,7.0,0.0,45.0,50.0,B12,Regular,309.0,R73
32341,0.0,0.32,C,7.0,11.0,47.0,72.0,B2,,168.0,R91
45732,0.0,1.0,A,,1.0,50.0,50.0,B3,Diesel,15.0,R24
71923,0.0,1.0,C,4.0,8.0,67.0,50.0,B5,Regular,432.0,
84919,0.0,1.0,C,5.0,5.0,30.0,,B1,Diesel,163.0,R24
99992,0.0,1.0,E,6.0,6.0,64.0,50.0,B2,Diesel,,R82
173492,0.0,,A,5.0,7.0,33.0,50.0,B13,Diesel,31.0,R24
184622,0.0,0.5,A,9.0,,47.0,79.0,B5,Regular,7.0,R93
184711,1.0,0.5,D,7.0,10.0,,50.0,B2,Regular,1440.0,R93


There are 11 rows with null values in the dataset, which is approximately 0.0016% of the whole dataset. A potential solution is to use an imputation method such as KNNImputer, or to replace those values with the median. However, this amount of data won't have much impact on model training, so to keep the computation simple I will simply delete the rows with missing values.

In [8]:
# Remove, in place, every row that has at least one missing value ...
df_freq.dropna(axis=0, how="any", inplace=True)
# ... and list the rows that still contain a missing value (expected: none)
df_freq.loc[df_freq.isna().any(axis="columns")]

Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region


In [None]:
# exploring outliers
def find_outliers_IQR(df: pd.DataFrame, threshold_IQR = 1.5):
    """Print an IQR-based outlier report for each column of ``df``.

    For every column the fences ``Q1 - threshold_IQR * IQR`` and
    ``Q3 + threshold_IQR * IQR`` are computed, and the number of values
    outside them is printed (total, above the upper fence, below the lower
    fence). A column for which this raises ``TypeError`` is skipped.

    Args:
        df: Frame whose columns are inspected one by one.
        threshold_IQR: Multiplier applied to the inter-quartile range.

    Returns:
        None. The report is written to standard output only.
    """
    for column_name in df.columns:
        try:
            series = df[column_name]
            lower_quartile = series.quantile(0.25)
            upper_quartile = series.quantile(0.75)
            spread = upper_quartile - lower_quartile
            min_bond = lower_quartile - threshold_IQR * spread
            max_bond = upper_quartile + threshold_IQR * spread
            # One boolean mask per side; counting True values gives the
            # number of outliers without materialising the outlier subset.
            too_low = series < min_bond
            too_high = series > max_bond
            print("Outliers in column: ", column_name)
            print(f"Bonds: {min_bond} / {max_bond}")
            print("Number of outliers: ", int((too_low | too_high).sum()))
            print("Number of outliers upper bond: ", int(too_high.sum()))
            print("Number of outliers lower bond: ", int(too_low.sum()))
            print("------------------------------")
        except TypeError:
            # Quantiles / comparisons are not defined for this column
            continue

def find_outliers_z_score(df: pd.DataFrame, threshold_z = 2):
    """Print a z-score based outlier report for each numeric column of ``df``.

    A value counts as an outlier when the absolute value of its z-score
    (computed with ``scipy.stats.zscore``, missing values omitted) is
    strictly greater than ``threshold_z``. Non-numeric columns are skipped.

    Args:
        df: Frame whose columns are inspected one by one.
        threshold_z: Absolute z-score above which a value is an outlier.

    Returns:
        None. The report is written to standard output only.
    """
    for column_name in df.columns:
        column = df[column_name]
        # A z-score is undefined for text/categorical data; skip such columns
        # explicitly instead of relying on an exception being raised.
        if not pd.api.types.is_numeric_dtype(column):
            continue
        values = column.to_numpy(dtype=float, na_value=np.nan)
        # nan_policy="omit" keeps a single missing value from turning the
        # whole column's z-scores into NaN.
        z = np.asarray(stats.zscore(values, nan_policy="omit"))
        is_outlier = np.abs(z) > threshold_z
        above_mean = is_outlier & (z > 0)
        below_mean = is_outlier & (z < 0)
        print("Outliers in column: ", column_name)
        print(f"Z-score threshold: {threshold_z}")
        print("Number of outliers: ", int(is_outlier.sum()))
        print("Number of outliers above mean: ", int(above_mean.sum()))
        print("Number of outliers below mean: ", int(below_mean.sum()))
        print("------------------------------")

In [43]:
# Run the IQR outlier report on the whole frame; columns for which the
# quantile computation raises TypeError are skipped inside the function.
find_outliers_IQR(df_freq)

Outliers in column:  ClaimNb
Bonds: 0.0 / 0.0
Number of outliers:  34057
Number of outliers upper bond:  34057
Number of outliers lower bond:  0
------------------------------
Outliers in column:  Exposure
Bonds: -1.0350000000000001 / 2.205
Number of outliers:  0
Number of outliers upper bond:  0
Number of outliers lower bond:  0
------------------------------
Outliers in column:  VehPower
Bonds: 2.0 / 10.0
Number of outliers:  35071
Number of outliers upper bond:  35071
Number of outliers lower bond:  0
------------------------------
Outliers in column:  VehAge
Bonds: -11.5 / 24.5
Number of outliers:  3114
Number of outliers upper bond:  3114
Number of outliers lower bond:  0
------------------------------
Outliers in column:  DrivAge
Bonds: 2.5 / 86.5
Number of outliers:  1275
Number of outliers upper bond:  1275
Number of outliers lower bond:  0
------------------------------
Outliers in column:  BonusMalus
Bonds: 29.0 / 85.0
Number of outliers:  62384
Number of outliers upper bond:

In [None]:
# TODO: outliers
# TODO: distribution of the variables
# TODO: add claim frequency
# TODO: examine correlation
# TODO: continuous variables in categorical variables (possibly binning continuous features; intent to be confirmed)