## Importing packages and data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the claim-frequency table from CSV and preview its first rows.
freq_data = pd.read_csv(filepath_or_buffer="freMTPL2freq.csv")
freq_data.head()

Unnamed: 0,IDpol,ClaimNb,Exposure,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Area,Density,Region
0,1.0,1,0.1,5,0,55,50,B12,Regular,D,1217,Rhone-Alpes
1,3.0,1,0.77,5,0,55,50,B12,Regular,D,1217,Rhone-Alpes
2,5.0,1,0.75,6,2,52,50,B12,Diesel,B,54,Picardie
3,10.0,1,0.09,7,0,46,50,B12,Diesel,B,76,Aquitaine
4,11.0,1,0.84,7,0,46,50,B12,Diesel,B,76,Aquitaine


In [5]:
# Read the claim-severity table (one ClaimAmount per row) from CSV and preview it.
sev_data = pd.read_csv(filepath_or_buffer="freMTPL2sev.csv")
sev_data.head()

Unnamed: 0,IDpol,ClaimAmount
0,1552,995.2
1,1010996,1128.12
2,4024277,1851.11
3,4007252,1204.0
4,4046424,1204.0


In [7]:
# Collapse the severity records to one row per policy ID (IDpol).
# as_index=False keeps IDpol as a regular column, so no reset_index() is needed.
sev_grouped = sev_data.groupby("IDpol", as_index=False).agg(
    # Number of claim records filed under the given ID
    ClaimCount=pd.NamedAgg(column="ClaimAmount", aggfunc="count"),
    # Sum of the claim amounts filed under the given ID
    TotalClaimAmount=pd.NamedAgg(column="ClaimAmount", aggfunc="sum"),
)

sev_grouped.head()


Unnamed: 0,IDpol,ClaimCount,TotalClaimAmount
0,139,1,303.0
1,190,1,1981.84
2,414,1,1456.55
3,424,2,10834.0
4,463,1,3986.67


In [9]:
# Summarise freq_data: per-column dtype, non-null count, and memory usage.
freq_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 678013 entries, 0 to 678012
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   IDpol       678013 non-null  float64
 1   ClaimNb     678013 non-null  int64  
 2   Exposure    678013 non-null  float64
 3   VehPower    678013 non-null  int64  
 4   VehAge      678013 non-null  int64  
 5   DrivAge     678013 non-null  int64  
 6   BonusMalus  678013 non-null  int64  
 7   VehBrand    678013 non-null  object 
 8   VehGas      678013 non-null  object 
 9   Area        678013 non-null  object 
 10  Density     678013 non-null  int64  
 11  Region      678013 non-null  object 
dtypes: float64(2), int64(6), object(4)
memory usage: 62.1+ MB


In [11]:
# IDpol is parsed as float64 but is a policy identifier. Cast it to an explicit
# int64 so it has the same dtype as sev_data's IDpol column; a bare 'int' can
# resolve to a 32-bit integer on some platform / NumPy combinations.
# astype() would silently truncate fractional values, so verify first that
# every ID is a whole number. The cell stays safe to re-run on integer data.
assert (freq_data['IDpol'] % 1 == 0).all(), "IDpol contains non-integer values"
freq_data['IDpol'] = freq_data['IDpol'].astype('int64')

In [16]:
# Summarise sev_data: per-column dtype, non-null count, and memory usage.
sev_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26639 entries, 0 to 26638
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   IDpol        26639 non-null  int64  
 1   ClaimAmount  26639 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 416.4 KB


## Exploratory Data Analysis

In [None]:
sns.set_style("ticks")

# ClaimNb is an integer count, so draw one unit-wide bin centred on each value
# (discrete=True). Explicit edges from range(0, max + 1) stop at `max`, and
# because the last histogram bin is closed on the right, the two largest
# counts (max - 1 and max) would be merged into a single bar; the bars would
# also sit between the integer ticks instead of on them.
max_claims = int(freq_data["ClaimNb"].max())

# Explicit figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(freq_data["ClaimNb"], discrete=True, kde=False, color="navy", ax=ax)

ax.set_title("Histogram of Claim Counts (ClaimNb)", fontsize=14)
ax.set_xlabel("Number of Claims", fontsize=12)
ax.set_ylabel("Number of Policies", fontsize=12)
# One tick per possible claim count, from 0 up to the observed maximum.
ax.set_xticks(range(0, max_claims + 1))

sns.despine(fig=fig)

plt.show()

In [34]:
# Tabulate how many rows of freq_data carry each distinct ClaimNb value.
freq_data.groupby("ClaimNb")["ClaimNb"].agg("count")

ClaimNb
0     643953
1      32178
2       1784
3         82
4          7
5          2
6          1
8          1
9          1
11         3
16         1
Name: ClaimNb, dtype: int64

In [24]:
# sev_grouped (one row per IDpol with ClaimCount and TotalClaimAmount) is
# already built from sev_data in the data-loading section, so the identical
# groupby is not repeated here. Instead, validate the aggregate before it is
# used in the analysis:
#   - each policy ID appears exactly once after grouping;
#   - every severity record is accounted for in the per-policy claim counts.
assert sev_grouped["IDpol"].is_unique, "IDpol should be unique after grouping"
assert sev_grouped["ClaimCount"].sum() == len(sev_data), "claim records lost in aggregation"

# A bare last expression gives the rich DataFrame display; print() would fall
# back to the plain-text repr.
sev_grouped.head()

   IDpol  ClaimCount  TotalClaimAmount
0    139           1            303.00
1    190           1           1981.84
2    414           1           1456.55
3    424           2          10834.00
4    463           1           3986.67
