<a href="https://colab.research.google.com/github/JuanLara18/Actuarial-DataMined-Provisiones/blob/main/notebooks/Data_Understanding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

data = pd.read_csv("wkcomp_pos.csv")

In [2]:
# Display basic information about the dataset
print("Dataset Information:")
print("Number of Records:", data.shape[0])
print("Number of Attributes:", data.shape[1])

Dataset Information:
Number of Records: 13200
Number of Attributes: 13


In [3]:
# Display data types of attributes
print("\nData Types of Attributes:")
print(data.dtypes)


Data Types of Attributes:
GRCODE                int64
GRNAME               object
AccidentYear          int64
DevelopmentYear       int64
DevelopmentLag        int64
IncurLoss_D           int64
CumPaidLoss_D         int64
BulkLoss_D            int64
EarnedPremDIR_D       int64
EarnedPremCeded_D     int64
EarnedPremNet_D       int64
Single                int64
PostedReserve97_D     int64
dtype: object


In [4]:
# Check unique values and coding schemes for specific attributes
print("\nCoding Schemes:")
print("Number of unique values in 'GRCODE':", len(data['GRCODE'].unique()))
print("Number of unique values in 'GRNAME':", len(data['GRNAME'].unique()))
print("Number of unique values in 'AccidentYear':", len(data['AccidentYear'].unique()))
print("Number of unique values in 'DevelopmentYear':", len(data['DevelopmentYear'].unique()))


Coding Schemes:
Number of unique values in 'GRCODE': 132
Number of unique values in 'GRNAME': 132
Number of unique values in 'AccidentYear': 10
Number of unique values in 'DevelopmentYear': 19


In [5]:
# Extract de information in DataFrames
Important_Variables = data.columns.delete([0,1,2,3,4,-1,-2,-3,-4,-5])
DataComplete = {column : [] for column in Important_Variables}
size = data.shape[0]

for Variable in Important_Variables:
  count = 0
  while(size - 100*count > 1):
    base_Variable = np.zeros((10,10))
    for i in range(10):
      for j in range(10):
        base_Variable[i][j] = data[Variable][i*10+j + 10*count]
    count = count + 1
    DataComplete[Variable].append(pd.DataFrame(base_Variable))

In [6]:
# Construct the Triangles
def IncompleteDataFrame(x):
  matrix = x.values
  for i in range(9,0,-1):
    for j in range(10-i,10,1):
        matrix[i][j] = 0
  return pd.DataFrame(matrix)

Triangles = {}
for variable, array in DataComplete.items():
  Triangles[variable] = [IncompleteDataFrame(x) for x in array]

In [7]:
# Find the basics statistics component by component for each variable
Means = {}
for variable, array in Triangles.items():
  Means[variable] = pd.concat(array).groupby(level=0).mean()

Var = {}
for variable, array in Triangles.items():
  Var[variable] = pd.concat(array).groupby(level=0).var()

Min = {}
for variable, array in Triangles.items():
  Min[variable] = pd.concat(array).groupby(level=0).min()

Max = {}
for variable, array in Triangles.items():
  Max[variable] = pd.concat(array).groupby(level=0).max()

Median = {}
for variable, array in Triangles.items():
  Median[variable] = pd.concat(array).groupby(level=0).median()

In [8]:
# Define the function to plot the series
def DrawGraph(x, name):
  matrix = x.values
  plt.figure(figsize=(9, 7))
  plt.plot()

  for i in range(10):
    val = matrix[i]
    for j in range(i):
      val = val[:-1]
    rang = range(1988,1988+len(val))
    plt.plot(rang, val, marker='o', linestyle='-', label=1988+i)

  plt.xlabel('Years')
  plt.ylabel('Value')
  plt.title(name)

  plt.savefig(name, dpi=300, bbox_inches='tight')

  plt.legend()

In [None]:
# Export the graphs of the basis statistics
for var in Important_Variables:
  DrawGraph(Means[var], 'Mean: ' + var)

for var in Important_Variables:
  DrawGraph(Var[var], 'Variance: ' + var)

for var in Important_Variables:
  DrawGraph(Median[var], 'Median: ' + var)

In [52]:
# PCA
from sklearn.decomposition import PCA

def dataframe_to_pca(dataframes, n_components):

  X = np.array([])
  for i in range(len(dataframes)):
    flattened_vector = dataframes[i].values.flatten()
    X = np.append(X, flattened_vector)

  X = X.reshape(len(dataframes), -1)

  pca = PCA(n_components=n_components)
  pca_result = pca.fit_transform(X)

  return pca_result

cp = dataframe_to_pca(Triangles['BulkLoss_D'], 3)

In [53]:
cp

array([[ 2.66298870e+05, -5.81457155e+04, -1.99504561e+04],
       [ 2.24260936e+05, -7.96192811e+04,  7.10807243e+03],
       [ 1.82804271e+05, -8.30475742e+04,  5.01798499e+04],
       [ 1.38168302e+05, -6.39597896e+04,  7.77335474e+04],
       [ 8.96626856e+04, -2.84211346e+04,  6.86342084e+04],
       [ 5.93489601e+04,  6.77649170e+03,  5.73751852e+04],
       [ 4.48622257e+04,  3.81777958e+04,  4.58050430e+04],
       [ 4.26543340e+04,  6.28429695e+04,  3.23710095e+04],
       [ 4.72903972e+04,  7.86555479e+04,  1.47463309e+04],
       [ 6.50981266e+04,  7.51172639e+04,  1.03319164e+04],
       [ 8.34984875e+04,  6.15968270e+04,  4.92184691e+03],
       [ 8.71247884e+04,  4.63117697e+04, -2.39578966e+04],
       [ 8.79804767e+04,  2.14688789e+04, -4.58610595e+04],
       [ 8.73285752e+04, -8.66881338e+03, -4.44643568e+04],
       [ 8.63908674e+04, -4.02105113e+04, -1.72300806e+04],
       [ 7.15822396e+04, -5.45990437e+04,  7.80878901e+03],
       [ 4.63060630e+04, -5.20523037e+04

In [51]:
len(cp)

132