## Dependencias

In [70]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score


from sklearn.feature_selection import SelectKBest
#from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neural_network import MLPClassifier

from functools import reduce
from category_encoders import OneHotEncoder

cf.go_offline()
pd.set_option('display.max_columns', None)

## Lectura de datos

In [4]:
ruta = "/media/jose/090f6b94-de30-4aaf-9f8a-4e18b120d7f6/bd/01. Simples/Insurance claim fraud/insurancefraud.csv"

In [20]:
datos = pd.read_csv(ruta)
datos.shape

(15420, 33)

In [21]:
datos.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [22]:
datos.value_counts('FraudFound_P', normalize=True)

FraudFound_P
0    0.940143
1    0.059857
Name: proportion, dtype: float64

## Selección de variables de decisión (vector de características $\vec{x}$)

In [23]:
var = ['Make',
'PastNumberOfClaims',
'DriverRating',
'AgeOfVehicle',
'Age']
vart = ['FraudFound_P']

In [24]:
datos = datos[var+vart]

In [25]:

datos.head()

Unnamed: 0,Make,PastNumberOfClaims,DriverRating,AgeOfVehicle,Age,FraudFound_P
0,Honda,none,1,3 years,21,0
1,Honda,none,4,6 years,34,0
2,Honda,1,3,7 years,47,0
3,Toyota,1,2,more than 7,65,0
4,Honda,none,1,5 years,27,0


### Normalización de variables discretas 

In [31]:
datos['Make'] = np.where(datos['Make'].isin(['Pontiac','Toyota','Honda','Mazda','Chevrolet']),datos['Make'],'Other')
datos['AgeOfVehicle'] = np.where(datos['AgeOfVehicle'].isin(['7 years','more than 7','6 years']),datos['AgeOfVehicle'],'<=5')

## Partición de datos

In [43]:
X = datos[var].copy()
y = datos[vart[0]].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10794, 5), (4626, 5), (10794,), (4626,))

### Codificación de categorías

In [44]:
vard = ['Make','PastNumberOfClaims','DriverRating','AgeOfVehicle']

In [45]:
oh = OneHotEncoder(cols=vard,use_cat_names=True)

In [46]:
oh.fit(X_train[vard])

In [47]:
oh.transform(X_train[vard])

Unnamed: 0,Make_Honda,Make_Toyota,Make_Mazda,Make_Chevrolet,Make_Pontiac,Make_Other,PastNumberOfClaims_more than 4,PastNumberOfClaims_1,PastNumberOfClaims_none,PastNumberOfClaims_2 to 4,DriverRating_2.0,DriverRating_1.0,DriverRating_3.0,DriverRating_4.0,AgeOfVehicle_<=5,AgeOfVehicle_7 years,AgeOfVehicle_6 years,AgeOfVehicle_more than 7
0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
3,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10789,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0
10790,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
10791,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1
10792,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0


In [49]:
X_train[oh.get_feature_names_out()] = oh.transform(X_train[vard])

In [50]:
X_train.head()

Unnamed: 0,Make,PastNumberOfClaims,DriverRating,AgeOfVehicle,Age,Make_Honda,Make_Toyota,Make_Mazda,Make_Chevrolet,Make_Pontiac,Make_Other,PastNumberOfClaims_more than 4,PastNumberOfClaims_1,PastNumberOfClaims_none,PastNumberOfClaims_2 to 4,DriverRating_2.0,DriverRating_1.0,DriverRating_3.0,DriverRating_4.0,AgeOfVehicle_<=5,AgeOfVehicle_7 years,AgeOfVehicle_6 years,AgeOfVehicle_more than 7
0,Honda,more than 4,2,<=5,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
1,Toyota,1,1,7 years,45,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
2,Honda,none,3,7 years,36,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
3,Mazda,1,1,7 years,62,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
4,Honda,2 to 4,2,6 years,50,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0


## Aprendizaje

### Modelos lineales 

#### Clasificador Regresión Logística

In [71]:
lr= LogisticRegression()

#### Híper-parametrización

In [75]:
params = {
    'C':np.arange(0.01,0.1,0.01),
    'fit_intercept': [True, False],
    'max_iter': range(100,1000,100),
    'tol': [1e-3, 1e-4, 1e-5]
}
reduce(lambda x,y: x*y, [len(i) for i in params.values()])

486

In [76]:
grid = GridSearchCV(estimator=lr, param_grid=params, cv=3, n_jobs=-1, verbose=1, scoring='roc_auc')

In [77]:
grid.fit(X_train.drop(vard,axis=1), y_train)

Fitting 3 folds for each of 486 candidates, totalling 1458 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [78]:
lr = grid.best_estimator_
lr

#### Aprendizaje de parámetros $\vec{\theta},\theta_0$

In [79]:
lr.fit(X_train.drop(vard,axis=1), y_train)

In [81]:
lr.intercept_, lr.coef_

(array([-2.63125423]),
 array([[-0.00503688, -0.00517055, -0.08623107, -0.10625485, -0.04218244,
         -0.08766188,  0.32661596, -0.39899298,  0.1186221 ,  0.35744858,
         -0.07796253, -0.03060562,  0.03924767,  0.03229051, -0.04181739,
          0.09845692, -0.09593094,  0.10865116, -0.11206198]]))

#### Generalización del modelo

In [82]:
X_test[oh.get_feature_names_out()] = oh.transform(X_test[vard])

In [83]:
accuracy_score(y_train, lr.predict(X_train.drop(vard,axis=1))),accuracy_score(y_test, lr.predict(X_test.drop(vard,axis=1)))

(0.9423753937372614, 0.9349329874621704)

In [84]:
display(confusion_matrix(y_train, lr.predict(X_train.drop(vard,axis=1))))
display(confusion_matrix(y_test, lr.predict(X_test.drop(vard,axis=1))))

array([[10172,     0],
       [  622,     0]])

array([[4325,    0],
       [ 301,    0]])

In [86]:
display(roc_auc_score(y_true=y_train, y_score=lr.predict_proba(X_train.drop(vard,axis=1))[:,1]))
display(roc_auc_score(y_true=y_test, y_score=lr.predict_proba(X_test.drop(vard,axis=1))[:,1]))

0.5929136220353963

0.5836817544600849