## Machine Learning Analysis USA:
#### In this Jupyter Notebook you will find the process of exploratory Analysis of the file Clean_Insurance_USA.csv.

In [61]:
import pandas as pd
import numpy as np

In [62]:
usa = pd.read_csv('../Data/Clean_data/Clean_Insurance_USA.csv', index_col=0) #dataframe saved in us

In [63]:
usa.columns #Inspecting columns

Index(['Customer', 'State', 'Coverage', 'Education', 'Job_Status', 'Gender',
       'Income', 'Location', 'Civil_Status', 'Monthly_Price',
       'Months_LastClaim', 'Months_SinceActivation', 'Number_Open_Complaints',
       'Number_Insurances', 'Policy_Type', 'Sales_Channel', 'Car_Type',
       'Car_Size'],
      dtype='object')

Transformation of categorical data into boolean variables so that I can apply algorithms to it. I have decided to work with Civil_Status, Location, Policy_Type and Education because I think they are the treats that may influence having an accident.

In [18]:
#Gender dummy
dummy = pd.get_dummies(usa, columns = ['Civil_Status','Location', 'Policy_Type', 'Education'], drop_first = True)
usa_dummy = pd.concat([usa,dummy], axis=1)

Number_Open_Complaints is the feature that we want to predict, it is expressed in how many accidents did that person had in the last year, but for simplification, I will translate that into whether a customer had 1 or more accidents (1) or not (0).

In [19]:
usa['Number_Open_Complaints'] = usa.Number_Open_Complaints.apply(lambda x: 0 if x==0 else 1)

Separate our data in train and test, to check if our predictions are right or not.

In [20]:
from sklearn.model_selection import train_test_split #importing library

In [21]:
#test size of 0.2 and decided to use stratify to make sure the proportion of 0 and 1 in train and test is
#the same, to avoid bias on splitting the dataset.
X_train, X_test, y_train, y_test = train_test_split(usa_dummy[['Civil_Status_Married', 'Civil_Status_Single',
                                                               'Location_Suburban', 'Location_Urban', 
                                                               'Policy_Type_Personal Auto',
                                                               'Policy_Type_Special Auto', 'Education_College',
                                                               'Education_Doctor','Education_High School or Below', 
                                                               'Education_Master']], 
                                                    usa[['Number_Open_Complaints']],
                                                    test_size=0.2, stratify = usa[['Number_Open_Complaints']])

#### Building Supervised Learning algorithms.
This is a case of supervised learning since we are trying to predict one outcome that is on the dataset and hence, I will build algorithms that maximizes the number of True Positives, people that might have an accident, always looking at the accuracy of the model.

In [22]:
from sklearn import tree
from sklearn.metrics import confusion_matrix
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [23]:
prediction = clf.predict(X_test)

In [24]:
confusion_matrix(y_test, prediction) #All values are on the left (all Positives).

array([[1451,    0],
       [ 376,    0]])

In [64]:
#PREGUNTA A NACHO: Los positives son la gente que no tendrá accidente, no?
#PCA se debería hacer al principio de todo, o más tarde?

In [26]:
clf.score(X_test, y_test)*100

79.41981390257253

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train, y_train.values.reshape(-1,))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [29]:
y_pred = knn.predict(X_test)

In [32]:
confusion_matrix(y_test, y_pred) #More distributed values, but low accuracy.

array([[1296,  155],
       [ 332,   44]])

In [31]:
knn.score(X_test, y_test)*100 

73.34428024083196

In [41]:
knn = KNeighborsClassifier(n_neighbors=6, metric='euclidean')
knn.fit(X_train, y_train.values.reshape(-1,))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='uniform')

In [42]:
y_pred = knn.predict(X_test)

In [43]:
confusion_matrix(y_test, y_pred)

array([[1332,  119],
       [ 337,   39]])

In [44]:
knn.score(X_test, y_test)*100 #Accuracy of 75%. Tested for 7 n_neighbors, went down to 71%

75.04105090311987

In [46]:
knn = KNeighborsClassifier(n_neighbors=6, metric='cosine')
knn.fit(X_train, y_train.values.reshape(-1,))
y_pred = knn.predict(X_test)
confusion_matrix(y_test, y_pred) #75 accuracy

array([[1230,  221],
       [ 330,   46]])

In [47]:
knn.score(X_test, y_test)*100 #Accuracy of 69% with different distance metric

69.84126984126983

In [48]:
from sklearn.svm import LinearSVC

In [49]:
svc = LinearSVC()

In [50]:
svc.fit(X_train, y_train.values.reshape(-1,))

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [51]:
y_predi = svc.predict(X_test)
confusion_matrix(y_test, y_predi)

array([[1451,    0],
       [ 376,    0]])

In [52]:
from sklearn.linear_model import LogisticRegression

ks_model = LogisticRegression().fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [53]:
y_pred_test = ks_model.predict(X_test)
#y_pred_test
confusion_matrix(y_test, y_pred_test)

array([[1451,    0],
       [ 376,    0]])

In [55]:
from sklearn.decomposition import PCA 
  
pca = PCA(n_components = 5) 
  
X_train = pca.fit_transform(X_train) 
X_test = pca.transform(X_test) 
  
explained_variance = pca.explained_variance_ratio_ 

In [56]:
explained_variance #With these components, 83% of the variance is explained.

array([0.27254246, 0.18902251, 0.1643694 , 0.12513085, 0.08691207])

In [58]:
from sklearn.linear_model import LogisticRegression   
  
classifier = LogisticRegression(random_state = 0) 
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
y_pred = classifier.predict(X_test) 

In [60]:
confusion_matrix(y_test, y_pred)

array([[1451,    0],
       [ 376,    0]])

In [13]:
from keras import models
from keras.models import Sequential
from keras.layers import Dense, Activation
accidents = Sequential()
accidents.add(Dense(units=200, kernel_initializer='uniform', activation='relu', input_dim=10))
accidents.add(Dense(units=100, kernel_initializer='uniform', activation='relu'))
accidents.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
accidents.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
accidents.fit(X_train, y_train, batch_size=5, epochs=10)
val_loss, val_acc = accidents.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {val_loss}')
print(f'Test accuracy: {val_acc}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 0.529344087457422
Test accuracy: 0.7941981554031372


In [None]:
#Imbalance classes Target tenga mas 1 que 0, stratify en el split test o sino con oversampling y undersampling
#Overfitting confusion matrix para ver si predigo perfectamente o no --> Cross validation or k-folds para evitar oversampling
#Probar otros modelos 

In [None]:
clf.score(X_test, y_test)