In [2]:
#import the packages that we might use
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import imblearn
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor 
from sklearn.linear_model import ElasticNet,SGDRegressor,BayesianRidge,LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn import utils

<font face="Times New Roman" size=5 color=#000000 > 
Load the data

In [19]:
#Load the data that have been processed.
df = pd.read_csv('10m_each_processed.csv',low_memory=False)

<font face="Times New Roman" size=5 color=#000000 > 
Build the models

In [29]:
#Devide the data into X and Y

X_data = pd.DataFrame(df, columns=['EESTATU', 'REGION','PROC1_CATEGORIES','PAY','NETPAY','COB','COINS','DEDUCT','AGE','SEX','REVCODE','TIME','INDSTRY',
'UNITS','EGEOLOC','HLTHPLAN','QTY','RX','YEAR','DIAGNOSIS_OVERLAY'])
Y_data = pd.DataFrame(df, columns = ['CLAIM'])

In [31]:
# Perform common cleaning and feature engineering tasks on datasets.
def prep_dataset(dataset):
    
    # ONE-HOT ENCODING
    
    cols = ['EESTATU','PROC1_CATEGORIES','REGION','REVCODE','SEX','INDSTRY','EGEOLOC','HLTHPLAN','RX','YEAR']
    
    for i in cols:
        dummies = pd.get_dummies(dataset[i], prefix = i, drop_first = False)
        dataset = pd.concat([dataset, dummies], axis = 1)

    return dataset

#X_train = prep_dataset(X_train.copy())

X_data = prep_dataset(X_data.copy())
#X_val = prep_dataset(X_val.copy())
print('The dataset has been cleaned and prepared.')

The dataset has been cleaned and prepared.


In [30]:
#Split the training and validation datasets and their labels.
X_train, X_val, y_train, y_val = train_test_split(X_data,Y_data,random_state = 1912)
print('The training and validation datasets and labels have been split.')

The training and validation datasets and labels have been split.


In [32]:
# Drop unused columns from datasets.
def drop_unused(dataset):

    # These have been replaced with one-hot encoding.
    dataset = dataset.drop(['SEX'], axis = 1)
    dataset = dataset.drop(['REVCODE'], axis = 1)
    dataset = dataset.drop(['REGION'], axis = 1)
    dataset = dataset.drop(['PROC1_CATEGORIES'], axis = 1)
    dataset = dataset.drop(['INDSTRY'], axis = 1)
    dataset = dataset.drop(['EGEOLOC'], axis = 1)
    dataset = dataset.drop(['EESTATU'], axis = 1)
    dataset = dataset.drop(['HLTHPLAN'], axis = 1)
    dataset = dataset.drop(['RX'], axis = 1)
    dataset = dataset.drop(['YEAR'], axis = 1)
    
    return dataset

X_train = drop_unused(X_train.copy())

X_val = drop_unused(X_val.copy())

print('Columns that will not be used for training have been dropped.')

Columns that will not be used for training have been dropped.


In [24]:
X_train.head()

Unnamed: 0,PAY,NETPAY,COB,COINS,DEDUCT,AGE,TIME,UNITS,QTY,DIAGNOSIS_OVERLAY,...,"EGEOLOC_(14.714, 28.429]","EGEOLOC_(28.429, 42.143]","EGEOLOC_(42.143, 55.857]","EGEOLOC_(55.857, 69.571]","EGEOLOC_(83.286, 97.0]",HLTHPLAN_0,RX_1,YEAR_2018,YEAR_2019,YEAR_2020
464007,0.0,0.0,0.0,0.0,0.0,81.0,1.0,1.0,1,4,...,1,0,0,0,0,1,1,0,1,0
485852,216.85,216.85,0.0,0.0,0.0,77.0,1.0,8.0,8,4,...,1,0,0,0,0,1,1,0,1,0
444149,205.2,205.2,0.0,0.0,0.0,65.0,1.0,1.0,1,1,...,1,0,0,0,0,1,1,0,1,0
482164,625.88,167.63,458.25,0.0,0.0,82.0,1.0,1.0,1,4,...,1,0,0,0,0,1,1,0,1,0
488905,0.0,0.0,0.0,0.0,0.0,86.0,1.0,1.0,1,3,...,1,0,0,0,0,1,1,0,1,0


In [25]:
X_val.head()

Unnamed: 0,PAY,NETPAY,COB,COINS,DEDUCT,AGE,TIME,UNITS,QTY,DIAGNOSIS_OVERLAY,...,"EGEOLOC_(14.714, 28.429]","EGEOLOC_(28.429, 42.143]","EGEOLOC_(42.143, 55.857]","EGEOLOC_(55.857, 69.571]","EGEOLOC_(83.286, 97.0]",HLTHPLAN_0,RX_1,YEAR_2018,YEAR_2019,YEAR_2020
462956,0.0,0.0,0.0,0.0,0.0,72.0,1.0,1.0,1,4,...,1,0,0,0,0,1,1,0,1,0
251373,22.99,4.6,18.39,0.0,0.0,73.0,1.0,1.0,1,4,...,1,0,0,0,0,1,1,0,0,1
69498,0.0,0.0,0.0,0.0,0.0,77.0,1.0,1.0,1,1,...,0,0,1,0,0,1,1,0,0,1
63702,0.0,0.0,0.0,0.0,0.0,82.0,1.0,1.0,1,3,...,0,0,1,0,0,1,1,0,0,1
266326,0.0,0.0,0.0,0.0,0.0,93.0,1.0,1.0,1,4,...,0,0,1,0,0,1,1,0,0,1


In [26]:
y_val.head()

Unnamed: 0,CLAIM
462956,0.0
251373,18.39
69498,0.0
63702,0.0
266326,0.0


<font face="Times New Roman" size=5 color=#000000 > 
Predict 'claim': Desicion Tree Regression

In [33]:
from time import time
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,KFold

tree = DecisionTreeRegressor(random_state = 1024)

start = time()
tree.fit(X_train, np.ravel(y_train.astype(int)))
end = time()
train_time = (end - start) * 1000

prediction = tree.predict(X_val)

kf = KFold(n_splits = 5)

# Score using the validation data.
score = tree.score(X_val, y_val)
score_2 = cross_val_score(tree,X_train,y_train,cv = kf)

#Adjusted R square score using the validation data
n = len(X_val)
p = len(X_val.columns) - 1
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Decision tree model took {:.2f} milliseconds to fit.'.format(train_time))
print('Accuracy: {:.2f}%'.format(score * 100))
print('Adjusted R-Square Accuracy: {:.2f}%'.format(adj_score * 100))

Decision tree model took 2440.71 milliseconds to fit.
Accuracy: 99.46%
Adjusted R-Square Accuracy: 99.46%


<font face="Times New Roman" size=5 color=#000000 > 
Predict 'claim': Random Forest Regression

In [34]:
from sklearn.ensemble import RandomForestRegressor

start = time()
forest = RandomForestRegressor(n_estimators = 100,
                                criterion = 'squared_error',
                                bootstrap = True,
                                oob_score = True,
                                random_state = 1024)

forest.fit(X_train, np.ravel(y_train))
end = time()
train_time = (end - start) * 1000

prediction = forest.predict(X_val)

kf = KFold(n_splits = 5)

# Score using the validation data.
score = forest.score(X_val, y_val)

#Adjusted R square score using the validation data
n = len(X_val)
p = len(X_val.columns) - 1
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Random Forest model took {:.2f} milliseconds to fit.'.format(train_time))
print('Accuracy: {:.2f}%'.format(score * 100))
print('Adjusted R-Square Accuracy: {:.2f}%'.format(adj_score * 100))

Random Forest model took 233413.74 milliseconds to fit.
Accuracy: 99.48%
Adjusted R-Square Accuracy: 99.48%


<font face="Times New Roman" size=5 color=#000000 >
Prediction: Simple Linear Regression（Multiple dimensions)

In [35]:
reg = LinearRegression()
reg.fit(X_train,y_train)
score = reg.score(X_val,y_val)

n = len(X_val)
p = len(X_val.columns) - 1
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Accuracy: {:.2f}%'.format(score * 100))
print('Adjusted R-Square Accuracy: {:.2f}%'.format(adj_score * 100))

Accuracy: 100.00%
Adjusted R-Square Accuracy: 100.00%


<font face="Times New Roman" size=5 color=#000000 >
Prediction: Gradient Boosting Regression（Multiple dimensions)

In [36]:
from sklearn import ensemble
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'ls')
clf.fit(X_train, y_train)
score = clf.score(X_val,y_val)

n = len(X_val)
p = len(X_val.columns) - 1
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Gradient Boosting Regression model took {:.2f} milliseconds to fit.'.format(train_time))
print('Accuracy: {:.2f}%'.format(score * 100))
print('Adjusted R-Square Accuracy: {:.2f}%'.format(adj_score * 100))

  y = column_or_1d(y, warn=True)


Gradient Boosting Regression model took 233413.74 milliseconds to fit.
Accuracy: 99.66%
Adjusted R-Square Accuracy: 99.66%


<font face="Times New Roman" size=5 color=#000000 >
Prediction: The Ridge Regression

In [37]:
from time import time
from sklearn.linear_model import Ridge

start = time()
ridge = Ridge(alpha=1.0)
ridge.fit(X_train,y_train)
end = time()
train_time = (end - start) * 1000

prediction = ridge.predict(X_val)

# R square score using the validation data.
score = ridge.score(X_val, y_val)

#Adjusted R square score using the validation data
n = len(X_val)
p = len(X_val.columns) - 1
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

#Print all the indicators that we need
print('Ridge Regression model took {:.2f} milliseconds to fit.'.format(train_time))
print('Accuracy: {:.2f}%'.format(score * 100))
print('Adjusted accuracy: {:.2f}%'.format(adj_score * 100))

Ridge Regression model took 67.94 milliseconds to fit.
Accuracy: 100.00%
Adjusted accuracy: 100.00%
