In [1]:
#import the packages that we might use
import os
import sys
from time import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import imblearn
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.model_selection import cross_val_score,KFold
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.linear_model import ElasticNet,SGDRegressor,BayesianRidge,LogisticRegression,LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import utils
import pytest

<font face="Times New Roman" size=5 color=#000000>Load the data and build the models</font>

In [2]:
# Read the already-processed dataset from CSV into a single DataFrame.
# Alternative extracts that were used previously (swap the file name to use them):
#   '1m_each_processed.csv'
#   'data_df_15million.csv'
df = pd.read_csv(
    filepath_or_buffer='10m_each_processed.csv',
    low_memory=False,
)

In [None]:
# Divide the data into the feature frame (X) and the target frame (Y).
feature_columns = [
    'EESTATU', 'REGION', 'PROC1_CATEGORIES',
    'PAY', 'NETPAY', 'COB', 'COINS', 'DEDUCT',
    'AGE', 'SEX', 'REVCODE', 'TIME', 'INDSTRY',
    'UNITS', 'EGEOLOC', 'HLTHPLAN', 'QTY', 'RX', 'YEAR',
    'DIAGNOSIS_OVERLAY',
]
target_columns = ['CLAIM']

# pd.DataFrame(frame, columns=...) selects the listed columns in the given order.
X_data = pd.DataFrame(df, columns=feature_columns)
Y_data = pd.DataFrame(df, columns=target_columns)

In [None]:
# Feature engineering shared by all datasets: one-hot encode the categorical columns.
def prep_dataset(dataset, columns=None):
    """Append one-hot (dummy) columns for the categorical features of ``dataset``.

    For every name in ``columns`` a block of indicator columns named
    ``<column>_<value>`` is created with ``pd.get_dummies`` (all levels kept,
    ``drop_first=False``) and appended to the right of the existing columns.
    The source categorical columns are left in the returned frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing every column listed in ``columns``.
    columns : list of str, optional
        Columns to encode. Defaults to the ten categorical features used by
        this notebook.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the dummy columns, in
        the order given by ``columns``. The input frame is not modified.

    Raises
    ------
    KeyError
        If a requested column is not present in ``dataset``.
    """
    if columns is None:
        columns = ['EESTATU','PROC1_CATEGORIES','REGION','REVCODE','SEX','INDSTRY','EGEOLOC','HLTHPLAN','RX','YEAR']

    # Build every indicator block first and concatenate once. Concatenating
    # inside the loop would re-copy the whole (growing) frame per column.
    dummy_blocks = [
        pd.get_dummies(dataset[name], prefix = name, drop_first = False)
        for name in columns
    ]

    return pd.concat([dataset] + dummy_blocks, axis = 1)

# One-hot encode the feature frame. A copy is handed to prep_dataset and the
# result is bound back to X_data.
# NOTE: this cell is not idempotent - X_data still contains the source
# categorical columns afterwards, so running it a second time appends the
# same dummy columns again.
X_data = prep_dataset(X_data.copy())

print('The dataset has been cleaned and prepared.')

In [None]:
# Split features and labels into a fitting portion and a held-out portion.
# NOTE: train_test_split returns (train, test) pairs, so despite their names
# X_test / y_test are the portion the models below are *fitted* on, while
# X_val / y_val are the held-out portion they are scored on.
split_parts = train_test_split(X_data, Y_data, random_state = 1912)
X_test, X_val, y_test, y_val = split_parts

print('The testing and test_validation datasets and labels have been split.')

In [None]:
# Drop unused columns from datasets.
def drop_unused(dataset):
    """Return ``dataset`` without the raw categorical columns.

    The ten columns removed here are the same ones that ``prep_dataset``
    one-hot encodes, so only their dummy columns (and the remaining
    features) stay in the result.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains all ten categorical columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without those columns; the input is not modified.

    Raises
    ------
    KeyError
        If any of the columns is missing from ``dataset``.
    """
    # These have been replaced with one-hot encoding.
    encoded_columns = [
        'SEX', 'REVCODE', 'REGION', 'PROC1_CATEGORIES', 'INDSTRY',
        'EGEOLOC', 'EESTATU', 'HLTHPLAN', 'RX', 'YEAR',
    ]

    # A single drop avoids creating ten intermediate copies of the frame.
    return dataset.drop(columns = encoded_columns)

# Remove the raw categorical columns from both splits (each call works on a
# copy of the split; the fitting split is processed first, then the
# validation split).
X_test, X_val = (drop_unused(part.copy()) for part in (X_test, X_val))

print('Columns that will not be used for testing have been dropped.')

<font face="Times New Roman" size=5 color=#000000>Test the five models</font>

In [None]:
# Decision tree regressor: fit on the fitting split (X_test / y_test) and
# evaluate on the held-out split (X_val / y_val).
tree = DecisionTreeRegressor(random_state = 1024)

# Flatten the single-column target frame to 1-D. The target is used as-is:
# truncating it to int for the fit only (as was done before) made the tree
# learn a different target than the one it is scored and cross-validated on.
y_fit = np.ravel(y_test)

start = time()
tree.fit(X_test, y_fit)
end = time()
train_time = (end - start) * 1000   # seconds -> milliseconds

# Predictions on the held-out split (not used by the metrics below).
prediction = tree.predict(X_val)

# R-square on the held-out split. For regressors, .score() returns the
# coefficient of determination, not a classification accuracy.
score = tree.score(X_val, y_val)

# 5-fold cross-validated R-square on the fitting split.
kf = KFold(n_splits = 5)
score_2 = cross_val_score(tree, X_test, y_fit, cv = kf)

# Adjusted R-square on the held-out split.
# X_val holds only predictor columns, so p is the full column count.
n = len(X_val)
p = len(X_val.columns)
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Decision tree model took {:.2f} milliseconds to fit.'.format(train_time))
print('R-Square: {:.2f}%'.format(score * 100))
print('Adjusted R-Square: {:.2f}%'.format(adj_score * 100))
print('Cross-validated R-Square (5-fold mean): {:.2f}%'.format(score_2.mean() * 100))

In [None]:
# Random forest regressor: fit on the fitting split (X_test / y_test) and
# evaluate on the held-out split (X_val / y_val).
start = time()
forest = RandomForestRegressor(n_estimators = 100,
                                criterion = 'squared_error',
                                bootstrap = True,
                                oob_score = True,
                                random_state = 1024)

forest.fit(X_test, np.ravel(y_test))
end = time()
train_time = (end - start) * 1000   # seconds -> milliseconds

# Predictions on the held-out split (not used by the metrics below).
prediction = forest.predict(X_val)

# R-square on the held-out split. For regressors, .score() returns the
# coefficient of determination, not a classification accuracy.
score = forest.score(X_val, y_val)

# Adjusted R-square on the held-out split.
# X_val holds only predictor columns, so p is the full column count.
n = len(X_val)
p = len(X_val.columns)
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Random Forest model took {:.2f} milliseconds to fit.'.format(train_time))
print('R-Square: {:.2f}%'.format(score * 100))
print('Adjusted R-Square: {:.2f}%'.format(adj_score * 100))
# oob_score=True makes the forest compute an out-of-bag R-square during
# fitting; report it instead of discarding it.
print('Out-of-bag R-Square: {:.2f}%'.format(forest.oob_score_ * 100))

In [None]:
# Ordinary least-squares baseline: fit on the fitting split (X_test / y_test)
# and evaluate on the held-out split (X_val / y_val).
reg = LinearRegression()
reg.fit(X_test,y_test)

# For regressors, .score() returns R-square (coefficient of determination),
# not a classification accuracy.
score = reg.score(X_val,y_val)

# Adjusted R-square on the held-out split.
# X_val holds only predictor columns, so p is the full column count.
n = len(X_val)
p = len(X_val.columns)
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Linear Regression R-Square: {:.2f}%'.format(score * 100))
print('Adjusted R-Square: {:.2f}%'.format(adj_score * 100))

In [None]:
# Gradient boosting regressor: fit on the fitting split (X_test / y_test) and
# evaluate on the held-out split (X_val / y_val).
# Time the fit in this cell; previously the message below printed whatever
# train_time an earlier cell had left behind.
start = time()
# 'squared_error' is the current name of the least-squares loss (the old
# alias 'ls' is rejected by recent scikit-learn releases) and matches the
# criterion name already used for the random forest in this notebook.
clf = GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'squared_error')
# Flatten the single-column target frame to 1-D, as the estimator expects.
clf.fit(X_test, np.ravel(y_test))
end = time()
train_time = (end - start) * 1000   # seconds -> milliseconds

# For regressors, .score() returns R-square (coefficient of determination),
# not a classification accuracy.
score = clf.score(X_val,y_val)

# Adjusted R-square on the held-out split.
# X_val holds only predictor columns, so p is the full column count.
n = len(X_val)
p = len(X_val.columns)
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

print('Gradient Boosting Regression model took {:.2f} milliseconds to fit.'.format(train_time))
print('R-Square: {:.2f}%'.format(score * 100))
print('Adjusted R-Square: {:.2f}%'.format(adj_score * 100))

In [None]:
# Ridge regression: fit on the fitting split (X_test / y_test) and evaluate on
# the held-out split (X_val / y_val).
start = time()
ridge = Ridge(alpha=1.0)
ridge.fit(X_test,y_test)
end = time()
train_time = (end - start) * 1000   # seconds -> milliseconds

# Predictions on the held-out split (not used by the metrics below).
prediction = ridge.predict(X_val)

# R square score using the validation data. For regressors, .score() returns
# the coefficient of determination, not a classification accuracy.
score = ridge.score(X_val, y_val)

# Adjusted R square score using the validation data.
# X_val holds only predictor columns, so p is the full column count.
n = len(X_val)
p = len(X_val.columns)
adj_score = 1 - (1 - score)*(n-1)/(n-p-1)

# Print all the indicators that we need
print('Ridge Regression model took {:.2f} milliseconds to fit.'.format(train_time))
print('R-Square: {:.2f}%'.format(score * 100))
print('Adjusted R-Square: {:.2f}%'.format(adj_score * 100))