In [None]:
import numpy as np
import pandas as pd
import re
import sklearn.metrics as metrics
import time

from collections import Counter

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, plot_tree

from sklearn.linear_model import LogisticRegression
from sklearn import tree,svm
from sklearn.metrics import classification_report, precision_score, recall_score,accuracy_score,confusion_matrix,roc_curve,auc
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score, KFold
from sklearn.base import TransformerMixin,BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

from matplotlib import pyplot as plt
from tqdm import tqdm

from IPython.display import display, clear_output

### Generational Data Subset
__Overview:__ In an earlier statistical analysis, we discovered a normally distributed generational trend with the mode at the age of 23 and drifting to 43 over the 20 year interval. The purpose of the generation_data() function is to get a subset of the patient data defined by one of two criteria:
1. Include patient data with age within 2 standard deviations of the mean of the yearly opiate prescription distribution over the specified time interval
2. Only include patient data with age equal to 'start'

In [None]:
def generation_data(df,marching=True,start=1,end=33,plott=False):
    
    '''
    Return the rows of `df` that belong to one generation (age cohort).

    Input:
    -----------
    df        :  Pandas Dataframe - The data to get a subset of. Must contain 'YEAR' and 'AGE'
                                    columns ('received2' as well when 'plott' is True)
    marching  :  Boolean          - If True, keep patients with 'start' <= AGE <= 'end' in the
                                    earliest year and shift that window up by one year of age for
                                    every later year present in the data.
                                    If False, keep only patients whose AGE equals 'start',
                                    in every year ('end' is ignored)
    start     :  Integer          - Youngest age to include (the exact age when 'marching' is False)
    end       :  Integer          - Oldest age to include (only used when 'marching' is True)
    plott     :  Boolean          - If True, plot AGE histograms of each yearly subset, first for
                                    rows with received2 == 1 and then for rows with received2 == 0
    
    Output:
    -----------
    generational : Pandas Dataframe - The selected rows, concatenated year by year with their
                                      original index labels. An empty dataframe is returned when
                                      'df' contains no years.
    '''
    
    # The marching window only makes sense if the years are walked chronologically,
    # so sort them explicitly (a plain set has no guaranteed order). Missing years
    # are skipped: they match no rows and must not advance the age window.
    years = sorted(df['YEAR'].dropna().unique())
    
    yearly_subsets = []
    
    for offset, year in enumerate(years):
        
        current_year = df[df['YEAR'] == year] # All patients seen in this year
        ages = current_year['AGE']
        
        if marching:
            # The cohort is 'offset' years older than it was in the first year
            youngest = start + offset
            oldest = end + offset
            generation = current_year[(ages >= youngest) & (ages <= oldest)]
        else:
            generation = current_year[ages == start]
        
        if plott:
            # Histogram of ages for patients who received opiates, then for those who did not
            for received in (1, 0):
                generation[generation['received2'] == received].hist(['AGE'])
                plt.show()
        
        yearly_subsets.append(generation)
    
    # pd.concat raises on an empty list, so keep the "no years -> empty frame" behaviour explicit
    if not yearly_subsets:
        return pd.DataFrame()
    
    # DataFrame.append was removed in pandas 2.0; a single concat is also much cheaper
    return pd.concat(yearly_subsets)

### Further Data Cleaning and Feature Engineering
__Overview:__ The data_prep() function formats the data for machine learning purposes.

In [None]:

def data_prep(df,N=False,smote=False,pre=False,prepre=True):
    
    '''
    PURPOSE: 
        
        Engineer features of the given dataframe and separate it into its features and labels in prep
        for machine learning
    
    PARAMETERS:
    
        df     : Pandas Dataframe. Needs 'YEAR', 'AGE' and 'DIAG1' columns when prepre is True, and
                 'received2', 'YEAR' and 'REGION' columns when pre is False
        N      : Boolean, if True the 'AGE' column is dropped during the prepre step
        smote  : Boolean indicating whether to perform SMOTE sampling on the training split
        pre    : Boolean, if True return the dataframe right after the prepre step (or unchanged
                 when prepre is False) instead of splitting it
        prepre : Boolean, if True fill NaN's with 0, sort by 'YEAR' and replace 'DIAG1' with
                 dummy columns for its 20 most frequent values (minus the first dummy level)
        
    RETURNS:
        
        If pre is True:
            df : The (optionally pre-processed) dataframe
        
        Otherwise:
            X_train : Training features (SMOTE-resampled when smote is True)
            X_test  : Testing features
            y_train : Training labels (SMOTE-resampled when smote is True)
            y_test  : Testing labels
            X       : The full feature dataframe before the train/test split
    '''
    
    if prepre:
        # fillna returns a new frame, so the caller's dataframe is not modified below
        df = df.fillna(0).sort_values(by=['YEAR'])  # Replace all NaN's with 0 and sort the values by the 'YEAR' feature

        # Count the rows per diagnosis and keep the 20 most frequent diagnosis values
        diagnoses = df.groupby(['DIAG1'])['AGE'].count().reset_index('DIAG1')
        top_diagnoses = diagnoses.sort_values(by='AGE', ascending=False)['DIAG1'].values[:20]

        # 'AGE' is only dropped after it has been used for the counts above
        if N: 
            df = df.drop('AGE',axis=1)

        df.loc[~df.DIAG1.isin(top_diagnoses),'DIAG1'] = '0'   # Set any diagnosis not in the top 20 to '0'
        df2 = pd.get_dummies(df["DIAG1"], drop_first=True)    # One dummy column per remaining diagnosis value
        df = pd.concat((df,df2),axis=1)                       # Combine dataframes
        df = df.drop('DIAG1',axis=1)                          # Drop the 'DIAG1' column from df since we are done with it
        df = df.dropna(axis = 0, how='any')                   # Drop any remaining rows with NaN's (if there are any)

    if pre:
        return df
    
    y = np.array(df['received2'])                         # The data labels are the 'received2' column of df
    
    X = df.drop(['received2','YEAR','REGION'],axis=1)     # The data features are everything else minus 'received2','YEAR','REGION' columns
    
    # Perform a train test split on X and y (fixed seed, so repeated calls give the same split)
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.33,shuffle=True,random_state=42)
    
    # If SMOTE sampling is requested,
    if smote:
        
        # Build a SMOTE object
        sm = SMOTE(random_state = 42)
        
        # Resample the training data only; the test split is left untouched
        X_train, y_train = sm.fit_resample(X_train,y_train)
        
        # Make sure the resampled features are a dataframe with the original column names
        X_train = pd.DataFrame(X_train)
        X_train.columns = X_test.columns
    
    # Return the split features and labels together with the full feature dataframe
    return X_train,X_test,y_train,y_test,X

### Building the Model
__Overview:__ The purpose of the Model() function is to create a model trained on the data.

In [None]:
def Model(X_train,X_test,y_train,y_test,depth=11,estims=10,clf='RF'):
    
    ''' 
    Train a classifier on the training data and predict the test features.
    
    input: 
        X_train : Training features
        X_test  : Testing features, used only to produce the predictions
        y_train : Training labels corresponding to X_train
        y_test  : Testing labels (accepted for a uniform call signature, not used here)
        depth   : Integer, maximum tree depth of the model
        estims  : Integer, number of estimators; only applied when clf is 'RF'
                  (the 'XGB' model keeps the library default)
        clf     : String, 'RF' or 'XGB', the type of model to build
    
    output:
        Model   : The model trained on the training data
        Predict : The labels predicted by the model for X_test
        
    raises:
        ValueError : if clf is neither 'RF' nor 'XGB'
    '''
    
    # Pick the estimator first; fitting and predicting are identical for both types
    if clf == 'RF':
        
        # Random Forest with maximum depth of depth and estims number of estimators
        classifier = RandomForestClassifier(max_depth=depth, n_estimators = estims)
        
    elif clf == 'XGB':
        
        # Boosted trees with maximum depth of depth
        classifier = XGBClassifier(max_depth=depth)
        
    else:
        # Fail with a clear message instead of an UnboundLocalError at the return statement
        raise ValueError("clf must be 'RF' or 'XGB', got %r" % (clf,))
    
    # Fit the model to the training set
    classifier.fit(X_train,y_train)
    
    # Predict the labels of the test features
    predictions = classifier.predict(X_test)
        
    return classifier,predictions
    

### Scoring the Model
__Overview:__ The calc_metrics() function returns a number of scores and metrics for the given model, evaluated on the test data.

In [None]:
def calc_metrics(X_test,y_test,Model,Predict,clf,roc=False):

    '''
    Compute evaluation metrics for a fitted model on the test data.
    
    input:
        X_test  : A dataframe of the test data
        y_test  : The labels corresponding to X_test
        Model   : The trained model; must expose feature_importances_ and score()
                  (and predict_proba() when roc is True)
        Predict : The labels predicted by the model for X_test
        clf     : A string - 'RF','XGB' naming the type of model used
                  (kept for call compatibility; it does not change the result)
        roc     : A boolean, if True the ROC curve values are returned as well
        
    output:
        M           : the confusion matrix for the model
        importances : the importance of each feature in classifying the data
        indices     : the feature indices sorted from most to least important
        Score       : the value of Model.score on the test data
        
        and, only when roc is True (7 values instead of 4):
        
        fpr         : false positive rates of the ROC curve
        tpr         : true positive rates of the ROC curve
        roc_auc     : area under the ROC curve
    '''
    
    # Calculate the confusion matrix using our test labels and test predictions
    M = confusion_matrix(y_test, Predict)

    # Retrieve the feature importances from the model
    importances = Model.feature_importances_
    
    # Feature indices ordered by descending importance
    indices = np.argsort(importances)[::-1]
  
    # Get the model's score from the test data
    Score = Model.score(X_test,y_test)

    if not roc:
        return M,importances,indices,Score
    
    # Second column of predict_proba is used as the score for the ROC curve
    positive_probs = Model.predict_proba(X_test)[:,1]
    
    # False positive rates and true positive rates (the thresholds are not needed)
    fpr, tpr, _ = roc_curve(y_test,positive_probs)
    
    # Get the area under the ROC Curve
    roc_auc = metrics.auc(fpr,tpr)

    return M,importances,indices,Score,fpr,tpr,roc_auc

### Convey the Results of the Data
__Overview:__ The Model_Report() function condenses the results of running the model and saves the data to a txt file, communicating it to the user.

In [None]:
def Model_Report(name,y_train,Score,M,y_test,Predict,importances,indices,fpr,tpr,X,smote=False):
    """
    Print a summary of the model's results, save the same summary to the file
    'REPORT FOR <name>' and save the feature-importance and ROC plots as
    '<name> FEATURES.png' and '<name> ROC.png'.
    
    input:
        name        : String - the user input name of the model (used in the file names)
        y_train     : the labels for the train dataset (only their class counts are reported)
        Score       : The score of the model on the test data
        M           : Array - The 2x2 confusion matrix for the model
        y_test      : the labels for the test dataset
        Predict     : The labels predicted by the model for the test dataset
        importances : Array - the importance of each feature
        indices     : Array - the feature indices ordered from most to least important
        fpr         : Array - false positive rates of the ROC curve
        tpr         : Array - true positive rates of the ROC curve
        X           : Dataframe - the features; its columns supply the feature names
        smote       : Boolean - If True the report is labelled as a SMOTE run
        
    output:
        dom1 : the x values (first and last fpr) of the baseline drawn on the ROC plot
        ran1 : the y values (first and last tpr) of the baseline drawn on the ROC plot
    """
    
    # At most ten features are listed and labelled; fewer if the data has fewer features
    top_n = min(10, len(indices))
    
    # The context manager guarantees the report is closed even if something below raises
    with open('REPORT FOR '+name,'w') as f:
        print('\n\n-----------------------------------------------------------')
        f.write('\n\n-----------------------------------------------------------')
        f.write('\nREPORT FOR '+name)
        if smote:
            print('\n\nStats With Smote\n\n')
            print('\nSmote dataset shape %s \n' % Counter(y_train))
            f.write('\n\nStats With Smote\n\nSmote dataset shape %s \n' % Counter(y_train))
        else:
            print('\nStats Without Smote\n')
            print('Original dataset shape %s \n' % Counter(y_train))
            f.write('\nStats Without Smote\nOriginal dataset shape %s \n' % Counter(y_train))
     
        # Per-class accuracy: correct predictions of a class over all true members of that class
        acc_not_received = round(M[0][0]/(M[0][0]+M[0][1]),4)
        acc_received = round(M[1][1]/(M[1][0]+M[1][1]),4)
        report = classification_report(y_test,Predict)
        
        print('Score:\t\t\t', round(Score,4))
        print('\n\nCorrectly Predicted Not Received :\t',round(M[0][0],4))
        print('Incorrectly Predicted Not Received:\t',round(M[0][1],4))
        print('Accuracy Predicting Not Received:\t',acc_not_received)
        print('\n\nCorrectly Predicted Received:\t\t',round(M[1][1],4))
        print('Incorrectly Predicted Received:\t\t',round(M[1][0],4))
        print('Accuracy Predicting Received:\t\t',acc_received)
        
        print('\n',report)
        
        # Write the same statistics to the report file
        f.write('\n\nScore:\t\t\t'+str(round(Score,4))+
                '\n\nCorrectly Predicted Not Received :\t'+str(round(M[0][0],4))+
                '\nIncorrectly Predicted Not Received:\t'+str(round(M[0][1],4))+
                '\nAccuracy Predicting Not Received:\t'+str(acc_not_received)+
                '\n\nCorrectly Predicted Received:\t\t'+str(round(M[1][1],4))+
                '\nIncorrectly Predicted Received:\t\t'+str(round(M[1][0],4))+
                '\nAccuracy Predicting Received:\t\t'+str(acc_received)+
                '\n\n'+str(report)+"\n\nFeature ranking:")
        
        # Write the feature ranking (name and importance in percent)
        for rank in range(top_n):
            
            f.write("\n%d. %s "% (rank + 1, X.columns[indices[rank]]))
            f.write(str(round(100*importances[indices[rank]],3))+'%')
        
    # Plot the feature importances in a bar graph, zoomed in on the top features
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices], color="r", align="center")
    # Tick positions and labels must have the same length, so label only the top features
    plt.xticks(range(top_n), X.columns[indices[:top_n]],rotation='vertical')
    plt.xlim([-1,top_n-0.4])
    plt.savefig(name+' FEATURES.png')
    
    # Plotting the ROC Curve together with the straight line joining its end points
    plt.figure()
    dom1 = [fpr[0],fpr[-1]]
    ran1 = [tpr[0],tpr[-1]]
    plt.plot(dom1,ran1,label='Base')
    plt.plot(fpr,tpr,label='ROC')
    plt.legend()
    plt.title('ROC Curve')
    plt.savefig(name+' ROC.png')
   
    return dom1,ran1

### Make Engineering Easier
__Overview:__ The Add_Dummies() function turns a specified column into a dummies dataframe and appends it to the input dataframe, deleting the original column.

In [None]:
def Add_Dummies(df,column):
    
    """
    Replace a column by one dummy (indicator) column per distinct value.
    
    The new columns are named <column><value> and are placed after the
    remaining original columns. The dataframe passed in is left untouched.
    
    input:
        df     : dataframe - the dataframe to take the column from
        column : string - the column name to get the dummies for
    
    output:
        df : a new dataframe without 'column' and with the dummy columns added
    """
    
    # One indicator column per distinct value, renamed to <column><value>
    dummies = pd.get_dummies(df[column])
    dummies.columns = [column+str(level) for level in dummies.columns]
    
    # drop() returns a new frame, so the caller's dataframe is never modified
    remaining = df.drop(column,axis=1)
    
    # A pre-existing column with one of the generated names is replaced by the dummy
    clashes = [label for label in dummies.columns if label in remaining.columns]
    if clashes:
        remaining = remaining.drop(clashes,axis=1)
    
    # A single concat avoids the fragmentation caused by inserting columns one at a time
    return pd.concat([remaining,dummies],axis=1)

### XGBoost Model
__Overview:__ The X_F() function creates a boosted forest with the given parameters and uses previously created functions to communicate results to the user.

In [None]:
def X_F(name,X_train,y_train,X_test,y_test,X,max_depth=5,n_estimators = 10,CV=False,Verbose=False,c=0):
    
    """
    Build, fit and score an XGBoost classifier, showing progress along the way.
    
    input:
        name         : String - the user specified name of the model
        X_train      : Dataframe - the train data for the model
        y_train      : the labels for the train data
        X_test       : Dataframe - the test data for the model
        y_test       : the labels for the test data
        X            : Dataframe - the feature dataframe, passed on to Model_Report
        max_depth    : Integer - the max depth of the generated model
        n_estimators : Integer - the number of estimators to use in the model
        CV           : Boolean - If true, performs a 7-fold cross validation on the train data
        Verbose      : Boolean - If true, writes a report through Model_Report()
        c            : Integer - offset added to the step number of the progress display
        
    output:
        When CV is True, a pair of strings: the mean cross validation accuracy and
        the accuracy of the fitted model on the test data. Otherwise None.
    """
    
    def show_progress(step,message):
        # Replace the previous output with the current percentage and message
        clear_output(wait=True)
        display(str(100*(step+c)/16)+'%')
        display(message)
    
    show_progress(6,'Complete\nBuilding XGB Classifier')
    booster = XGBClassifier(max_depth=max_depth,n_estimators=n_estimators)
    
    # Optional cross validation of the (still unfitted) model on the train data
    if CV:
        show_progress(7,'Performing 7-Fold Cross Validation On Boosted Tree')
        folds = KFold(n_splits = 7,shuffle=True,random_state = 42)
        cv_scores = cross_val_score(booster,X_train,y_train,cv=folds)
    
    show_progress(8,'Fitting Model')
    booster.fit(X_train,y_train)
    
    show_progress(9,' Scoring Model')
    predicted = booster.predict(X_test)
    
    show_progress(10,'Calculating Metrics')
    (conf_matrix,feat_importances,feat_order,
     test_score,false_pos,true_pos,_) = calc_metrics(X_test,y_test,booster,predicted,clf='XGB',roc=True)
    
    if Verbose:
        # The report is always labelled as a SMOTE run here
        Model_Report('XGBOOSTED '+name,y_train,test_score,conf_matrix,y_test,predicted,
                     feat_importances,feat_order,false_pos,true_pos,X,smote=True)
    
    if not CV:
        return None
    
    cv_summary = "Accuracy: "+str(round(cv_scores.mean()*100,1))
    test_summary = "Model Score: "+str(round(100*accuracy_score(y_test,predicted),1))
    return cv_summary, test_summary

### Putting Everything Together
__Overview:__ The run_model() function wraps all previous functions together.

In [None]:
def run_model(name,df,depth='11',estims='10',clf='RF',roc=True,prepre=False,XF=False,GS=False,CV=False,verbose=False):
    
    """
    Prepare the data, then either grid-search hyperparameters (GS=True) or train and
    evaluate one model on the plain split and one on the SMOTE-resampled split.
    
    input:
        name    - String - user specified name for the model 
        df      - Dataframe - the dataset 
        depth   - accepted for compatibility; NOTE(review): currently not used, the
                  models are built with fixed depths (11 without SMOTE, 5 with SMOTE)
        estims  - accepted for compatibility; NOTE(review): currently not used, the
                  models are built with fixed estimator counts (10 without SMOTE, 35 with SMOTE)
        clf     - String - 'RF','XGB' The type of model to build
        roc     - accepted for compatibility; the ROC values are always calculated
        prepre  - Boolean - if True, data_prep() also performs its cleaning/dummy step
        XF      - Boolean - if True additionally run the X_F() function on both splits
                  (with GS=True: grid-search an XGBoost model instead of a Random Forest)
        GS      - Boolean - if True only perform a grid search on the SMOTE training
                  data, print the best parameters and score, and return
        CV      - Boolean - if True perform a 7-fold cross validation
        verbose - Boolean - if True, write a report for both models with Model_Report()
    
    output:
        N/A
    """
    
    display(str(100*1/16)+'%')
    display('Prepping Data')
    X_train,X_test,y_train,y_test,X1 = data_prep(df,prepre=prepre)
    SX_train,SX_test,Sy_train,Sy_test,X2 = data_prep(df,smote=True,prepre=prepre)    
    
    if GS:
        # The grid search returns right away, so it runs before any model is trained
        if XF:
            estimator = XGBClassifier()
            param_grid = {'n_estimators':[5,10,15],
                          'max_depth':[8,9,10,11],
                         }
        else:
            estimator = RandomForestClassifier()
            param_grid = {'n_estimators':[15,20,25,30],
                          'max_depth':[11,13,15,17,19,21],
                         }

        search = GridSearchCV(estimator, param_grid, cv=4, scoring='f1', verbose=0)
        search.fit(SX_train,np.ravel(Sy_train))
        print(search.best_params_, search.best_score_)
        
        return 
    
    clear_output(wait=True)
    display(str(100*2/16)+'%')
    display('Building Model')
    model,predict = Model(X_train,X_test,y_train,y_test,depth=11,estims=10,clf=clf)
    smodel,spredict = Model(SX_train,SX_test,Sy_train,Sy_test,depth=5,estims=35,clf=clf)
  
    if CV:
        clear_output(wait=True)
        display(str(100*3/16)+'%')
        display('Performing 7-Fold Cross Validation on Random Forest')
        kfold = KFold(n_splits = 7,shuffle=True,random_state = 42)
        results = cross_val_score(smodel,SX_train,Sy_train,cv=kfold)

    clear_output(wait=True)
    display(str(100*4/16)+'%')
    display('Calculating Metrics')
    # Pass the real model type so the metrics match the model that was built
    M,importances,indices,Score,fpr,tpr,roc_auc = calc_metrics(X_test,y_test,model,predict,clf=clf,roc=True)
    M2,importances2,indices2,Score2,fpr2,tpr2,roc_auc2 = calc_metrics(SX_test,Sy_test,smodel,spredict,clf=clf,roc=True)

    name1 = 'SMOTED '+name
    name2 = 'NON-SMOTED '+name

    if XF:
        clear_output(wait=True)
        display(str(100*5/16)+'%')
        display('Starting Non-SMOTE XGB')
        if CV:
            K1,A1 = X_F(name2,X_train,y_train,X_test,y_test,X1,max_depth=5,n_estimators = 10,CV=CV)
        
        else:
            X_F(name2,X_train,y_train,X_test,y_test,X1,max_depth=5,n_estimators = 10,CV=CV)
 
        # The first X_F run shows steps 6-10 of 16, so c=5 makes the second one show 11-15
        display('Starting SMOTE XGB')
        if CV:
            K2,A2 = X_F(name1,SX_train,Sy_train,SX_test,Sy_test,X2,max_depth=5,n_estimators = 10,CV=CV,c=5)
            clear_output(wait=True)
            print("First 7-Cross Fold Validation Score: ",K1)
            print(A1)
            print("Second 7-Cross Fold Validation Score: ",K2)
            print(A2)
        else:
            X_F(name1,SX_train,Sy_train,SX_test,Sy_test,X2,max_depth=5,n_estimators = 10,CV=CV,c=5)

    if CV:
        # Report the cross validation of the SMOTE model instead of discarding it
        print(clf+" 7-Fold Cross Validation Accuracy: ",round(results.mean()*100,1))

    if verbose:
        # Distinct names keep the second report and its plots from overwriting the first
        Model_Report(name2,y_train,Score,M,y_test,predict,importances,indices,fpr,tpr,X1)
        Model_Report(name1,Sy_train,Score2,M2,Sy_test,spredict,importances2,indices2,fpr2,tpr2,X2,smote=True)

### Reading in the Data

In [None]:
# Silence pandas' chained-assignment warning for the rest of the notebook
pd.options.mode.chained_assignment = None

# Load the one-hot encoded result file
opiates = pd.read_csv("./data_files/result_with_one_hot.csv")

# Full feature set: every relevant column plus the 'received2' label
df = opiates.loc[:, [
    'VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC',
    'RFV1', 'RFV2', 'RFV3', 'RFV4', 'RFV5',
    'DIAG1', 'REGION', 'YEAR', 'AGEDAYS', 'AGER', 'VDAYR',
    'MAJOR', 'TEMPF', 'BPSYS', 'BPDIAS', 'GLUCOSE', 'HTIN',
    'WTLB', 'BMI', 'BONEDENS', 'ETHUN', 'RACEUN', 'ETHIM', 'RACEIM', 'SUBSTED',
    'HTWTFL', 'PAYTYPE', 'received2',
]]

# Features people have no control over
#  (YEAR and DIAG1 are kept for feature engineering purposes)
deterministic = opiates.loc[:, [
    'REGION', 'YEAR', 'AGE', 'SEX', 'RACE', 'DIAG1', 'ETHNIC',
    'TEMPF', 'BPSYS', 'BPDIAS', 'GLUCOSE', 'HTIN',
    'WTLB', 'BMI', 'BONEDENS', 'HTWTFL', 'received2',
]]

# Features people have control over
non_deterministic = opiates.loc[:, [
    'AGE', 'VMONTH', 'RFV1', 'RFV2', 'RFV3', 'RFV4',
    'RFV5', 'DIAG1', 'REGION', 'YEAR', 'AGEDAYS',
    'VDAYR', 'PAYTYPE', 'received2',
]]

### Prepping the Data

In [None]:
# --- Generational subsets: ages 20-60 in the first year, marching one year per year ---
df = generation_data(df, start=20, end=60, marching=True)
print('df Generation Completed')

det = generation_data(deterministic, start=20, end=60, marching=True)
clear_output(wait=True)
print('det Generation Completed')

non_det = generation_data(non_deterministic, start=20, end=60, marching=True)
clear_output(wait=True)
print('non_det Generation Completed')

# --- Complete dataset: clean it, then one-hot encode the categorical features ---
df = data_prep(df, pre=True)
clear_output(wait=True)
print('df Data Prepped')
features = ['VMONTH','RACE','ETHNIC','RFV1','SEX','VDAYR','ETHUN','RACEUN','ETHIM','RACEIM','MAJOR','PAYTYPE']

for feature in tqdm(features):
    df = Add_Dummies(df, feature)


# --- Deterministic dataset ---
# NOTE(review): this preps the full 'deterministic' frame, not the generational
# subset 'det' built above; confirm which of the two is meant to be used later.
deterministic = data_prep(deterministic, pre=True)
collls = deterministic.columns  # column names before the dummies are added
features = ['SEX','RACE','ETHNIC',
            'TEMPF','BPSYS','BPDIAS','GLUCOSE','BONEDENS','HTWTFL']
clear_output(wait=True)
for feature in tqdm(features):
    deterministic = Add_Dummies(deterministic, feature)
clear_output(wait=True)

# --- Non-deterministic dataset (same remark as above applies to 'non_det') ---
features = ['VMONTH','RFV1',
            'VDAYR','PAYTYPE']
non_deterministic = data_prep(non_deterministic, pre=True)
for feature in tqdm(features):
    non_deterministic = Add_Dummies(non_deterministic, feature)
clear_output(wait=True)
print('Complete')
print(collls)

### Training the Model

In [None]:
# Train and evaluate on the generational subset of the deterministic traits;
# data_prep's cleaning step runs inside run_model (prepre=True)
name = 'DATASET OF DETERMINISTIC TRAITS'
run_model(name=name, df=det, prepre=True, XF=True, CV=True)
# Non-XBF : max_depth = 21, n_estimators = 30  (presumably the grid-search result for the non-XGB model)

In [None]:
# Same run for the generational subset of the non-deterministic traits, with reports written out
name = 'DATASET OF NON DETERMINISTIC TRAITS'
run_model(name=name, df=non_det, prepre=True, XF=True, CV=True, verbose=True)

In [None]:
# Time the run on the complete (already prepped) dataset.
# perf_counter is monotonic, so the difference can never come out negative.
start = time.perf_counter()
name='THE COMPLETE DATASET'
run_model(name,df,XF=True,CV=False)
end = time.perf_counter()
print('Time to Complete:',end-start,' Seconds')

### Send Datasets to CSV Files

In [None]:
# Split the prepped complete dataset with SMOTE applied to the training part
print('prepping')
X_train, X_test, y_train, y_test, X = data_prep(df, smote=True, prepre=False)
print('prepped')

# The labels come back as arrays; wrap them so they can be written with to_csv
y_test = pd.DataFrame(y_test)
print('ytest complete')
y_train = pd.DataFrame(y_train)
print('ytrain complete')

# Write each part to ./SMOTE_<part>.csv, reporting as each file is finished
print('starting ...')
for part, frame in (('X_train', X_train),
                    ('X_test', X_test),
                    ('y_train', y_train),
                    ('y_test', y_test)):
    frame.to_csv('./SMOTE_' + part + '.csv')
    print(part + ' complete')