In [None]:
#Import modules
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
import pandas as pd
import csv
from csv import writer
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score
from flaml.data import get_output_from_log
import joblib
import os
from os.path import exists

In [None]:
#Read in data
Corpus = pd.read_csv(r'<path to file>\Full_Corpus.csv')
#Convert blurb text to string datatype (any missing values become the string 'nan')
Corpus.text_final_blurb = Corpus.text_final_blurb.astype(str)
#Drop unnecessary columns; errors='ignore' skips any that are not present in the file
Corpus.drop(columns=['name','blurb','PercentileRank','Half','Twenties', 'Third','NumSuccess','NumFail'],inplace=True,errors='ignore')
print(Corpus.columns)
#Create the folder that will hold all of the models.
#The training cells dump every model into "outputs/", so the folder has to exist before they run;
#exist_ok=True makes this safe to execute on every run instead of once by hand
os.makedirs("outputs", exist_ok=True)

In [None]:
#Start training models
#There are four questions: will the project be successful, and how much money can we expect it to raise,
#each asked both pre-launch and post-launch.
#Every question gets one model per combination of category and subcategory.
#This is 156 combinations x 4 questions = 624 models

In [None]:
#Function to write to file
#Note: this block is not written by me, it was taken from here:
##https://thispointer.com/python-how-to-append-a-new-row-to-an-existing-csv-file/#:~:text=Append%20a%20dictionary%20as%20a%20row%20to%20an,the%20csv%20file%2C%20now%20close%20the%20file%20object%2C
def append_list_as_row(file_name, list_of_elem):
    """Append list_of_elem as one new row at the end of the CSV file file_name.

    The file is opened in append mode, so it is created when it does not exist yet
    and existing rows are left untouched.
    """
    # newline='' hands line-ending control to the csv module
    with open(file_name, 'a+', newline='') as csv_file:
        writer(csv_file).writerow(list_of_elem)

In [None]:
#Predicting success, pre-launch
#Set the predictor and outcome variables
predictors = ['text_final_blurb','DaysDiffCreateLaunch', 'DaysDiffLaunchDeadline',
       'CreatedDate_Month', 'CreatedDate_DayOfMonth', 'CreatedDate_Year', 'CreatedDate_DayOfWeek', 
       'DeadLineDate_Month', 'DeadLineDate_DayOfMonth', 'DeadLineDate_Year', 'DeadLineDate_DayOfWeek', 
        'LaunchedDate_Month', 'LaunchedDate_DayOfMonth', 'LaunchedDate_Year', 'LaunchedDate_DayOfWeek', 
        'NumPrevSuccess', 'NumPrevFail', 'goal_21']
outcome = 'state_successful'

#Results file: one row per trained model. It doubles as the checkpoint used to resume a partial run
resultsPath = r'<path to file>\PreLaunchSuccess.csv'
resultCols = ['Name', 'Estimator', 'Best Accuracy', 'Training Time','Best Config','Accuracy']

#Check if there was a previous partial run
prevRun = pd.DataFrame(columns=resultCols)
if os.path.exists(resultsPath):
    prevRun = pd.read_csv(resultsPath)
    prevRun.Name = prevRun.Name.astype(str)

#Counter variables
#NumResults is the number of category/subcategory combinations, i.e. how many models will be produced;
#it is only used for the progress indicator printed during training
NumResults = sum(1 for _ in Corpus.groupby(['CatID', 'SubCatID']))
currGroup = 1

#Start the actual training
for name, group in Corpus.groupby(['CatID', 'SubCatID']):
    print(name)

    #Skip combinations that already have a row in the results file
    if str(name) not in prevRun['Name'].unique():
        #Slice up the name variable for use in naming the model files
        #Example: the name is (1.0,220), this turns it into ['1.0','220']
        nameSplit = str(name).replace('(','').replace(')','').replace(' ','').split(',')

        #groupby already hands us exactly the rows of this combination,
        #so take the predictor and outcome columns straight from the group
        X = group[predictors]
        y = group[outcome]

        #Run tfidf on the blurb text and concatenate the resulting matrix to the other predictors
        vBlurb = TfidfVectorizer()
        enc_blurb = vBlurb.fit_transform(X['text_final_blurb'])
        #get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available
        if hasattr(vBlurb, 'get_feature_names_out'):
            vocabCols = vBlurb.get_feature_names_out()
        else:
            vocabCols = vBlurb.get_feature_names()
        df_enc_blurb = pd.DataFrame(enc_blurb.toarray(), columns=vocabCols)
        #Both frames need a fresh 0..n-1 index so that concat lines the rows up positionally
        X = X.drop(columns='text_final_blurb').reset_index(drop=True)
        res = pd.concat([X, df_enc_blurb], axis=1)

        #Splitting the dataset on an 80/20 split
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(res,y,test_size=0.2)

        #Using a try-except block in case a model doesn't train properly
        try:
            #Using AutoML and a beginning time budget of 10 minutes
            #If it can't train a model in that time, it will rerun but add 5 minutes each time
            #Budgets tried: 600, 900, ... 2100 seconds (the next one, 2400, is past the 2361 limit)
            budgTime = 600
            modelFound = False
            while (not modelFound) and (budgTime < 2361):
                automl = AutoML()
                settings = {
                    "time_budget": budgTime,  # seconds
                    "metric": 'accuracy', # metric used for the evaluation
                    "task": 'classification', # type of the task
                    "early_stop": True
                }

                automl.fit(X_train=X_train, y_train=Y_train, **settings)

                if automl.best_estimator is not None:
                    modelFound = True

                budgTime += 300

            #Make the "no model found" case explicit so nothing is scored or saved for this group
            if not modelFound:
                raise RuntimeError("AutoML found no model within the allowed time budgets")

            print('Best Machine Learning Algorithm:', automl.best_estimator)
            print('Best hyperparmeter configuration:', automl.best_config)
            print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
            print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

            #Make predictions
            preds = automl.predict(X_test)

            #One flat result row: the values printed above plus the accuracy on the held-out 20%
            dfCalc = [name,automl.best_estimator,1-automl.best_loss,automl.best_config_train_time,automl.best_config,accuracy_score(Y_test, preds)]

            #Export the model and vocabulary to separate files
            #Vocabulary is needed for testing the blurb text entered by the end user of the tool
            fileStr = "presucc-"+str(nameSplit[0])+"-"+str(nameSplit[1])
            joblib.dump(value=automl, filename="outputs/"+fileStr+".pkl")
            joblib.dump(value=vBlurb, filename="outputs/"+fileStr+"-vocab.pkl")

            #First result: create the file with a header row. Afterwards: append one row per model
            if not os.path.exists(resultsPath):
                df = pd.DataFrame([dfCalc],columns=resultCols)
                df.to_csv(resultsPath,index=False)
            else:
                append_list_as_row(resultsPath, dfCalc)
        except Exception as err:
            #Report which group failed and why, then carry on with the next group.
            #Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working
            print("Failed", name, repr(err))

    #Print the progress for the person doing the training
    print((currGroup/NumResults) * 100,"% complete")
    currGroup += 1

In [None]:
#Predicting success, post-launch
#Set the predictor and outcome variables
predictors = ['text_final_blurb','DaysDiffCreateLaunch', 'DaysDiffLaunchDeadline',
       'CreatedDate_Month', 'CreatedDate_DayOfMonth', 'CreatedDate_Year', 'CreatedDate_DayOfWeek', 
       'DeadLineDate_Month', 'DeadLineDate_DayOfMonth', 'DeadLineDate_Year', 'DeadLineDate_DayOfWeek', 
        'LaunchedDate_Month', 'LaunchedDate_DayOfMonth', 'LaunchedDate_Year', 'LaunchedDate_DayOfWeek', 
        'NumPrevSuccess', 'NumPrevFail', 'spotlight_True', 'staff_pick_True', 'goal_21']
outcome = 'state_successful'

#Results file: one row per trained model. It doubles as the checkpoint used to resume a partial run
resultsPath = r'<path to file>\PostLaunchSuccess.csv'
resultCols = ['Name', 'Estimator', 'Best Accuracy', 'Training Time','Best Config','Accuracy']

#Check if there was a previous partial run
prevRun = pd.DataFrame(columns=resultCols)
if os.path.exists(resultsPath):
    prevRun = pd.read_csv(resultsPath)
    prevRun.Name = prevRun.Name.astype(str)

#Counter variables
#NumResults is the number of category/subcategory combinations, i.e. how many models will be produced;
#it is only used for the progress indicator printed during training
NumResults = sum(1 for _ in Corpus.groupby(['CatID', 'SubCatID']))
currGroup = 1

#Start the actual training
for name, group in Corpus.groupby(['CatID', 'SubCatID']):
    print(name)

    #Skip combinations that already have a row in the results file
    if str(name) not in prevRun['Name'].unique():
        #Slice up the name variable for use in naming the model files
        #Example: the name is (1.0,220), this turns it into ['1.0','220']
        nameSplit = str(name).replace('(','').replace(')','').replace(' ','').split(',')

        #groupby already hands us exactly the rows of this combination,
        #so take the predictor and outcome columns straight from the group
        X = group[predictors]
        y = group[outcome]

        #Run tfidf on the blurb text and concatenate the resulting matrix to the other predictors
        vBlurb = TfidfVectorizer()
        enc_x = vBlurb.fit_transform(X['text_final_blurb'])
        #get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available
        if hasattr(vBlurb, 'get_feature_names_out'):
            vocabCols = vBlurb.get_feature_names_out()
        else:
            vocabCols = vBlurb.get_feature_names()
        df_enc_x = pd.DataFrame(enc_x.toarray(), columns=vocabCols)
        #Both frames need a fresh 0..n-1 index so that concat lines the rows up positionally
        X = X.drop(columns='text_final_blurb').reset_index(drop=True)
        res = pd.concat([X, df_enc_x], axis=1)

        #Splitting the dataset on an 80/20 split
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(res,y,test_size=0.2)

        #Using a try-except block in case a model doesn't train properly
        try:
            #Using AutoML and a beginning time budget of 10 minutes
            #If it can't train a model in that time, it will rerun but add 5 minutes each time
            #Budgets tried: 600, 900 and 1200 seconds (the next one, 1500, is past the 1381 limit)
            budgTime = 600
            modelFound = False
            while (not modelFound) and (budgTime < 1381):
                automl = AutoML()
                settings = {
                    "time_budget": budgTime,  # seconds
                    "metric": 'accuracy', # metric used for the evaluation
                    "task": 'classification', # type of the task
                    "early_stop": True
                }

                automl.fit(X_train=X_train, y_train=Y_train,
                           **settings)

                if automl.best_estimator is not None:
                    modelFound = True

                budgTime += 300

            #Make the "no model found" case explicit so nothing is scored or saved for this group
            if not modelFound:
                raise RuntimeError("AutoML found no model within the allowed time budgets")

            print('Best Machine Learning Algorithm:', automl.best_estimator)
            print('Best hyperparmeter configuration:', automl.best_config)
            print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
            print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

            #Make predictions
            preds = automl.predict(X_test)

            #One flat result row: the values printed above plus the accuracy on the held-out 20%.
            #It must stay a flat list: the append branch writes it as-is, one element per CSV column
            dfCalc = [name,automl.best_estimator,1-automl.best_loss,automl.best_config_train_time,automl.best_config,accuracy_score(Y_test, preds)]

            #Export the model and vocabulary to separate files
            #Vocabulary is needed for testing the blurb text entered by the end user of the tool
            fileStr = "postsucc-"+str(nameSplit[0])+"-"+str(nameSplit[1])
            joblib.dump(value=automl, filename="outputs/"+fileStr+".pkl")
            joblib.dump(value=vBlurb, filename="outputs/"+fileStr+"-vocab.pkl")

            #First result: create the file with a header row. Afterwards: append one row per model.
            #The DataFrame is built here from this cell's own row, never reused from another cell
            if not os.path.exists(resultsPath):
                df = pd.DataFrame([dfCalc],columns=resultCols)
                df.to_csv(resultsPath,index=False)
            else:
                append_list_as_row(resultsPath, dfCalc)
        except Exception as err:
            #Report which group failed and why, then carry on with the next group.
            #Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working
            print("Failed", name, repr(err))

    #Print the progress for the person doing the training
    print((currGroup/NumResults) * 100,"% complete")
    currGroup += 1

In [None]:
#Predicting final funding amount pre-launch
#Set the predictor and outcome variables
predictors = ['text_final_blurb','DaysDiffCreateLaunch', 'DaysDiffLaunchDeadline',
       'CreatedDate_Month', 'CreatedDate_DayOfMonth', 'CreatedDate_Year', 'CreatedDate_DayOfWeek', 
       'DeadLineDate_Month', 'DeadLineDate_DayOfMonth', 'DeadLineDate_Year', 'DeadLineDate_DayOfWeek', 
        'LaunchedDate_Month', 'LaunchedDate_DayOfMonth', 'LaunchedDate_Year', 'LaunchedDate_DayOfWeek', 
        'NumPrevSuccess', 'NumPrevFail', 'goal_21']
outcome = 'Tens'

#Results file: one row per trained model. It doubles as the checkpoint used to resume a partial run.
#Columns '0' ... '90' hold the share of test predictions whose absolute difference from the
#actual value is one of 0, 10, ... up to that column's value
resultsPath = r'<path to file>\PreLaunchFunding.csv'
resultCols = ['Name', 'Estimator', 'Best Accuracy', 'Training Time','Best Config','0', '10', '20','30','40','50','60','70','80','90']

#Check if there was a previous partial run
prevRun = pd.DataFrame(columns=resultCols)
if os.path.exists(resultsPath):
    prevRun = pd.read_csv(resultsPath)
    prevRun.Name = prevRun.Name.astype(str)

#Counter variables
#NumResults is the number of category/subcategory combinations, i.e. how many models will be produced;
#it is only used for the progress indicator printed during training
NumResults = sum(1 for _ in Corpus.groupby(['CatID', 'SubCatID']))
currGroup = 1

#Start the actual training
for name, group in Corpus.groupby(['CatID', 'SubCatID']):
    print(name)

    #Skip combinations that already have a row in the results file
    if str(name) not in prevRun['Name'].unique():
        #Slice up the name variable for use in naming the model files
        #Example: the name is (1.0,220), this turns it into ['1.0','220']
        nameSplit = str(name).replace('(','').replace(')','').replace(' ','').split(',')

        #groupby already hands us exactly the rows of this combination,
        #so take the predictor and outcome columns straight from the group
        X = group[predictors]
        y = group[outcome]

        #Run tfidf on the blurb text and concatenate the resulting matrix to the other predictors
        vBlurb = TfidfVectorizer()
        enc_x = vBlurb.fit_transform(X['text_final_blurb'])
        #get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available
        if hasattr(vBlurb, 'get_feature_names_out'):
            vocabCols = vBlurb.get_feature_names_out()
        else:
            vocabCols = vBlurb.get_feature_names()
        df_enc_x = pd.DataFrame(enc_x.toarray(), columns=vocabCols)
        #Both frames need a fresh 0..n-1 index so that concat lines the rows up positionally
        X = X.drop(columns='text_final_blurb').reset_index(drop=True)
        res = pd.concat([X, df_enc_x], axis=1)

        #Splitting the dataset on an 80/20 split
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(res,y,test_size=0.2)

        #Using a try-except block in case a model doesn't train properly
        try:
            #Using AutoML and a beginning time budget of 10 minutes
            #If it can't train a model in that time, it will rerun but add 5 minutes each time
            #Budgets tried: 600, 900 and 1200 seconds (the next one, 1500, is past the 1381 limit)
            budgTime = 600
            modelFound = False
            while (not modelFound) and (budgTime < 1381):
                automl = AutoML()
                settings = {
                    "time_budget": budgTime,  # seconds
                    "metric": 'accuracy', # metric used for the evaluation
                    "task": 'classification', # type of the task
                    "early_stop": True
                }

                automl.fit(X_train=X_train, y_train=Y_train,
                           **settings)

                if automl.best_estimator is not None:
                    modelFound = True

                budgTime += 300

            #Make the "no model found" case explicit so nothing is scored or saved for this group
            if not modelFound:
                raise RuntimeError("AutoML found no model within the allowed time budgets")

            print('Best Machine Learning Algorithm:', automl.best_estimator)
            print('Best hyperparmeter configuration:', automl.best_config)
            print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
            print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

            #Make predictions and compare them to the actual values, row by row
            preds = automl.predict(X_test)
            actual = pd.Series(Y_test).reset_index(drop=True)
            predicted = pd.Series(np.asarray(preds).ravel())
            #Rows without a usable difference (NaN) are left out of both numerator and denominator
            diffs = (actual - predicted).abs().dropna()

            #Count the predictions that are off by exactly 0, 10, ... 90, then accumulate:
            #entry k is the share of predictions whose difference is one of 0, 10, ... k
            exactCounts = [(diffs == tol).sum() for tol in range(0, 100, 10)]
            bucketShares = (np.cumsum(exactCounts) / len(diffs)).tolist()

            #One flat result row: the values printed above plus the ten cumulative shares.
            #It must stay a flat list: the append branch writes it as-is, one element per CSV column
            dfCalc = [name,automl.best_estimator,1-automl.best_loss,automl.best_config_train_time,automl.best_config] + bucketShares

            #Export the model and vocabulary to separate files
            #Vocabulary is needed for testing the blurb text entered by the end user of the tool
            fileStr = "prefund-"+str(nameSplit[0])+"-"+str(nameSplit[1])
            joblib.dump(value=automl, filename="outputs/"+fileStr+".pkl")
            joblib.dump(value=vBlurb, filename="outputs/"+fileStr+"-vocab.pkl")

            #First result: create the file with a header row. Afterwards: append one row per model
            if not os.path.exists(resultsPath):
                df = pd.DataFrame([dfCalc],columns=resultCols)
                df.to_csv(resultsPath,index=False)
            else:
                append_list_as_row(resultsPath, dfCalc)
        except Exception as err:
            #Report which group failed and why, then carry on with the next group.
            #Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working
            print("Failed", name, repr(err))

    #Print the progress for the person doing the training
    print((currGroup/NumResults) * 100,"% complete")
    currGroup += 1

In [None]:
#Predicting final funding amount post-launch
#Set the predictor and outcome variables
predictors = ['text_final_blurb','DaysDiffCreateLaunch', 'DaysDiffLaunchDeadline',
       'CreatedDate_Month', 'CreatedDate_DayOfMonth', 'CreatedDate_Year', 'CreatedDate_DayOfWeek', 
       'DeadLineDate_Month', 'DeadLineDate_DayOfMonth', 'DeadLineDate_Year', 'DeadLineDate_DayOfWeek', 
        'LaunchedDate_Month', 'LaunchedDate_DayOfMonth', 'LaunchedDate_Year', 'LaunchedDate_DayOfWeek', 
        'NumPrevSuccess', 'NumPrevFail', 'spotlight_True', 'staff_pick_True', 'goal_21']
outcome = 'Tens'

#Results file: one row per trained model. It doubles as the checkpoint used to resume a partial run.
#Columns '0' ... '90' hold the share of test predictions whose absolute difference from the
#actual value is one of 0, 10, ... up to that column's value
resultsPath = r'<path to file>\PostLaunchFunding.csv'
resultCols = ['Name', 'Estimator', 'Best Accuracy', 'Training Time','Best Config','0', '10', '20','30','40','50','60','70','80','90']

#Check if there was a previous partial run
prevRun = pd.DataFrame(columns=resultCols)
if os.path.exists(resultsPath):
    prevRun = pd.read_csv(resultsPath)
    prevRun.Name = prevRun.Name.astype(str)

#Counter variables
#NumResults is the number of category/subcategory combinations, i.e. how many models will be produced;
#it is only used for the progress indicator printed during training
NumResults = sum(1 for _ in Corpus.groupby(['CatID', 'SubCatID']))
currGroup = 1

#Start the actual training
for name, group in Corpus.groupby(['CatID', 'SubCatID']):
    print(name)

    #Skip combinations that already have a row in the results file
    if str(name) not in prevRun['Name'].unique():
        #Slice up the name variable for use in naming the model files
        #Example: the name is (1.0,220), this turns it into ['1.0','220']
        nameSplit = str(name).replace('(','').replace(')','').replace(' ','').split(',')

        #groupby already hands us exactly the rows of this combination,
        #so take the predictor and outcome columns straight from the group
        X = group[predictors]
        y = group[outcome]

        #Run tfidf on the blurb text and concatenate the resulting matrix to the other predictors
        vBlurb = TfidfVectorizer()
        enc_x = vBlurb.fit_transform(X['text_final_blurb'])
        #get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available
        if hasattr(vBlurb, 'get_feature_names_out'):
            vocabCols = vBlurb.get_feature_names_out()
        else:
            vocabCols = vBlurb.get_feature_names()
        df_enc_x = pd.DataFrame(enc_x.toarray(), columns=vocabCols)
        #Both frames need a fresh 0..n-1 index so that concat lines the rows up positionally
        X = X.drop(columns='text_final_blurb').reset_index(drop=True)
        res = pd.concat([X, df_enc_x], axis=1)

        #Splitting the dataset on an 80/20 split
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(res,y,test_size=0.2)

        #Using a try-except block in case a model doesn't train properly
        try:
            #Using AutoML and a beginning time budget of 10 minutes
            #If it can't train a model in that time, it will rerun but add 5 minutes each time
            #Budgets tried: 600, 900 and 1200 seconds (the next one, 1500, is past the 1381 limit)
            budgTime = 600
            modelFound = False
            while (not modelFound) and (budgTime < 1381):
                automl = AutoML()
                settings = {
                    "time_budget": budgTime,  # seconds
                    "metric": 'accuracy', # metric used for the evaluation
                    "task": 'classification', # type of the task
                    "early_stop": True
                }

                automl.fit(X_train=X_train, y_train=Y_train,
                           **settings)

                if automl.best_estimator is not None:
                    modelFound = True

                budgTime += 300

            #Make the "no model found" case explicit so nothing is scored or saved for this group
            if not modelFound:
                raise RuntimeError("AutoML found no model within the allowed time budgets")

            print('Best Machine Learning Algorithm:', automl.best_estimator)
            print('Best hyperparmeter configuration:', automl.best_config)
            print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
            print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

            #Make predictions and compare them to the actual values, row by row
            preds = automl.predict(X_test)
            actual = pd.Series(Y_test).reset_index(drop=True)
            predicted = pd.Series(np.asarray(preds).ravel())
            #Rows without a usable difference (NaN) are left out of both numerator and denominator
            diffs = (actual - predicted).abs().dropna()

            #Count the predictions that are off by exactly 0, 10, ... 90, then accumulate:
            #entry k is the share of predictions whose difference is one of 0, 10, ... k
            exactCounts = [(diffs == tol).sum() for tol in range(0, 100, 10)]
            bucketShares = (np.cumsum(exactCounts) / len(diffs)).tolist()

            #One flat result row: the values printed above plus the ten cumulative shares.
            #It must stay a flat list: the append branch writes it as-is, one element per CSV column
            dfCalc = [name,automl.best_estimator,1-automl.best_loss,automl.best_config_train_time,automl.best_config] + bucketShares

            #Export the model and vocabulary to separate files
            #Vocabulary is needed for testing the blurb text entered by the end user of the tool
            fileStr = "postfund-"+str(nameSplit[0])+"-"+str(nameSplit[1])
            joblib.dump(value=automl, filename="outputs/"+fileStr+".pkl")
            joblib.dump(value=vBlurb, filename="outputs/"+fileStr+"-vocab.pkl")

            #First result: create the file with a header row. Afterwards: append one row per model
            if not os.path.exists(resultsPath):
                df = pd.DataFrame([dfCalc],columns=resultCols)
                df.to_csv(resultsPath,index=False)
            else:
                append_list_as_row(resultsPath, dfCalc)
        except Exception as err:
            #Report which group failed and why, then carry on with the next group.
            #Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working
            print("Failed", name, repr(err))

    #Print the progress for the person doing the training
    print((currGroup/NumResults) * 100,"% complete")
    currGroup += 1