In [1]:
import pandas as pd
import regex as re
from tqdm import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import lightgbm as lgbm
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import pickle
# loading the tensorflow universal sentence encoder 4 model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

2021-11-18 13:01:47.162498: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# set dataframe display settings to show 1000 rows
pd.options.display.max_rows = 1000

In [3]:
# prep a data set with a column named 'cleanText' for model prediction
def prepDataForTextModel(predictionData):
    X_train = []
    for r in tqdm(predictionData.cleanText.values):
        emb = embed([r])
        review_emb = tf.reshape(emb, [-1]).numpy()
        X_train.append(review_emb)
    predictionData = np.array(X_train)
    return predictionData

In [4]:
# prep a dataset containing the columns ['emojiCount', 'followers', 'following','hashtagCount', 'urlCount',
# 'tweetCount', 'verified'] which are the required features for prediction
def prepDataForModel(predictionData):
    predictionData = predictionData[['emojiCount', 'followers', 'following', 'hashtagCount', 'urlCount', 'tweetCount', 'verified']]
    return predictionData

In [5]:
# creates trainingSet from human and bot dataset
def munge():
    # read bot and human datasets
    trainHumans = pd.read_csv('trainingDataHuman.csv')
    trainBots = pd.read_csv('trainingDataBots.csv')
    
    # Combine bot and human dataframes to create full trainingSet, then schuffle them
    frames = [trainHumans, trainBots]
    trainingSet = pd.concat(frames)
    trainingSet = trainingSet.sample(frac=1).reset_index(drop=True)
    
    # replacing na values in trainingSet with 'empty'
    trainingSet["cleanText"].fillna("empty", inplace = True)
    # changing true and false to 1 and 0
    trainingSet.verified = trainingSet.verified.replace({True: 1, False: 0})
    
    # embed clean text
    trainingSet['embededText'] = trainingSet.apply(lambda row : tf.reshape(embed([row["cleanText"]]),[-1]).numpy(), axis = 1)
    
    return trainingSet

In [20]:
# gets performance metrics from any set of true and predicted values
def getMetrics(y_true, y_pred):
    print(confusion_matrix(y_true, y_pred))
    print("Accuracy: " + str(accuracy_score(y_true, y_pred)))
    print("Precision: " + str(precision_score(y_true, y_pred)))
    print("Recall: " + str(recall_score(y_true, y_pred)))

In [7]:
# trains a text model given a training dataset
def trainTextModel(trainingSet):
    print(trainingSet.columns)
    X_train = []
    for r in tqdm(trainingSet.cleanText.values):
        emb = embed([r])
        review_emb = tf.reshape(emb, [-1]).numpy()
        X_train.append(review_emb)

    X_train = np.array(X_train)
    y_train = trainingSet.human.values

    validationSet = X_train[1846:]
    validation_y = trainingSet.iloc[1846:]
    X_train = X_train[0:1845]
    y_train = y_train[0:1845]
    
    # split data into train and testing data
    X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.2, random_state = 42)
    
    # get the train and test data for the training sequence
    train_data = lgbm.Dataset(X_train, label=y_train)
    test_data = lgbm.Dataset(X_test, label=y_test)

    # parameters we'll use for the prediction and pass to lightgbm model
    parameters = {
        'metric': 'auc',
        'boosting': 'dart',
        'num_leaves': 31,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 0
    }
    parameters,
    # train the classifier
    classifier = lgbm.train(parameters,
                            train_data,
                            valid_sets= test_data,
                            num_boost_round=1000,
                            early_stopping_rounds=100)
    
    # PREDICTION - Make prediction on validation dataset
    val_pred = classifier.predict(validationSet)

    # Insert predictions into validation dataframe
    validation_y.insert(1, "Pred", val_pred, True)
    
    # Round continuous predictions into true or false (0 or 1)
    validation_y['Pred'] = validation_y['Pred'].apply(lambda x: round(x))
    
    y_true = validation_y['human']
    y_pred = validation_y['Pred']
    
    getMetrics(y_true, y_pred)
    
    return classifier

In [8]:
# trains and returns model which accepts the following features
# 'emojiCount', 'followers', 'following', 'hashtagCount', 'urlCount', 'tweetCount', 'verified'
def trainModel(trainingSet):
    validationSet = trainingSet.tail(300)
    validation_y = trainingSet.tail(300)
    
    validationSet.drop(columns=['id', 'rawText', 'cleanText', 'human', 'embededText'], inplace=True)
    
    X_train = trainingSet.head(len(trainingSet)-300)
    y_train = trainingSet.head(len(trainingSet)-300).human.values
    
    X_train.drop(columns=['id', 'rawText', 'cleanText', 'human', 'embededText'], inplace=True)
    
    print(X_train.columns)
    # split data into train and testing data
    X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.2, random_state = 42)
    
    # get the train and test data for the training sequence
    train_data = lgbm.Dataset(X_train, label=y_train)
    test_data = lgbm.Dataset(X_test, label=y_test)

    # parameters we'll use for the prediction and pass to lightgbm model
    parameters = {
        'metric': 'auc',
        'boosting': 'dart',
        'num_leaves': 31,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 0
    }
    parameters,
    # train the classifier
    classifier = lgbm.train(parameters,
                            train_data,
                            valid_sets= test_data,
                            num_boost_round=1000,
                            early_stopping_rounds=100)
    
    # PREDICTION - Make prediction on validation dataset
    val_pred = classifier.predict(validationSet)

    # Insert predictions into validation dataframe
    validation_y.insert(1, "Pred", val_pred, True)
    
    # Round continuous predictions into true or false (0 or 1)
    validation_y['Pred'] = validation_y['Pred'].apply(lambda x: round(x))
    
    y_true = validation_y['human']
    y_pred = validation_y['Pred']
    
    getMetrics(y_true, y_pred)
    
    return classifier

In [9]:
# get a prediction from the ensamble model which creates a composite pred as a result of the output of the 
# textModel and standard model
def ensambleModel(textModel, model, predictionData):
    preppedTextData = prepDataForTextModel(predictionData)
    textPrediction = textModel.predict(preppedTextData)
    preppedModelData = prepDataForModel(predictionData)
    modelPrediction = model.predict(preppedModelData)
    
    compositePred = []
    
    for index in range(len(textPrediction)):
        compositePred.append(abs((textPrediction[index] + modelPrediction[index])/2))
        
    return compositePred
    

In [10]:
trainingSet = munge()

In [11]:
textModel = trainTextModel(trainingSet)

Index(['id', 'rawText', 'cleanText', 'emojiCount', 'followers', 'following',
       'hashtagCount', 'urlCount', 'tweetCount', 'verified', 'human',
       'embededText'],
      dtype='object')


100%|██████████████████████████████████████| 2146/2146 [00:05<00:00, 426.53it/s]


You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.83519
[2]	valid_0's auc: 0.863944
[3]	valid_0's auc: 0.887039
[4]	valid_0's auc: 0.894053
[5]	valid_0's auc: 0.905295
[6]	valid_0's auc: 0.90458
[7]	valid_0's auc: 0.91149
[8]	valid_0's auc: 0.916314
[9]	valid_0's auc: 0.913932
[10]	valid_0's auc: 0.913247
[11]	valid_0's auc: 0.91557
[12]	valid_0's auc: 0.91417
[13]	valid_0's auc: 0.914587
[14]	valid_0's auc: 0.918488
[15]	valid_0's auc: 0.918935
[16]	valid_0's auc: 0.918995
[17]	valid_0's auc: 0.919858
[18]	valid_0's auc: 0.919412
[19]	valid_0's auc: 0.920096
[20]	valid_0's auc: 0.920365
[21]	valid_0's auc: 0.919739
[22]	valid_0's auc: 0.921943
[23]	valid_0's auc: 0.92102
[24]	valid_0's auc: 0.91962
[25]	valid_0's auc: 0.919382
[26]	valid_0's auc: 0.918578
[27]	valid_0's auc: 0.920156
[28]	valid_0's auc: 0.920275
[29]	valid_0's auc: 0.921228
[30]	valid_0's auc: 0.920752
[31]	valid_0's auc: 0.920484
[32]	valid_0's auc: 0.919948
[33]	valid_0's auc: 0.919739
[

[106]	valid_0's auc: 0.924028
[107]	valid_0's auc: 0.923849
[108]	valid_0's auc: 0.924474
[109]	valid_0's auc: 0.924355
[110]	valid_0's auc: 0.924504
[111]	valid_0's auc: 0.924564
[112]	valid_0's auc: 0.925636
[113]	valid_0's auc: 0.924117
[114]	valid_0's auc: 0.924057
[115]	valid_0's auc: 0.924534
[116]	valid_0's auc: 0.922836
[117]	valid_0's auc: 0.923194
[118]	valid_0's auc: 0.922032
[119]	valid_0's auc: 0.921913
[120]	valid_0's auc: 0.922062
[121]	valid_0's auc: 0.922211
[122]	valid_0's auc: 0.922151
[123]	valid_0's auc: 0.921705
[124]	valid_0's auc: 0.922241
[125]	valid_0's auc: 0.921824
[126]	valid_0's auc: 0.922062
[127]	valid_0's auc: 0.922211
[128]	valid_0's auc: 0.921913
[129]	valid_0's auc: 0.921407
[130]	valid_0's auc: 0.921109
[131]	valid_0's auc: 0.921169
[132]	valid_0's auc: 0.921318
[133]	valid_0's auc: 0.920692
[134]	valid_0's auc: 0.920811
[135]	valid_0's auc: 0.919799
[136]	valid_0's auc: 0.919888
[137]	valid_0's auc: 0.919858
[138]	valid_0's auc: 0.919977
[139]	vali

[205]	valid_0's auc: 0.924713
[206]	valid_0's auc: 0.923789
[207]	valid_0's auc: 0.923819
[208]	valid_0's auc: 0.924028
[209]	valid_0's auc: 0.924266
[210]	valid_0's auc: 0.924117
[211]	valid_0's auc: 0.924266
[212]	valid_0's auc: 0.924028
[213]	valid_0's auc: 0.923819
[214]	valid_0's auc: 0.923343
[215]	valid_0's auc: 0.922509
[216]	valid_0's auc: 0.92236
[217]	valid_0's auc: 0.922271
[218]	valid_0's auc: 0.922151
[219]	valid_0's auc: 0.922271
[220]	valid_0's auc: 0.922122
[221]	valid_0's auc: 0.922211
[222]	valid_0's auc: 0.922479
[223]	valid_0's auc: 0.921883
[224]	valid_0's auc: 0.921854
[225]	valid_0's auc: 0.921854
[226]	valid_0's auc: 0.921705
[227]	valid_0's auc: 0.921943
[228]	valid_0's auc: 0.922539
[229]	valid_0's auc: 0.922658
[230]	valid_0's auc: 0.922687
[231]	valid_0's auc: 0.922568
[232]	valid_0's auc: 0.92233
[233]	valid_0's auc: 0.922509
[234]	valid_0's auc: 0.92239
[235]	valid_0's auc: 0.922479
[236]	valid_0's auc: 0.923075
[237]	valid_0's auc: 0.922628
[238]	valid_0

[289]	valid_0's auc: 0.925338
[290]	valid_0's auc: 0.925546
[291]	valid_0's auc: 0.92641
[292]	valid_0's auc: 0.926202
[293]	valid_0's auc: 0.926202
[294]	valid_0's auc: 0.925457
[295]	valid_0's auc: 0.925606
[296]	valid_0's auc: 0.925815
[297]	valid_0's auc: 0.926083
[298]	valid_0's auc: 0.925993
[299]	valid_0's auc: 0.926142
[300]	valid_0's auc: 0.92638
[301]	valid_0's auc: 0.92644
[302]	valid_0's auc: 0.926261
[303]	valid_0's auc: 0.926321
[304]	valid_0's auc: 0.926291
[305]	valid_0's auc: 0.926351
[306]	valid_0's auc: 0.926351
[307]	valid_0's auc: 0.926261
[308]	valid_0's auc: 0.92641
[309]	valid_0's auc: 0.926529
[310]	valid_0's auc: 0.927393
[311]	valid_0's auc: 0.928018
[312]	valid_0's auc: 0.928108
[313]	valid_0's auc: 0.928137
[314]	valid_0's auc: 0.927989
[315]	valid_0's auc: 0.928078
[316]	valid_0's auc: 0.928257
[317]	valid_0's auc: 0.929001
[318]	valid_0's auc: 0.92909
[319]	valid_0's auc: 0.92915
[320]	valid_0's auc: 0.92909
[321]	valid_0's auc: 0.92909
[322]	valid_0's au

[381]	valid_0's auc: 0.932307
[382]	valid_0's auc: 0.932307
[383]	valid_0's auc: 0.93186
[384]	valid_0's auc: 0.932039
[385]	valid_0's auc: 0.932098
[386]	valid_0's auc: 0.932039
[387]	valid_0's auc: 0.932158
[388]	valid_0's auc: 0.93192
[389]	valid_0's auc: 0.932277
[390]	valid_0's auc: 0.932515
[391]	valid_0's auc: 0.932396
[392]	valid_0's auc: 0.932754
[393]	valid_0's auc: 0.932694
[394]	valid_0's auc: 0.932605
[395]	valid_0's auc: 0.933409
[396]	valid_0's auc: 0.933349
[397]	valid_0's auc: 0.933081
[398]	valid_0's auc: 0.932992
[399]	valid_0's auc: 0.932813
[400]	valid_0's auc: 0.933707
[401]	valid_0's auc: 0.933558
[402]	valid_0's auc: 0.933766
[403]	valid_0's auc: 0.933736
[404]	valid_0's auc: 0.933707
[405]	valid_0's auc: 0.933826
[406]	valid_0's auc: 0.933766
[407]	valid_0's auc: 0.933319
[408]	valid_0's auc: 0.933319
[409]	valid_0's auc: 0.933409
[410]	valid_0's auc: 0.933587
[411]	valid_0's auc: 0.933349
[412]	valid_0's auc: 0.933379
[413]	valid_0's auc: 0.933409
[414]	valid_

[474]	valid_0's auc: 0.932277
[475]	valid_0's auc: 0.932902
[476]	valid_0's auc: 0.932902
[477]	valid_0's auc: 0.932575
[478]	valid_0's auc: 0.932396
[479]	valid_0's auc: 0.932426
[480]	valid_0's auc: 0.932188
[481]	valid_0's auc: 0.932366
[482]	valid_0's auc: 0.932396
[483]	valid_0's auc: 0.932902
[484]	valid_0's auc: 0.932575
[485]	valid_0's auc: 0.932634
[486]	valid_0's auc: 0.932724
[487]	valid_0's auc: 0.933022
[488]	valid_0's auc: 0.932902
[489]	valid_0's auc: 0.9332
[490]	valid_0's auc: 0.9332
[491]	valid_0's auc: 0.9332
[492]	valid_0's auc: 0.93326
[493]	valid_0's auc: 0.93323
[494]	valid_0's auc: 0.93326
[495]	valid_0's auc: 0.933349
[496]	valid_0's auc: 0.93329
[497]	valid_0's auc: 0.933498
[498]	valid_0's auc: 0.933409
[499]	valid_0's auc: 0.933379
[500]	valid_0's auc: 0.933349
[501]	valid_0's auc: 0.93329
[502]	valid_0's auc: 0.933707
[503]	valid_0's auc: 0.933766
[504]	valid_0's auc: 0.933766
[505]	valid_0's auc: 0.933826
[506]	valid_0's auc: 0.934034
[507]	valid_0's auc: 

[563]	valid_0's auc: 0.934064
[564]	valid_0's auc: 0.934094
[565]	valid_0's auc: 0.933915
[566]	valid_0's auc: 0.933677
[567]	valid_0's auc: 0.9332
[568]	valid_0's auc: 0.933141
[569]	valid_0's auc: 0.933111
[570]	valid_0's auc: 0.933171
[571]	valid_0's auc: 0.932783
[572]	valid_0's auc: 0.9332
[573]	valid_0's auc: 0.933081
[574]	valid_0's auc: 0.933081
[575]	valid_0's auc: 0.932932
[576]	valid_0's auc: 0.932813
[577]	valid_0's auc: 0.932813
[578]	valid_0's auc: 0.932813
[579]	valid_0's auc: 0.932813
[580]	valid_0's auc: 0.932843
[581]	valid_0's auc: 0.932843
[582]	valid_0's auc: 0.932813
[583]	valid_0's auc: 0.932456
[584]	valid_0's auc: 0.932515
[585]	valid_0's auc: 0.932337
[586]	valid_0's auc: 0.932366
[587]	valid_0's auc: 0.932575
[588]	valid_0's auc: 0.932456
[589]	valid_0's auc: 0.932456
[590]	valid_0's auc: 0.932426
[591]	valid_0's auc: 0.932486
[592]	valid_0's auc: 0.932456
[593]	valid_0's auc: 0.932813
[594]	valid_0's auc: 0.932902
[595]	valid_0's auc: 0.932873
[596]	valid_0'

[662]	valid_0's auc: 0.931652
[663]	valid_0's auc: 0.931741
[664]	valid_0's auc: 0.931681
[665]	valid_0's auc: 0.931741
[666]	valid_0's auc: 0.931652
[667]	valid_0's auc: 0.931681
[668]	valid_0's auc: 0.931681
[669]	valid_0's auc: 0.931473
[670]	valid_0's auc: 0.931503
[671]	valid_0's auc: 0.931473
[672]	valid_0's auc: 0.931384
[673]	valid_0's auc: 0.931294
[674]	valid_0's auc: 0.931175
[675]	valid_0's auc: 0.931175
[676]	valid_0's auc: 0.931086
[677]	valid_0's auc: 0.930907
[678]	valid_0's auc: 0.930877
[679]	valid_0's auc: 0.930699
[680]	valid_0's auc: 0.93046
[681]	valid_0's auc: 0.93049
[682]	valid_0's auc: 0.93055
[683]	valid_0's auc: 0.930639
[684]	valid_0's auc: 0.930877
[685]	valid_0's auc: 0.930848
[686]	valid_0's auc: 0.931354
[687]	valid_0's auc: 0.931413
[688]	valid_0's auc: 0.931384
[689]	valid_0's auc: 0.931562
[690]	valid_0's auc: 0.931711
[691]	valid_0's auc: 0.931771
[692]	valid_0's auc: 0.931681
[693]	valid_0's auc: 0.931949
[694]	valid_0's auc: 0.931533
[695]	valid_0

[764]	valid_0's auc: 0.935702
[765]	valid_0's auc: 0.935702
[766]	valid_0's auc: 0.935762
[767]	valid_0's auc: 0.935762
[768]	valid_0's auc: 0.935702
[769]	valid_0's auc: 0.935583
[770]	valid_0's auc: 0.935583
[771]	valid_0's auc: 0.935523
[772]	valid_0's auc: 0.935553
[773]	valid_0's auc: 0.935523
[774]	valid_0's auc: 0.935553
[775]	valid_0's auc: 0.935553
[776]	valid_0's auc: 0.935613
[777]	valid_0's auc: 0.935613
[778]	valid_0's auc: 0.935702
[779]	valid_0's auc: 0.935642
[780]	valid_0's auc: 0.935642
[781]	valid_0's auc: 0.935732
[782]	valid_0's auc: 0.935523
[783]	valid_0's auc: 0.935583
[784]	valid_0's auc: 0.935613
[785]	valid_0's auc: 0.935613
[786]	valid_0's auc: 0.935642
[787]	valid_0's auc: 0.935613
[788]	valid_0's auc: 0.935642
[789]	valid_0's auc: 0.935404
[790]	valid_0's auc: 0.935762
[791]	valid_0's auc: 0.935613
[792]	valid_0's auc: 0.935732
[793]	valid_0's auc: 0.935672
[794]	valid_0's auc: 0.935642
[795]	valid_0's auc: 0.93594
[796]	valid_0's auc: 0.935851
[797]	valid

[864]	valid_0's auc: 0.936506
[865]	valid_0's auc: 0.936506
[866]	valid_0's auc: 0.936953
[867]	valid_0's auc: 0.937042
[868]	valid_0's auc: 0.937072
[869]	valid_0's auc: 0.936804
[870]	valid_0's auc: 0.936834
[871]	valid_0's auc: 0.936863
[872]	valid_0's auc: 0.936953
[873]	valid_0's auc: 0.937072
[874]	valid_0's auc: 0.936953
[875]	valid_0's auc: 0.936953
[876]	valid_0's auc: 0.937191
[877]	valid_0's auc: 0.937102
[878]	valid_0's auc: 0.937072
[879]	valid_0's auc: 0.937042
[880]	valid_0's auc: 0.937072
[881]	valid_0's auc: 0.937042
[882]	valid_0's auc: 0.937012
[883]	valid_0's auc: 0.937012
[884]	valid_0's auc: 0.936953
[885]	valid_0's auc: 0.937102
[886]	valid_0's auc: 0.93737
[887]	valid_0's auc: 0.937191
[888]	valid_0's auc: 0.937459
[889]	valid_0's auc: 0.937429
[890]	valid_0's auc: 0.937399
[891]	valid_0's auc: 0.937251
[892]	valid_0's auc: 0.937072
[893]	valid_0's auc: 0.937131
[894]	valid_0's auc: 0.93734
[895]	valid_0's auc: 0.93731
[896]	valid_0's auc: 0.93731
[897]	valid_0'

[963]	valid_0's auc: 0.937876
[964]	valid_0's auc: 0.937787
[965]	valid_0's auc: 0.937846
[966]	valid_0's auc: 0.937906
[967]	valid_0's auc: 0.937906
[968]	valid_0's auc: 0.937936
[969]	valid_0's auc: 0.937965
[970]	valid_0's auc: 0.937995
[971]	valid_0's auc: 0.938025
[972]	valid_0's auc: 0.938084
[973]	valid_0's auc: 0.938204
[974]	valid_0's auc: 0.938084
[975]	valid_0's auc: 0.938025
[976]	valid_0's auc: 0.937965
[977]	valid_0's auc: 0.937995
[978]	valid_0's auc: 0.938144
[979]	valid_0's auc: 0.938084
[980]	valid_0's auc: 0.937668
[981]	valid_0's auc: 0.937668
[982]	valid_0's auc: 0.937697
[983]	valid_0's auc: 0.937757
[984]	valid_0's auc: 0.937608
[985]	valid_0's auc: 0.937668
[986]	valid_0's auc: 0.937697
[987]	valid_0's auc: 0.937697
[988]	valid_0's auc: 0.937787
[989]	valid_0's auc: 0.937757
[990]	valid_0's auc: 0.937816
[991]	valid_0's auc: 0.937638
[992]	valid_0's auc: 0.937519
[993]	valid_0's auc: 0.937519
[994]	valid_0's auc: 0.937548
[995]	valid_0's auc: 0.937578
[996]	vali

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_y['Pred'] = validation_y['Pred'].apply(lambda x: round(x))


In [12]:
model = trainModel(trainingSet)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Index(['emojiCount', 'followers', 'following', 'hashtagCount', 'urlCount',
       'tweetCount', 'verified'],
      dtype='object')
You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.940225
[2]	valid_0's auc: 0.982251
[3]	valid_0's auc: 0.977149
[4]	valid_0's auc: 0.974554
[5]	valid_0's auc: 0.985791
[6]	valid_0's auc: 0.988807
[7]	valid_0's auc: 0.987997
[8]	valid_0's auc: 0.986047
[9]	valid_0's auc: 0.989107
[10]	valid_0's auc: 0.987847
[11]	valid_0's auc: 0.989167
[12]	valid_0's auc: 0.988177
[13]	valid_0's auc: 0.989467
[14]	valid_0's auc: 0.988987
[15]	valid_0's auc: 0.988177
[16]	valid_0's auc: 0.987277
[17]	valid_0's auc: 0.988567
[18]	valid_0's auc: 0.988297
[19]	valid_0's auc: 0.987637
[20]	valid_0's auc: 0.987517
[21]	valid_0's auc: 0.987427
[22]	valid_0's auc: 0.988837
[23]	valid_0's auc: 0.989017
[24]	valid_0's auc: 0.989977
[25]	valid_0's auc: 0.990068
[26]	valid_0's auc: 0.990038
[27]	valid_0's auc: 0.989887
[28]	valid_0's auc: 0.990248
[29]	val

[113]	valid_0's auc: 0.991898
[114]	valid_0's auc: 0.992198
[115]	valid_0's auc: 0.992138
[116]	valid_0's auc: 0.991838
[117]	valid_0's auc: 0.991748
[118]	valid_0's auc: 0.991658
[119]	valid_0's auc: 0.991688
[120]	valid_0's auc: 0.991718
[121]	valid_0's auc: 0.991718
[122]	valid_0's auc: 0.991718
[123]	valid_0's auc: 0.991658
[124]	valid_0's auc: 0.991748
[125]	valid_0's auc: 0.991658
[126]	valid_0's auc: 0.991658
[127]	valid_0's auc: 0.991568
[128]	valid_0's auc: 0.991568
[129]	valid_0's auc: 0.991568
[130]	valid_0's auc: 0.991568
[131]	valid_0's auc: 0.991628
[132]	valid_0's auc: 0.991448
[133]	valid_0's auc: 0.991358
[134]	valid_0's auc: 0.991418
[135]	valid_0's auc: 0.991448
[136]	valid_0's auc: 0.991418
[137]	valid_0's auc: 0.991478
[138]	valid_0's auc: 0.991118
[139]	valid_0's auc: 0.990968
[140]	valid_0's auc: 0.990968
[141]	valid_0's auc: 0.990998
[142]	valid_0's auc: 0.991028
[143]	valid_0's auc: 0.991418
[144]	valid_0's auc: 0.991358
[145]	valid_0's auc: 0.991328
[146]	vali

[204]	valid_0's auc: 0.991118
[205]	valid_0's auc: 0.991088
[206]	valid_0's auc: 0.991118
[207]	valid_0's auc: 0.991118
[208]	valid_0's auc: 0.991148
[209]	valid_0's auc: 0.991148
[210]	valid_0's auc: 0.991148
[211]	valid_0's auc: 0.991178
[212]	valid_0's auc: 0.991028
[213]	valid_0's auc: 0.990968
[214]	valid_0's auc: 0.990818
[215]	valid_0's auc: 0.990938
[216]	valid_0's auc: 0.990968
[217]	valid_0's auc: 0.990758
[218]	valid_0's auc: 0.990548
[219]	valid_0's auc: 0.990548
[220]	valid_0's auc: 0.990248
[221]	valid_0's auc: 0.990308
[222]	valid_0's auc: 0.990068
[223]	valid_0's auc: 0.990218
[224]	valid_0's auc: 0.990128
[225]	valid_0's auc: 0.990098
[226]	valid_0's auc: 0.990128
[227]	valid_0's auc: 0.990188
[228]	valid_0's auc: 0.990128
[229]	valid_0's auc: 0.990158
[230]	valid_0's auc: 0.990158
[231]	valid_0's auc: 0.990098
[232]	valid_0's auc: 0.989917
[233]	valid_0's auc: 0.989977
[234]	valid_0's auc: 0.990038
[235]	valid_0's auc: 0.989977
[236]	valid_0's auc: 0.989677
[237]	vali

[344]	valid_0's auc: 0.989317
[345]	valid_0's auc: 0.989437
[346]	valid_0's auc: 0.989317
[347]	valid_0's auc: 0.989347
[348]	valid_0's auc: 0.989587
[349]	valid_0's auc: 0.989497
[350]	valid_0's auc: 0.989557
[351]	valid_0's auc: 0.989617
[352]	valid_0's auc: 0.989497
[353]	valid_0's auc: 0.989797
[354]	valid_0's auc: 0.989797
[355]	valid_0's auc: 0.989797
[356]	valid_0's auc: 0.990038
[357]	valid_0's auc: 0.990038
[358]	valid_0's auc: 0.990008
[359]	valid_0's auc: 0.990008
[360]	valid_0's auc: 0.990158
[361]	valid_0's auc: 0.990098
[362]	valid_0's auc: 0.990218
[363]	valid_0's auc: 0.990218
[364]	valid_0's auc: 0.990218
[365]	valid_0's auc: 0.990308
[366]	valid_0's auc: 0.990278
[367]	valid_0's auc: 0.990278
[368]	valid_0's auc: 0.990308
[369]	valid_0's auc: 0.990368
[370]	valid_0's auc: 0.990368
[371]	valid_0's auc: 0.990368
[372]	valid_0's auc: 0.990428
[373]	valid_0's auc: 0.990488
[374]	valid_0's auc: 0.990548
[375]	valid_0's auc: 0.990458
[376]	valid_0's auc: 0.990458
[377]	vali

[472]	valid_0's auc: 0.990758
[473]	valid_0's auc: 0.990968
[474]	valid_0's auc: 0.990818
[475]	valid_0's auc: 0.990818
[476]	valid_0's auc: 0.990878
[477]	valid_0's auc: 0.990848
[478]	valid_0's auc: 0.990848
[479]	valid_0's auc: 0.990848
[480]	valid_0's auc: 0.990908
[481]	valid_0's auc: 0.990878
[482]	valid_0's auc: 0.990878
[483]	valid_0's auc: 0.990788
[484]	valid_0's auc: 0.990728
[485]	valid_0's auc: 0.990758
[486]	valid_0's auc: 0.990848
[487]	valid_0's auc: 0.990668
[488]	valid_0's auc: 0.990698
[489]	valid_0's auc: 0.990758
[490]	valid_0's auc: 0.990758
[491]	valid_0's auc: 0.990758
[492]	valid_0's auc: 0.990728
[493]	valid_0's auc: 0.990728
[494]	valid_0's auc: 0.990818
[495]	valid_0's auc: 0.990698
[496]	valid_0's auc: 0.990608
[497]	valid_0's auc: 0.990458
[498]	valid_0's auc: 0.990668
[499]	valid_0's auc: 0.990668
[500]	valid_0's auc: 0.990698
[501]	valid_0's auc: 0.990638
[502]	valid_0's auc: 0.990638
[503]	valid_0's auc: 0.990668
[504]	valid_0's auc: 0.990668
[505]	vali

[561]	valid_0's auc: 0.990188
[562]	valid_0's auc: 0.990188
[563]	valid_0's auc: 0.990188
[564]	valid_0's auc: 0.990188
[565]	valid_0's auc: 0.990158
[566]	valid_0's auc: 0.990248
[567]	valid_0's auc: 0.990488
[568]	valid_0's auc: 0.990488
[569]	valid_0's auc: 0.990488
[570]	valid_0's auc: 0.990488
[571]	valid_0's auc: 0.990608
[572]	valid_0's auc: 0.990698
[573]	valid_0's auc: 0.990698
[574]	valid_0's auc: 0.990698
[575]	valid_0's auc: 0.990578
[576]	valid_0's auc: 0.990638
[577]	valid_0's auc: 0.990608
[578]	valid_0's auc: 0.990668
[579]	valid_0's auc: 0.990638
[580]	valid_0's auc: 0.990668
[581]	valid_0's auc: 0.990908
[582]	valid_0's auc: 0.990908
[583]	valid_0's auc: 0.990878
[584]	valid_0's auc: 0.990818
[585]	valid_0's auc: 0.990968
[586]	valid_0's auc: 0.991028
[587]	valid_0's auc: 0.991058
[588]	valid_0's auc: 0.991058
[589]	valid_0's auc: 0.991088
[590]	valid_0's auc: 0.991088
[591]	valid_0's auc: 0.991088
[592]	valid_0's auc: 0.991148
[593]	valid_0's auc: 0.991238
[594]	vali

[650]	valid_0's auc: 0.990608
[651]	valid_0's auc: 0.990788
[652]	valid_0's auc: 0.990758
[653]	valid_0's auc: 0.990788
[654]	valid_0's auc: 0.990698
[655]	valid_0's auc: 0.990608
[656]	valid_0's auc: 0.990698
[657]	valid_0's auc: 0.990698
[658]	valid_0's auc: 0.990548
[659]	valid_0's auc: 0.990578
[660]	valid_0's auc: 0.990638
[661]	valid_0's auc: 0.990638
[662]	valid_0's auc: 0.990458
[663]	valid_0's auc: 0.990458
[664]	valid_0's auc: 0.990488
[665]	valid_0's auc: 0.990488
[666]	valid_0's auc: 0.990488
[667]	valid_0's auc: 0.990488
[668]	valid_0's auc: 0.990518
[669]	valid_0's auc: 0.990518
[670]	valid_0's auc: 0.990518
[671]	valid_0's auc: 0.990488
[672]	valid_0's auc: 0.990638
[673]	valid_0's auc: 0.990638
[674]	valid_0's auc: 0.990668
[675]	valid_0's auc: 0.990758
[676]	valid_0's auc: 0.990788
[677]	valid_0's auc: 0.990608
[678]	valid_0's auc: 0.990638
[679]	valid_0's auc: 0.990548
[680]	valid_0's auc: 0.990518
[681]	valid_0's auc: 0.990518
[682]	valid_0's auc: 0.990518
[683]	vali

[736]	valid_0's auc: 0.990758
[737]	valid_0's auc: 0.990788
[738]	valid_0's auc: 0.990818
[739]	valid_0's auc: 0.990728
[740]	valid_0's auc: 0.990788
[741]	valid_0's auc: 0.990788
[742]	valid_0's auc: 0.990638
[743]	valid_0's auc: 0.990608
[744]	valid_0's auc: 0.990488
[745]	valid_0's auc: 0.990488
[746]	valid_0's auc: 0.990458
[747]	valid_0's auc: 0.990428
[748]	valid_0's auc: 0.990338
[749]	valid_0's auc: 0.990308
[750]	valid_0's auc: 0.990218
[751]	valid_0's auc: 0.990188
[752]	valid_0's auc: 0.990158
[753]	valid_0's auc: 0.990128
[754]	valid_0's auc: 0.990158
[755]	valid_0's auc: 0.990128
[756]	valid_0's auc: 0.990158
[757]	valid_0's auc: 0.990158
[758]	valid_0's auc: 0.990098
[759]	valid_0's auc: 0.990008
[760]	valid_0's auc: 0.990068
[761]	valid_0's auc: 0.990038
[762]	valid_0's auc: 0.989917
[763]	valid_0's auc: 0.989917
[764]	valid_0's auc: 0.989947
[765]	valid_0's auc: 0.989917
[766]	valid_0's auc: 0.989917
[767]	valid_0's auc: 0.989887
[768]	valid_0's auc: 0.989887
[769]	vali

[856]	valid_0's auc: 0.990038
[857]	valid_0's auc: 0.990038
[858]	valid_0's auc: 0.990038
[859]	valid_0's auc: 0.990038
[860]	valid_0's auc: 0.990068
[861]	valid_0's auc: 0.990068
[862]	valid_0's auc: 0.990218
[863]	valid_0's auc: 0.990218
[864]	valid_0's auc: 0.990248
[865]	valid_0's auc: 0.990158
[866]	valid_0's auc: 0.990188
[867]	valid_0's auc: 0.990158
[868]	valid_0's auc: 0.990158
[869]	valid_0's auc: 0.990098
[870]	valid_0's auc: 0.990098
[871]	valid_0's auc: 0.990098
[872]	valid_0's auc: 0.990128
[873]	valid_0's auc: 0.990128
[874]	valid_0's auc: 0.990128
[875]	valid_0's auc: 0.990128
[876]	valid_0's auc: 0.990098
[877]	valid_0's auc: 0.990098
[878]	valid_0's auc: 0.990068
[879]	valid_0's auc: 0.990068
[880]	valid_0's auc: 0.990068
[881]	valid_0's auc: 0.990068
[882]	valid_0's auc: 0.989977
[883]	valid_0's auc: 0.989947
[884]	valid_0's auc: 0.990068
[885]	valid_0's auc: 0.990008
[886]	valid_0's auc: 0.990098
[887]	valid_0's auc: 0.989917
[888]	valid_0's auc: 0.989947
[889]	vali

[960]	valid_0's auc: 0.990218
[961]	valid_0's auc: 0.990188
[962]	valid_0's auc: 0.990278
[963]	valid_0's auc: 0.990188
[964]	valid_0's auc: 0.990218
[965]	valid_0's auc: 0.990188
[966]	valid_0's auc: 0.990158
[967]	valid_0's auc: 0.990188
[968]	valid_0's auc: 0.990158
[969]	valid_0's auc: 0.990158
[970]	valid_0's auc: 0.989977
[971]	valid_0's auc: 0.990038
[972]	valid_0's auc: 0.990068
[973]	valid_0's auc: 0.990128
[974]	valid_0's auc: 0.990098
[975]	valid_0's auc: 0.990128
[976]	valid_0's auc: 0.990008
[977]	valid_0's auc: 0.990008
[978]	valid_0's auc: 0.989977
[979]	valid_0's auc: 0.989977
[980]	valid_0's auc: 0.990008
[981]	valid_0's auc: 0.990038
[982]	valid_0's auc: 0.990008
[983]	valid_0's auc: 0.989917
[984]	valid_0's auc: 0.989887
[985]	valid_0's auc: 0.989887
[986]	valid_0's auc: 0.989827
[987]	valid_0's auc: 0.989827
[988]	valid_0's auc: 0.989857
[989]	valid_0's auc: 0.989857
[990]	valid_0's auc: 0.989857
[991]	valid_0's auc: 0.989977
[992]	valid_0's auc: 0.989887
[993]	vali

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_y['Pred'] = validation_y['Pred'].apply(lambda x: round(x))


In [13]:
# validation set from training to demonstrate how ensamble model works
predictionSet = trainingSet.tail(300)

In [14]:

returnValues = ensambleModel(textModel, model, predictionSet)

100%|████████████████████████████████████████| 300/300 [00:00<00:00, 417.83it/s]


In [35]:
# round predictions into 1 and 0s and get evaluation metrics
roundedReturnValues = [round(x) for x in returnValues]
getMetrics(predictionSet.human.values, roundedReturnValues)
tn, fp, fn, tp = confusion_matrix(predictionSet.human.values, roundedReturnValues).ravel()
print("True Negative: %s \n False Negative: %s \n True Positive: %s \n False Positive: %s" % (tn, fp, fn, tp))

[[187   6]
 [ 12  95]]
Accuracy: 0.94
Precision: 0.9405940594059405
Recall: 0.8878504672897196
True Negative: 187 
 False Negative: 6 
 True Positive: 12 
 False Positive: 95


In [16]:
# save the text model to disk
filename = 'text_model.sav'
pickle.dump(textModel, open(filename, 'wb'))
print("Text Model Pickled")

Text Model Pickled


In [17]:
# save the model to disk
filename = 'standard_model.sav'
pickle.dump(model, open(filename, 'wb'))
print("Standard Model Pickled")

Standard Model Pickled
