In [2]:
import pandas as pd
import regex as re
from tqdm import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import lightgbm as lgbm
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import pickle
# loading the tensorflow universal sentence encoder 4 model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [3]:
# set dataframe display settings to show 1000 rows
pd.options.display.max_rows = 1000

In [4]:
# prep a data set with a column named 'cleanText' for model prediction
def prepDataForTextModel(predictionData):
    X_train = []
    for r in tqdm(predictionData.cleanText.values):
        emb = embed([r])
        review_emb = tf.reshape(emb, [-1]).numpy()
        X_train.append(review_emb)
    predictionData = np.array(X_train)
    return predictionData

In [5]:
# prep a dataset containing the columns ['emojiCount', 'followers', 'following','hashtagCount', 'urlCount',
# 'tweetCount', 'verified'] which are the required features for prediction
def prepDataForModel(predictionData):
    predictionData = predictionData[['emojiCount', 'followers', 'following', 'hashtagCount', 'urlCount', 'tweetCount', 'verified']]
    return predictionData

In [6]:
# creates trainingSet from human and bot dataset
def munge():
    # read bot and human datasets
    trainHumans = pd.read_csv('trainingDataHuman.csv')
    trainBots = pd.read_csv('trainingDataBots.csv')
    
    # Combine bot and human dataframes to create full trainingSet, then schuffle them
    frames = [trainHumans, trainBots]
    trainingSet = pd.concat(frames)
    trainingSet = trainingSet.sample(frac=1).reset_index(drop=True)
    
    # replacing na values in trainingSet with 'empty'
    trainingSet["cleanText"].fillna("empty", inplace = True)
    # changing true and false to 1 and 0
    trainingSet.verified = trainingSet.verified.replace({True: 1, False: 0})
    
    # embed clean text
    trainingSet['embededText'] = trainingSet.apply(lambda row : tf.reshape(embed([row["cleanText"]]),[-1]).numpy(), axis = 1)
    
    return trainingSet

In [7]:
# gets performance metrics from any set of true and predicted values
def getMetrics(y_true, y_pred):
    print(confusion_matrix(y_true, y_pred))
    print("Accuracy: " + str(accuracy_score(y_true, y_pred)))
    print("Precision: " + str(precision_score(y_true, y_pred)))
    print("Recall: " + str(recall_score(y_true, y_pred)))

In [8]:
# trains a text model given a training dataset
def trainTextModel(trainingSet):
    print(trainingSet.columns)
    X_train = []
    for r in tqdm(trainingSet.cleanText.values):
        emb = embed([r])
        review_emb = tf.reshape(emb, [-1]).numpy()
        X_train.append(review_emb)

    X_train = np.array(X_train)
    y_train = trainingSet.human.values

    validationSet = X_train[1846:]
    validation_y = trainingSet.iloc[1846:]
    X_train = X_train[0:1845]
    y_train = y_train[0:1845]
    
    # split data into train and testing data
    X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.2, random_state = 42)
    
    # get the train and test data for the training sequence
    train_data = lgbm.Dataset(X_train, label=y_train)
    test_data = lgbm.Dataset(X_test, label=y_test)

    # parameters we'll use for the prediction and pass to lightgbm model
    parameters = {
        'metric': 'auc',
        'boosting': 'dart',
        'num_leaves': 31,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 0
    }
    parameters,
    # train the classifier
    classifier = lgbm.train(parameters,
                            train_data,
                            valid_sets= test_data,
                            num_boost_round=1000,
                            early_stopping_rounds=100)
    
    # PREDICTION - Make prediction on validation dataset
    val_pred = classifier.predict(validationSet)

    # Insert predictions into validation dataframe
    validation_y.insert(1, "Pred", val_pred, True)
    
    # Round continuous predictions into true or false (0 or 1)
    validation_y['Pred'] = validation_y['Pred'].apply(lambda x: round(x))
    
    y_true = validation_y['human']
    y_pred = validation_y['Pred']
    
    getMetrics(y_true, y_pred)
    
    return classifier

In [9]:
# trains and returns model which accepts the following features
# 'emojiCount', 'followers', 'following', 'hashtagCount', 'urlCount', 'tweetCount', 'verified'
def trainModel(trainingSet):
    validationSet = trainingSet.tail(300)
    validation_y = trainingSet.tail(300)
    
    validationSet.drop(columns=['id', 'rawText', 'cleanText', 'human', 'embededText'], inplace=True)
    
    X_train = trainingSet.head(len(trainingSet)-300)
    y_train = trainingSet.head(len(trainingSet)-300).human.values
    
    X_train.drop(columns=['id', 'rawText', 'cleanText', 'human', 'embededText'], inplace=True)
    
    print(X_train.columns)
    # split data into train and testing data
    X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.2, random_state = 42)
    
    # get the train and test data for the training sequence
    train_data = lgbm.Dataset(X_train, label=y_train)
    test_data = lgbm.Dataset(X_test, label=y_test)

    # parameters we'll use for the prediction and pass to lightgbm model
    parameters = {
        'metric': 'auc',
        'boosting': 'dart',
        'num_leaves': 31,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 20,
        'learning_rate': 0.05,
        'verbose': 0
    }
    parameters,
    # train the classifier
    classifier = lgbm.train(parameters,
                            train_data,
                            valid_sets= test_data,
                            num_boost_round=1000,
                            early_stopping_rounds=100)
    
    # PREDICTION - Make prediction on validation dataset
    val_pred = classifier.predict(validationSet)

    # Insert predictions into validation dataframe
    validation_y.insert(1, "Pred", val_pred, True)
    
    # Round continuous predictions into true or false (0 or 1)
    validation_y['Pred'] = validation_y['Pred'].apply(lambda x: round(x))
    
    y_true = validation_y['human']
    y_pred = validation_y['Pred']
    
    getMetrics(y_true, y_pred)
    
    return classifier

In [10]:
# get a prediction from the ensamble model which creates a composite pred as a result of the output of the 
# textModel and standard model
def ensambleModel(textModel, model, predictionData):
    preppedTextData = prepDataForTextModel(predictionData)
    textPrediction = textModel.predict(preppedTextData)
    preppedModelData = prepDataForModel(predictionData)
    modelPrediction = model.predict(preppedModelData)
    
    compositePred = []
    
    for index in range(len(textPrediction)):
        compositePred.append(abs((textPrediction[index] + modelPrediction[index])/2))
        
    return compositePred
    

In [11]:
trainingSet = munge()
print(trainingSet)

                id                                            rawText  \
0     3.627460e+08  @DaveAsheville Awwww! Thanks, @DaveAsheville! ...   
1     2.185879e+09  RT @Google_IDN: 📢 Yuk, ikuti kelas pelatihan o...   
2     1.252200e+04  RT @TripIt: You don’t necessarily need a wide-...   
3     7.847614e+08  「うわ、このピアス、めちゃくちゃデザイン良くないっスか」「知らねーけど」「もうすぐ給料入るし...   
4     4.220886e+07  RT @a_dzyj: Anyone notice Ronnie &amp; Georgia...   
...            ...                                                ...   
2141  9.867755e+08  @QcmvCHPPaEyOX69 حاله نفسية 😔 RT @413_Ree: @hy...   
2142  1.057835e+08  大将からダシ出てるわ\n#マツコの知らない世界 うーむ\nうーーむ\nうーーー\n崩壊\nた...   
2143  4.164312e+07  MICA has been recognized for its civic engagem...   
2144  1.167698e+08  Have you seen my new visual to NIGHT RYDER yet...   
2145  8.085715e+17  थ – DEVANAGARI LETTER THA.svg (0925) https://t...   

                                              cleanText  emojiCount  \
0     daveasheville awwww thanks daveasheville i wou

In [12]:
textModel = trainTextModel(trainingSet)

  2%|▏         | 40/2146 [00:00<00:05, 395.93it/s]

Index(['id', 'rawText', 'cleanText', 'emojiCount', 'followers', 'following',
       'hashtagCount', 'urlCount', 'tweetCount', 'verified', 'human',
       'embededText'],
      dtype='object')


100%|██████████| 2146/2146 [00:04<00:00, 449.01it/s]


[1]	valid_0's auc: 0.860022
[2]	valid_0's auc: 0.892535
[3]	valid_0's auc: 0.896983
[4]	valid_0's auc: 0.909662
[5]	valid_0's auc: 0.921432
[6]	valid_0's auc: 0.922113
[7]	valid_0's auc: 0.924504
[8]	valid_0's auc: 0.926471
[9]	valid_0's auc: 0.927167
[10]	valid_0's auc: 0.929194
[11]	valid_0's auc: 0.927863
[12]	valid_0's auc: 0.927136
[13]	valid_0's auc: 0.925865
[14]	valid_0's auc: 0.925442
[15]	valid_0's auc: 0.926319
[16]	valid_0's auc: 0.924413
[17]	valid_0's auc: 0.925381
[18]	valid_0's auc: 0.924927
[19]	valid_0's auc: 0.924322
[20]	valid_0's auc: 0.923535
[21]	valid_0's auc: 0.924564
[22]	valid_0's auc: 0.925351
[23]	valid_0's auc: 0.92641
[24]	valid_0's auc: 0.927106
[25]	valid_0's auc: 0.927469
[26]	valid_0's auc: 0.92635
[27]	valid_0's auc: 0.928226
[28]	valid_0's auc: 0.928437
[29]	valid_0's auc: 0.928377
[30]	valid_0's auc: 0.929012
[31]	valid_0's auc: 0.928891
[32]	valid_0's auc: 0.930677
[33]	valid_0's auc: 0.929436
[34]	valid_0's auc: 0.929648
[35]	valid_0's auc: 0.930

[301]	valid_0's auc: 0.945292
[302]	valid_0's auc: 0.945806
[303]	valid_0's auc: 0.945776
[304]	valid_0's auc: 0.945715
[305]	valid_0's auc: 0.945594
[306]	valid_0's auc: 0.945534
[307]	valid_0's auc: 0.945564
[308]	valid_0's auc: 0.945231
[309]	valid_0's auc: 0.945201
[310]	valid_0's auc: 0.945564
[311]	valid_0's auc: 0.944868
[312]	valid_0's auc: 0.945019
[313]	valid_0's auc: 0.945292
[314]	valid_0's auc: 0.945231
[315]	valid_0's auc: 0.945292
[316]	valid_0's auc: 0.945655
[317]	valid_0's auc: 0.94505
[318]	valid_0's auc: 0.944959
[319]	valid_0's auc: 0.944868
[320]	valid_0's auc: 0.944717
[321]	valid_0's auc: 0.944717
[322]	valid_0's auc: 0.945382
[323]	valid_0's auc: 0.945867
[324]	valid_0's auc: 0.945927
[325]	valid_0's auc: 0.945867
[326]	valid_0's auc: 0.945806
[327]	valid_0's auc: 0.94511
[328]	valid_0's auc: 0.94511
[329]	valid_0's auc: 0.945504
[330]	valid_0's auc: 0.945473
[331]	valid_0's auc: 0.945443
[332]	valid_0's auc: 0.945413
[333]	valid_0's auc: 0.945443
[334]	valid_0

[594]	valid_0's auc: 0.949014
[595]	valid_0's auc: 0.948923
[596]	valid_0's auc: 0.948862
[597]	valid_0's auc: 0.948802
[598]	valid_0's auc: 0.948741
[599]	valid_0's auc: 0.948862
[600]	valid_0's auc: 0.948923
[601]	valid_0's auc: 0.949377
[602]	valid_0's auc: 0.949346
[603]	valid_0's auc: 0.949528
[604]	valid_0's auc: 0.949619
[605]	valid_0's auc: 0.949558
[606]	valid_0's auc: 0.949558
[607]	valid_0's auc: 0.949558
[608]	valid_0's auc: 0.949225
[609]	valid_0's auc: 0.949407
[610]	valid_0's auc: 0.949498
[611]	valid_0's auc: 0.949346
[612]	valid_0's auc: 0.949558
[613]	valid_0's auc: 0.949558
[614]	valid_0's auc: 0.949225
[615]	valid_0's auc: 0.949256
[616]	valid_0's auc: 0.949256
[617]	valid_0's auc: 0.949316
[618]	valid_0's auc: 0.949437
[619]	valid_0's auc: 0.949528
[620]	valid_0's auc: 0.949588
[621]	valid_0's auc: 0.949679
[622]	valid_0's auc: 0.949649
[623]	valid_0's auc: 0.949407
[624]	valid_0's auc: 0.949437
[625]	valid_0's auc: 0.949407
[626]	valid_0's auc: 0.949437
[627]	vali

[880]	valid_0's auc: 0.951979
[881]	valid_0's auc: 0.951918
[882]	valid_0's auc: 0.95216
[883]	valid_0's auc: 0.95216
[884]	valid_0's auc: 0.952009
[885]	valid_0's auc: 0.951888
[886]	valid_0's auc: 0.95207
[887]	valid_0's auc: 0.951828
[888]	valid_0's auc: 0.951646
[889]	valid_0's auc: 0.951737
[890]	valid_0's auc: 0.951828
[891]	valid_0's auc: 0.951949
[892]	valid_0's auc: 0.951737
[893]	valid_0's auc: 0.951737
[894]	valid_0's auc: 0.951828
[895]	valid_0's auc: 0.951797
[896]	valid_0's auc: 0.951797
[897]	valid_0's auc: 0.951828
[898]	valid_0's auc: 0.951828
[899]	valid_0's auc: 0.951737
[900]	valid_0's auc: 0.951525
[901]	valid_0's auc: 0.951797
[902]	valid_0's auc: 0.951737
[903]	valid_0's auc: 0.951767
[904]	valid_0's auc: 0.952009
[905]	valid_0's auc: 0.951949
[906]	valid_0's auc: 0.951979
[907]	valid_0's auc: 0.951979
[908]	valid_0's auc: 0.952039
[909]	valid_0's auc: 0.952009
[910]	valid_0's auc: 0.951949
[911]	valid_0's auc: 0.951676
[912]	valid_0's auc: 0.951737
[913]	valid_0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
model = trainModel(trainingSet)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Index(['emojiCount', 'followers', 'following', 'hashtagCount', 'urlCount',
       'tweetCount', 'verified'],
      dtype='object')
[1]	valid_0's auc: 0.954838
[2]	valid_0's auc: 0.974861
[3]	valid_0's auc: 0.978317
[4]	valid_0's auc: 0.97631
[5]	valid_0's auc: 0.974665
[6]	valid_0's auc: 0.977773
[7]	valid_0's auc: 0.978815
[8]	valid_0's auc: 0.982496
[9]	valid_0's auc: 0.984156
[10]	valid_0's auc: 0.984126
[11]	valid_0's auc: 0.983704
[12]	valid_0's auc: 0.984639
[13]	valid_0's auc: 0.984458
[14]	valid_0's auc: 0.984066
[15]	valid_0's auc: 0.984669
[16]	valid_0's auc: 0.983854
[17]	valid_0's auc: 0.983221
[18]	valid_0's auc: 0.984277
[19]	valid_0's auc: 0.984428
[20]	valid_0's auc: 0.984066
[21]	valid_0's auc: 0.984669
[22]	valid_0's auc: 0.984971
[23]	valid_0's auc: 0.985122
[24]	valid_0's auc: 0.985303
[25]	valid_0's auc: 0.985363
[26]	valid_0's auc: 0.985273
[27]	valid_0's auc: 0.986118
[28]	valid_0's auc: 0.986178
[29]	valid_0's auc: 0.986178
[30]	valid_0's auc: 0.985937
[31]	vali

[346]	valid_0's auc: 0.986118
[347]	valid_0's auc: 0.986118
[348]	valid_0's auc: 0.986088
[349]	valid_0's auc: 0.985937
[350]	valid_0's auc: 0.986057
[351]	valid_0's auc: 0.985575
[352]	valid_0's auc: 0.985756
[353]	valid_0's auc: 0.985635
[354]	valid_0's auc: 0.985605
[355]	valid_0's auc: 0.985605
[356]	valid_0's auc: 0.985725
[357]	valid_0's auc: 0.985725
[358]	valid_0's auc: 0.985333
[359]	valid_0's auc: 0.985333
[360]	valid_0's auc: 0.985031
[361]	valid_0's auc: 0.985725
[362]	valid_0's auc: 0.985514
[363]	valid_0's auc: 0.985514
[364]	valid_0's auc: 0.985484
[365]	valid_0's auc: 0.985725
[366]	valid_0's auc: 0.985786
[367]	valid_0's auc: 0.985756
[368]	valid_0's auc: 0.985816
[369]	valid_0's auc: 0.985816
[370]	valid_0's auc: 0.986118
[371]	valid_0's auc: 0.986118
[372]	valid_0's auc: 0.986027
[373]	valid_0's auc: 0.986389
[374]	valid_0's auc: 0.98645
[375]	valid_0's auc: 0.986389
[376]	valid_0's auc: 0.986389
[377]	valid_0's auc: 0.98648
[378]	valid_0's auc: 0.98645
[379]	valid_0

[622]	valid_0's auc: 0.982889
[623]	valid_0's auc: 0.982647
[624]	valid_0's auc: 0.982738
[625]	valid_0's auc: 0.983372
[626]	valid_0's auc: 0.983734
[627]	valid_0's auc: 0.984488
[628]	valid_0's auc: 0.984699
[629]	valid_0's auc: 0.984699
[630]	valid_0's auc: 0.984639
[631]	valid_0's auc: 0.98473
[632]	valid_0's auc: 0.984941
[633]	valid_0's auc: 0.984941
[634]	valid_0's auc: 0.984911
[635]	valid_0's auc: 0.984971
[636]	valid_0's auc: 0.984941
[637]	valid_0's auc: 0.984639
[638]	valid_0's auc: 0.984699
[639]	valid_0's auc: 0.984639
[640]	valid_0's auc: 0.98476
[641]	valid_0's auc: 0.98476
[642]	valid_0's auc: 0.98479
[643]	valid_0's auc: 0.984428
[644]	valid_0's auc: 0.984367
[645]	valid_0's auc: 0.984277
[646]	valid_0's auc: 0.983764
[647]	valid_0's auc: 0.983643
[648]	valid_0's auc: 0.982949
[649]	valid_0's auc: 0.982889
[650]	valid_0's auc: 0.982828
[651]	valid_0's auc: 0.98319
[652]	valid_0's auc: 0.98319
[653]	valid_0's auc: 0.983221
[654]	valid_0's auc: 0.98319
[655]	valid_0's a

[937]	valid_0's auc: 0.985243
[938]	valid_0's auc: 0.985273
[939]	valid_0's auc: 0.985273
[940]	valid_0's auc: 0.98482
[941]	valid_0's auc: 0.98482
[942]	valid_0's auc: 0.984549
[943]	valid_0's auc: 0.984549
[944]	valid_0's auc: 0.984639
[945]	valid_0's auc: 0.984669
[946]	valid_0's auc: 0.984669
[947]	valid_0's auc: 0.984549
[948]	valid_0's auc: 0.984488
[949]	valid_0's auc: 0.984428
[950]	valid_0's auc: 0.984398
[951]	valid_0's auc: 0.984458
[952]	valid_0's auc: 0.984428
[953]	valid_0's auc: 0.984458
[954]	valid_0's auc: 0.984518
[955]	valid_0's auc: 0.984367
[956]	valid_0's auc: 0.984398
[957]	valid_0's auc: 0.984488
[958]	valid_0's auc: 0.984549
[959]	valid_0's auc: 0.984609
[960]	valid_0's auc: 0.984609
[961]	valid_0's auc: 0.984609
[962]	valid_0's auc: 0.984609
[963]	valid_0's auc: 0.984549
[964]	valid_0's auc: 0.984549
[965]	valid_0's auc: 0.984609
[966]	valid_0's auc: 0.984669
[967]	valid_0's auc: 0.984669
[968]	valid_0's auc: 0.984699
[969]	valid_0's auc: 0.984669
[970]	valid_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
# validation set from training to demonstrate how ensamble model works
predictionSet = trainingSet.tail(300)

In [15]:

returnValues = ensambleModel(textModel, model, predictionSet)

100%|██████████| 300/300 [00:00<00:00, 443.97it/s]


In [16]:
# round predictions into 1 and 0s and get evaluation metrics
roundedReturnValues = [round(x) for x in returnValues]
getMetrics(predictionSet.human.values, roundedReturnValues)
tn, fp, fn, tp = confusion_matrix(predictionSet.human.values, roundedReturnValues).ravel()
print("True Negative: %s \n False Negative: %s \n True Positive: %s \n False Positive: %s" % (tn, fp, fn, tp))

[[173  12]
 [  7 108]]
Accuracy: 0.9366666666666666
Precision: 0.9
Recall: 0.9391304347826087
True Negative: 173 
 False Negative: 12 
 True Positive: 7 
 False Positive: 108


In [17]:
# save the text model to disk
filename = 'text_model.sav'
pickle.dump(textModel, open(filename, 'wb'))
print("Text Model Pickled")

Text Model Pickled


In [18]:
# save the model to disk
filename = 'standard_model.sav'
pickle.dump(model, open(filename, 'wb'))
print("Standard Model Pickled")

Standard Model Pickled
