Load the dataset and partition its features into the defined feature subsets

In [None]:
import pandas as pd
# Full dataset: one row per account, with a 'label' column as the target.
data = pd.read_csv('fyp_dataset.csv')


def _feature_subset(*columns):
    """Return a new DataFrame with the given feature columns followed by 'label'."""
    return pd.DataFrame(data=data[[*columns, 'label']])


# Each subset groups related features; 'label' is carried along as the target.
account = _feature_subset('location', 'account_age', 'verified', 'tweets')
tweet_construction = _feature_subset(
    'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'lexical_diversity',
)
activity = _feature_subset('following', 'followers', 'likes_given', 'likes_received')
interactiveness = _feature_subset(
    'hashtag_count', 'mentions_count', 'url_count', 'retweets_count',
)
tweet_source = _feature_subset(
    'Twitter for iPhone',
    'Twitter for Andriod',
    'Twitter for iPad',
    'Twitter for Web Client',
    'Twitter for Websites',
    'Twitter for Web App',
    'Other',
)

Use randomized search (RandomizedSearchCV) to select the best classifier for each feature subset. Considers k-Nearest Neighbours, Random Forest, Logistic Regression and Decision Tree as candidate classifiers.

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


def _candidate_models():
    """Return the candidate classifiers with their hyper-parameter search spaces.

    Returns:
        A list of ``(done_message, estimator, param_grid)`` tuples, in the order
        the candidates are tried: kNN, Random Forest, Logistic Regression,
        Decision Tree. ``done_message`` is printed once the candidate is tuned.
    """
    # np.linspace(20, 50, num=2) yields only the two end points, 20 and 50.
    knn_grid = {'n_neighbors': [int(x) for x in np.linspace(start = 20, stop = 50, num = 2)],
                'leaf_size': [int(x) for x in np.linspace(start = 20, stop = 50, num = 2)],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

    # NOTE(review): max_features='auto' was deprecated for tree-based classifiers
    # in scikit-learn 1.1 and removed in 1.3 (for classifiers it equals 'sqrt');
    # confirm the installed version still accepts it.
    rf_max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    rf_max_depth.append(None)  # None lets trees grow until the leaves are pure
    rf_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': rf_max_depth,
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

    # Only 7 values exist here, fewer than the 10 iterations requested below.
    log_reg_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

    d_tree_grid = {'criterion': ['gini', 'entropy'],
                   'max_features': ['auto', 'sqrt', 'log2'],
                   'min_samples_split': [int(x) for x in np.linspace(2, 110, num = 20)],
                   'min_samples_leaf': [int(x) for x in np.linspace(2, 110, num = 20)]}

    return [
        ('knn done', KNeighborsClassifier(), knn_grid),
        ('Random Forest done', RandomForestClassifier(), rf_grid),
        ('Loegistic Regression done', LogisticRegression(solver='lbfgs'), log_reg_grid),
        ('Decision tree done', DecisionTreeClassifier(), d_tree_grid),
    ]


def _tune(estimator, param_grid, X_train, y_train):
    """Randomized-search ``param_grid`` for ``estimator`` and return the best model.

    Uses 5-fold cross-validation over 10 sampled parameter combinations, on all
    available cores, with a fixed seed so the sampled combinations repeat.
    """
    search = RandomizedSearchCV(estimator = estimator, param_distributions = param_grid,
                                n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)
    search.fit(X_train, y_train)
    return search.best_estimator_


def _best_classifier_for(frame):
    """Tune every candidate on one feature subset and keep the best one.

    Args:
        frame: DataFrame of feature columns plus a 'label' column. It is not
            modified, so the cell can be re-run without rebuilding the subsets.

    Returns:
        ``(classifier, score)``: the tuned classifier with the highest score on
        the held-out test split, and that score. On a tie the earlier candidate
        is kept.
    """
    y = frame['label'].values
    X = frame.drop(columns='label').values

    # Split data into train and test sets, preserving the class balance.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

    # NOTE(review): the same test split both picks the winner and provides the
    # reported score, so that score is an optimistic estimate.
    best_model, best_score = None, None
    for done_message, estimator, param_grid in _candidate_models():
        tuned = _tune(estimator, param_grid, X_train, y_train)
        print(done_message)

        score = tuned.score(X_test, y_test)
        if best_score is None or score > best_score:
            best_model, best_score = tuned, score

    return best_model, best_score


frames = [account,tweet_construction,activity,interactiveness,tweet_source]

selected = []
for counter, frame in enumerate(frames, start=1):
    selected.append(_best_classifier_for(frame))
    print("Iteration %d complete" % counter)

# Unpack in the same order as `frames` into the names later cells rely on.
(
    (account_classifier, account_score),
    (tweet_construction_classifier, tweet_construction_score),
    (activity_classifier, activity_score),
    (interactiveness_classifier, interactiveness_score),
    (tweet_source_classifier, tweet_source_score),
) = selected

Output the best-performing classifier for each feature subset.

In [None]:
# Show the classifier selected for the account feature subset.
account_classifier

In [None]:
# Show the classifier selected for the interactiveness feature subset.
interactiveness_classifier

In [None]:
# Show the classifier selected for the activity feature subset.
activity_classifier

In [None]:
# Show the classifier selected for the tweet-construction feature subset.
tweet_construction_classifier

In [None]:
# Show the classifier selected for the tweet-source feature subset.
tweet_source_classifier

In [None]:
# Test-split score of the classifier chosen for each feature subset.
print(account_score)

# The tweet-construction subset's score is stored as tweet_construction_score;
# no variable named sentiment_score is ever assigned.
print(tweet_construction_score)

print(activity_score)

print(interactiveness_score)

print(tweet_source_score)

Section that uses the feature subset classifiers to create a weighted soft-voting ensemble classifier

In [None]:
import joblib

# Reload the full dataset; the ensemble is trained on every feature column.
data = pd.read_csv('fyp_dataset.csv')

# 'id' is dropped so that it is not used as a feature.
data.pop('id')

y = data.pop('label').values
X = data.values

# Split data into train and test sets, preserving the class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Restore the per-subset classifiers saved by the joblib cell of this notebook.
# Only load files you produced yourself: joblib.load can execute arbitrary code.
account_classifier = joblib.load("joblib//account_classifier.sav")
sentiment_classifier = joblib.load("joblib//sentiment_classifier.sav")
# The ensemble cell refers to this model as tweet_construction_classifier, so
# bind that name too; otherwise a fresh session hits a NameError there.
tweet_construction_classifier = sentiment_classifier
activity_classifier = joblib.load("joblib//activity_classifier.sav")
interactiveness_classifier = joblib.load("joblib//interactiveness_classifier.sav")
# NOTE(review): the save cell writes 'tweet_source_classifier.sav' (no trailing
# '1'); confirm that 'tweet_source_classifier1.sav' is the intended file.
tweet_source_classifier = joblib.load("joblib//tweet_source_classifier1.sav")

In [None]:
from sklearn.ensemble import VotingClassifier

# Named ensemble members, one per feature subset.
estimators = [
    ('account', account_classifier),
    ('sentiment', tweet_construction_classifier),
    ('activity', activity_classifier),
    ('interactiveness', interactiveness_classifier),
    ('source', tweet_source_classifier),
]

# Weighted soft-voting ensemble; the weights follow the order of `estimators`.
ensemble_updated = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=[0.149, 0.095, 0.311, 0.275, 0.17],  # all
)

# Alternative weightings that were tried:
#ensemble_updated = VotingClassifier(estimators, voting='soft', weights=[0.158, 0.036, 0.332, 0.293, 0.181])# without lex
#ensemble_updated = VotingClassifier(estimators, voting='soft', weights=[0.192, 0.044, 0.403, 0.361])# without lex and tweet sources

In [None]:
# Train the ensemble on the training split, then evaluate it on the test split.
# fit() returns the estimator itself, so the two calls can be chained.
ensemble_updated.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Persist the fitted ensemble to disk with joblib.
filename = 'joblib//ensemble_classifier101.sav'
joblib.dump(value=ensemble_updated, filename=filename)

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_validate

# Metrics computed for every cross-validation fold; results are returned
# under the keys 'test_<name>'.
scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'roc_auc': 'roc_auc',
           'f1' : 'f1'}

# Reload the dataset so the evaluation starts from the unsplit data.
data = pd.read_csv('fyp_dataset.csv')

# 'id' is dropped so that it is not used as a feature.
data.pop('id')

y = data.pop('label').values
X = data.values

# 5-fold cross-validation of the ensemble over the whole dataset.
cross_val_scores = cross_validate(ensemble_updated, X, y, cv=5, scoring = scoring)

# One classification report per random 80/20 split.
reports1 = []

for i in range(5):
    # Seed each split with the loop index: a constant seed would reproduce the
    # identical split on every pass.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=i, test_size=1/5)

    fitted_model = ensemble_updated.fit(X_train, y_train)
    y_dash = fitted_model.predict(X_test)

    # NOTE(review): target_names assumes the sorted labels map to Human then
    # Bot (e.g. 0 = human, 1 = bot) — confirm against the dataset.
    reports1.append(classification_report(y_test, y_dash, target_names = ['Human','Bot']))

In [None]:
# Print the mean of each cross-validated metric across the folds, in the order
# accuracy, recall, precision, ROC AUC, F1.
for result_key in ('test_accuracy', 'test_recall', 'test_precision', 'test_roc_auc', 'test_f1'):
    print(cross_val_scores[result_key].mean())

In [None]:
# Print each collected classification report in turn.
for split_report in reports1:
    print(split_report)

Save trained classifiers locally as pickle files

In [None]:
import pickle

# Output file stem -> trained model. The tweet-construction classifier is
# stored under the 'sentiment_classifier' file name.
pickled_models = {
    'account_classifier': account_classifier,
    'sentiment_classifier': tweet_construction_classifier,
    'activity_classifier': activity_classifier,
    'interactiveness_classifier': interactiveness_classifier,
    'tweet_source_classifier': tweet_source_classifier,
    'ensemble_classifier': ensemble_updated,
}

for stem, model in pickled_models.items():
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('pickle//with lex div//' + stem + '.pkl', 'wb') as fid:
        pickle.dump(model, fid)

Save trained classifiers locally as joblib files

In [None]:
import joblib

# Output file stem -> trained model. The tweet-construction classifier is
# stored under the 'sentiment_classifier' file name.
joblib_models = {
    'account_classifier': account_classifier,
    'sentiment_classifier': tweet_construction_classifier,
    'activity_classifier': activity_classifier,
    'interactiveness_classifier': interactiveness_classifier,
    'tweet_source_classifier': tweet_source_classifier,
    'ensemble_classifier': ensemble_updated,
}

# Save each model to disk.
for stem, model in joblib_models.items():
    filename = 'joblib//' + stem + '.sav'
    joblib.dump(model, filename)