In [1]:
import sys
import os

# Add the project's Code directory to sys.path so the local modules imported below can be found
sys.path.append(os.path.abspath("../Code"))

# Main libraries
from import_data import ImportData
from evaluation import Evaluate 
from feature_selection import FeatureSelection
from models_test import ModelTester

## MODELS PARAMETERS

The following parameters are the default values used for each model. They can be optimized before a prediction using cross-validation;  
otherwise, they can be modified here and tested on each model.

In [2]:
# Hyper-parameter template, keyed by model name. The entry for a model is
# handed to `change_model_parameters` when MODEL_P is not None.
models_parametres = {
    # Decision tree
    "decision_tree": {
        "ccp_alpha": 0.0,
        "class_weight": None,
        "criterion": "entropy",
        "max_depth": 20,
        "max_features": None,
        "max_leaf_nodes": None,
        "min_impurity_decrease": 0.0,
        "min_samples_leaf": 1,
        "min_samples_split": 10,
        "min_weight_fraction_leaf": 0.0,
        "random_state": None,
        "splitter": "random",
    },
    # k-nearest neighbours
    "knn": {
        "algorithm": "auto",
        "leaf_size": 10,
        "metric": "minkowski",
        "metric_params": None,
        "n_jobs": None,
        "n_neighbors": 3,
        "p": 1,
        "weights": "uniform",
    },
    # Logistic regression
    "logistic_regression": {
        "C": 0.001,
        "class_weight": None,
        "dual": False,
        "fit_intercept": True,
        "intercept_scaling": 1,
        "l1_ratio": None,
        "max_iter": 50,
        "multi_class": "auto",
        "n_jobs": None,
        "penalty": "l2",
        "random_state": None,
        "solver": "newton-cg",
        "tol": 0.0001,
        "verbose": 0,
        "warm_start": False,
    },
    # Support vector machine
    "svm": {
        "C": 1000,
        "break_ties": False,
        "cache_size": 200,
        "class_weight": "balanced",
        "coef0": 0.1,
        "decision_function_shape": "ovr",
        "degree": 2,
        "gamma": "scale",
        "kernel": "poly",
        "max_iter": -1,
        "probability": True,
        "random_state": None,
        "shrinking": True,
        "tol": 0.001,
        "verbose": False,
    },
}

## Models Test

In [4]:
import pandas as pd

def save_results(
    model_parametres,
    test_metrics,
    val_metrics,
    DATASET,
    BOT_FOLDERS,
    BOT_RATIO,
    MERGED_DATASET,
    TYPE_SELECTION,
    TRAIN_RATE,
    TEST_RATE,
    VAL_RATE,
    MODEL,
    FEATURES,
    output_dir="../Outputs",
):
    """Append one experiment record to ``<output_dir>/<MODEL>_results.csv``.

    The record is a single row made of the run configuration, the test
    metrics (columns prefixed with ``test_``), the validation metrics
    (prefixed with ``val_``) and the model parameters (unprefixed).

    If the CSV file already exists, its rows are loaded, the new row is
    appended, exact duplicate rows are dropped (first occurrence kept) and
    the file is rewritten. Otherwise a new file holding only this row is
    created.

    Parameters
    ----------
    model_parametres : dict
        Parameter name -> value; each key becomes a column of its own.
    test_metrics, val_metrics : dict
        Metric name -> value for the test and validation splits.
    DATASET, MERGED_DATASET, TYPE_SELECTION, TRAIN_RATE, TEST_RATE,
    VAL_RATE, MODEL, FEATURES
        Run configuration values, stored as-is. ``MODEL`` also determines the
        CSV file name.
    BOT_FOLDERS, BOT_RATIO
        Run configuration values, stored as their ``str()`` representation.
    output_dir : str, optional
        Directory that holds the result files. It is created if missing.
        Defaults to ``"../Outputs"``.

    Returns
    -------
    pandas.DataFrame
        A one-row frame containing only the record of this call.
    """
    # Flatten everything into one row; metric dictionaries get a prefix so the
    # test and validation columns cannot collide.
    record = {
        "DATASET": DATASET,
        "BOT_FOLDERS": str(BOT_FOLDERS),
        "BOT_RATIO": str(BOT_RATIO),
        "MERGED_DATASET": MERGED_DATASET,
        "TYPE_SELECTION": TYPE_SELECTION,
        "TRAIN_RATE": TRAIN_RATE,
        "TEST_RATE": TEST_RATE,
        "VAL_RATE": VAL_RATE,
        "MODEL": MODEL,
        "FEATURES": FEATURES,
        **{f"test_{k}": v for k, v in test_metrics.items()},
        **{f"val_{k}": v for k, v in val_metrics.items()},
        **model_parametres,
    }
    df = pd.DataFrame([record])

    # Build the destination once so the existence check, the read and the
    # write all refer to the same file. Checking the bare file name in the
    # working directory while writing into the output directory would make
    # every call overwrite the previously stored results.
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, f"{MODEL}_results.csv")

    if os.path.exists(csv_path):
        # Merge with the history and drop rows that are exact repeats.
        existing_df = pd.read_csv(csv_path)
        updated_df = pd.concat([existing_df, df], ignore_index=True)
        updated_df = updated_df.drop_duplicates(keep="first")
    else:
        updated_df = df

    updated_df.to_csv(csv_path, index=False)

    return df

In [7]:
# Run configuration: every setting of the experiment is chosen here.
DATASET = "cresci-2017"
BOT_FOLDERS = [1, 1, 1]        # May differ between cresci_2015 and cresci_2017
BOT_RATIO = [0.35, 0.65]       # Non-bot to bot
MERGED_DATASET = True          # The merged dataset uses user info plus tweets
TYPE_SELECTION = "correlation"

# Split proportions
TRAIN_RATE = 0.7
TEST_RATE = 0.15
VAL_RATE = 0.15

MODEL = "all"
FEATURES = None                # None tests all features; otherwise enter a number of features
MODEL_P = None                 # Only use the template provided above to modify parameters for a test
GRID_SEARCH = False

In [8]:
# Import the data
importer = ImportData()
data = importer.read_and_sample_data(
    dataset=DATASET,
    type_data_merged=MERGED_DATASET,
    bot_ratio=BOT_RATIO,
    bot_fldr_ratio=BOT_FOLDERS,
)

# Do a selection of features
selection = FeatureSelection(data)
list_features = selection.select_features(type_selection=TYPE_SELECTION)

# Create the splits (proportions are passed in train, test, validation order)
SPLIT_RATES = [TRAIN_RATE, TEST_RATE, VAL_RATE]
splits = importer.split_dataset(data=data, proportions=SPLIT_RATES)

# Test Model
test_enviroment = ModelTester(splits, list_features)

# MODEL == 'all' runs every model registered in the tester; any other value
# is treated as the name of a single model. Both cases share the loop below.
run_all_models = MODEL == 'all'
model_names = list(test_enviroment.models.keys()) if run_all_models else [MODEL]

# One-row result DataFrame per model name. Always defined, so later cells can
# read `results[...]` whether one model or all models were run.
results = {}

for model in model_names:
    if MODEL_P is not None:
        # Apply the manual parameters from the template dictionary.
        # NOTE(review): the values come from `models_parametres`, MODEL_P only
        # acts as a switch here - confirm this is the intended behaviour.
        test_enviroment.change_model_parameters(
            model_name=model,
            new_params=models_parametres[model],
        )
    elif GRID_SEARCH:
        # No manual parameters: optimize them before the prediction.
        test_enviroment.grid_search(model_name=model, num_features=FEATURES)

    # Get the current parameters used for the prediction
    model_parametres = test_enviroment.models[model].get_params()

    # Create a prediction
    predictions = test_enviroment.predict_model(model_name=model, num_features=FEATURES)

    # Evaluate the predictions for the validation dataset
    val_evaluation = Evaluate(
        true_values=splits['y_val'],
        predicted_values=predictions['val_predictions'],
        predicted_probabilities=predictions['val_probabilities'],
    )
    val_metrics = val_evaluation.get_all_metrics()

    # Evaluate the predictions for the test dataset
    test_evaluation = Evaluate(
        true_values=splits['y_test'],
        predicted_values=predictions['test_predictions'],
        predicted_probabilities=predictions['test_probabilities'],
    )
    test_metrics = test_evaluation.get_all_metrics()

    # Progress marker, only when several models are processed
    if run_all_models:
        print(model)

    # Save the result
    results[model] = save_results(
        model_parametres=model_parametres,
        test_metrics=test_metrics,
        val_metrics=val_metrics,
        DATASET=DATASET,
        BOT_FOLDERS=BOT_FOLDERS,
        BOT_RATIO=BOT_RATIO,
        MERGED_DATASET=MERGED_DATASET,
        TYPE_SELECTION=TYPE_SELECTION,
        TRAIN_RATE=TRAIN_RATE,
        TEST_RATE=TEST_RATE,
        VAL_RATE=VAL_RATE,
        MODEL=model,
        FEATURES=FEATURES,
    )

# Single-model runs keep exposing the result row as `df`.
if not run_all_models:
    df = results[MODEL]




decision_tree


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


knn
logistic_regression




svm


In [9]:
# Lift the column display limit so every column of the result row is rendered
pd.set_option("display.max_columns", None)
results["svm"]

Unnamed: 0,DATASET,BOT_FOLDERS,BOT_RATIO,MERGED_DATASET,TYPE_SELECTION,TRAIN_RATE,TEST_RATE,VAL_RATE,MODEL,FEATURES,test_Accuracy,test_TN Rate,test_FP Rate,test_FN Rate,test_TP Rate,test_Precision,test_Recall,test_F1 Score,test_MCC,test_AUC,val_Accuracy,val_TN Rate,val_FP Rate,val_FN Rate,val_TP Rate,val_Precision,val_Recall,val_F1 Score,val_MCC,val_AUC,C,break_ties,cache_size,class_weight,coef0,decision_function_shape,degree,gamma,kernel,max_iter,probability,random_state,shrinking,tol,verbose
0,cresci-2017,"[1, 1, 1]","[0.35, 0.65]",True,correlation,0.7,0.15,0.15,svm,,0.967672,0.334052,0.015086,0.017241,0.633621,0.976744,0.97351,0.975124,0.928981,0.992131,0.963441,0.335484,0.015054,0.021505,0.627957,0.976589,0.966887,0.971714,0.920137,0.984297,1000,False,200,balanced,0.1,ovr,2,scale,poly,-1,True,,True,0.001,False
