In [13]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
    
#tipo = 0: only the RFD columns (plus the class label)
#tipo = 1: native dataset columns + RFD columns
#tipo = 2: the whole original dataset (native columns only, no RFD columns)
def initialize_data(dataset, tipo, threshold, percentage = 100, dataset_num_column = 19):
    """Load one feature-selected CSV and return a train/test split.

    The file ``DatasetAfterFeatureSelection/<dataset>`` is read as a
    ``;``-separated, ISO-8859-1 encoded CSV and missing values are replaced
    with 1. The first ``dataset_num_column`` columns are the native dataset
    columns (they must contain ``class``); every column after them is an
    extra (RFD) column. The last row of the file is not a sample: its values
    are used to rank and filter the extra columns, and the row is dropped
    before the labels are encoded.

    Parameters
    ----------
    dataset : str
        File name inside ``DatasetAfterFeatureSelection/``.
    tipo : int
        0 -> extra columns only, 1 -> native + extra columns,
        any other value -> native columns only.
    threshold : float
        An extra column is kept only if its last-row value is <= threshold.
    percentage : int or float, default 100
        Share of the extra columns (lowest last-row values first) that is
        considered before the threshold filter is applied.
    dataset_num_column : int, default 19
        Number of leading native columns.

    Returns
    -------
    x_train, x_test, y_train, y_test
        Result of ``train_test_split`` with ``test_size=0.2`` and
        ``random_state=42``; the labels are integer-encoded with
        ``LabelEncoder``.
    """
    train_path = 'DatasetAfterFeatureSelection/'+dataset

    # Read the dataset
    train_df = pd.read_csv(train_path,sep=';',encoding = "ISO-8859-1")
    train_df = train_df.fillna(1)

    print(train_df.head())
    print(train_df["class"])
    print("Dataset",dataset,"Letto!")

    # Position (and, with the default RangeIndex, label) of the ranking row.
    score_row = len(train_df)-1

    native_columns = train_df.iloc[: , :dataset_num_column]
    # Slice from the left boundary: a negative-offset slice would turn into
    # "-0:" (i.e. every column) when the file contains no extra columns.
    last_n_columns = train_df.iloc[: , dataset_num_column:]
    print("Selection of the last {} columns!".format(dataset_num_column))

    # Order the extra columns by their value in the ranking row (ascending).
    if last_n_columns.shape[1] > 0:
        last_n_columns = last_n_columns.sort_values(by = score_row, axis = 1)
    first_selected_columns = last_n_columns.iloc[: , :(int((len(last_n_columns.axes[1])*percentage)/100))]
    print("Sorting and selecting first column!")

    # Because the columns are sorted ascending, every column located before
    # the first one whose score exceeds the threshold is within the threshold.
    # The number of columns to keep is therefore exactly that position
    # (0 when even the first column is above the threshold, all of them when
    # none is).
    score_values = first_selected_columns.iloc[score_row].tolist()
    number_of_columns_to_select = next(
        (position for position, score in enumerate(score_values) if score > threshold),
        len(score_values),
    )
    print("number_of_columns_to_select:",number_of_columns_to_select)
    final_selected_columns = first_selected_columns.iloc[: , :number_of_columns_to_select]

    # Assemble the feature table requested by `tipo` (always with "class").
    if tipo == 1:
        train_df = pd.concat([native_columns, final_selected_columns], axis=1)
    elif tipo == 0:
        train_df = pd.concat([final_selected_columns, native_columns['class']], axis=1)
    else:
        train_df = native_columns

    # Drop the ranking row so it is never used as a training sample.
    train_df = train_df.iloc[:-1 , :]
    print("Numero colonne: ", train_df.shape[1])

    label_encoder = preprocessing.LabelEncoder()
    Y = label_encoder.fit_transform(train_df["class"])
    train_df = train_df.drop(columns="class")

    x_train, x_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.2, random_state=42)
    return x_train, x_test, y_train, y_test

def knn_prediction():
    """Fit a k-nearest-neighbours classifier and evaluate it on the test split.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables. Prints the accuracy, the classification report and
    the confusion matrix.

    Returns
    -------
    tuple
        ``(accuracy, report_dict, confusion_matrix_dataframe)``.
    """
    # NOTE(review): `p` is the Minkowski power parameter; with
    # metric='cosine' it presumably has no effect - confirm.
    classifier = KNeighborsClassifier(
        n_neighbors=11,
        p=1,
        weights='distance',
        metric='cosine',
    )
    predictions = classifier.fit(x_train, y_train).predict(x_test)

    accuracy_value = accuracy_score(y_test, predictions)
    print("Accuracy for kNN on Test data: ",accuracy_value)

    report = classification_report(y_test, predictions, output_dict=True)
    print(report)

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    return accuracy_value, report, pd.DataFrame(matrix)

def RF_prediction(comparison=4):
    """Fit a random forest with fixed hyperparameters and evaluate it.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables. Prints the accuracy, the classification report and
    the confusion matrix.

    Parameters
    ----------
    comparison : int, default 4
        Selects ``max_features``: 1 -> 1, 2 -> 2, any other value -> 3.

    Returns
    -------
    tuple
        ``(accuracy, report_dict, confusion_matrix_dataframe)``.
    """
    # Only 1 and 2 are passed through; everything else falls back to 3.
    max_features = {1: 1, 2: 2}.get(comparison, 3)

    forest_params = {
        'random_state': 42,
        'bootstrap': True,
        'criterion': 'entropy',
        'max_depth': 20,
        'max_features': max_features,
        'min_samples_leaf': 1,
        'min_samples_split': 10,
        'n_estimators': 500,
    }
    forest = RandomForestClassifier(**forest_params)

    # Train on the current split and predict the held-out samples.
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)

    accuracy_value = accuracy_score(y_test, predictions)
    print("Accuracy for RF on Test data: ",accuracy_value)

    report = classification_report(y_test, predictions, output_dict=True)
    print(report)

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    return accuracy_value, report, pd.DataFrame(matrix)

def DT_prediction():
    """Fit a decision tree with fixed hyperparameters and evaluate it.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables. Prints the accuracy, the classification report and
    the confusion matrix.

    Returns
    -------
    tuple
        ``(accuracy, report_dict, confusion_matrix_dataframe)``.
    """
    tree_params = {
        'random_state': 42,
        'max_leaf_nodes': 3,
        'min_samples_split': 2,
        'max_depth': 10,
        'min_samples_leaf': 1,
    }
    tree = DecisionTreeClassifier(**tree_params)
    predictions = tree.fit(x_train, y_train).predict(x_test)

    accuracy_value = accuracy_score(y_test, predictions)
    print("Accuracy for Decision Tree on Test data: ",accuracy_value)

    report = classification_report(y_test, predictions, output_dict=True)
    print(report)

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    return accuracy_value, report, pd.DataFrame(matrix)

# def GNB_prediction():
#     nb = GaussianNB(var_smoothing=1e-09)
#     nb.fit(x_train, y_train)

#     y_pred = nb.predict(x_test)
    
#     accuracy_value = accuracy_score(y_test, y_pred)
#     print("Accuracy for Gaussian Naive Bayes on Test data: ",accuracy_value)

#     report = classification_report(y_test, y_pred, output_dict=True)
#     print(report)

#     conf_matrix = confusion_matrix(y_test, y_pred)
#     print(conf_matrix)
#     df_cm = pd.DataFrame(conf_matrix)
    
#     return accuracy_value, report, df_cm

def SVC_prediction():
    """Fit an RBF-kernel support vector classifier and evaluate it.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables. Prints the accuracy, the classification report and
    the confusion matrix.

    Returns
    -------
    tuple
        ``(accuracy, report_dict, confusion_matrix_dataframe)``.
    """
    classifier = SVC(C=1000, gamma=0.01, kernel='rbf')
    predictions = classifier.fit(x_train, y_train).predict(x_test)

    accuracy_value = accuracy_score(y_test, predictions)
    print("Accuracy for SVC on Test data: ",accuracy_value)

    report = classification_report(y_test, predictions, output_dict=True)
    print(report)

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    return accuracy_value, report, pd.DataFrame(matrix)

# def LR_prediction():
#     lr = LogisticRegression(solver='liblinear', C = 10, penalty = 'l1')
#     lr.fit(x_train, y_train)

#     y_pred = lr.predict(x_test)

#     accuracy_value = accuracy_score(y_test, y_pred)
#     print("Accuracy for Logistic Regression on Test data: ",accuracy_value)

#     report = classification_report(y_test, y_pred, output_dict=True)
#     print(report)

#     conf_matrix = confusion_matrix(y_test, y_pred)
#     print(conf_matrix)
#     df_cm = pd.DataFrame(conf_matrix)
    
#     return accuracy_value, report, df_cm

# def knn_prediction():
#     knn = KNeighborsClassifier(n_neighbors = 3, p=1, weights='distance')
#     # Fit the classifier to the data
#     knn.fit(x_train,y_train)
#     y_pred = knn.predict(x_test)
#     accuracy_value = accuracy_score(y_test, y_pred)
#     print("Accuracy for kNN on Test data: ",accuracy_value)
#     report = classification_report(y_test, y_pred, output_dict=True)
#     print(report)

#     conf_matrix = confusion_matrix(y_test, y_pred)
#     print(conf_matrix)
#     df_cm = pd.DataFrame(conf_matrix)
    
#     return accuracy_value, report, df_cm

def draw_confusion(df_cm, title, filename):
    """Save a confusion matrix as an annotated heatmap PDF.

    Parameters
    ----------
    df_cm : pandas.DataFrame
        Confusion matrix to plot; each cell is annotated with its value
        formatted without decimals.
    title : str
        Figure title.
    filename : str
        Output path prefix; the figure is written to
        ``"<filename>-confusion.pdf"``.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        sns.heatmap(df_cm, annot=True, fmt=".0f", ax=ax)
        ax.set_title(title)
        fig.savefig("{}-confusion.pdf".format(filename))
        #plt.show()
    finally:
        # pyplot keeps every figure alive until it is closed; clearing it is
        # not enough, so close it to avoid piling up figures when this is
        # called once per model/threshold.
        plt.close(fig)

# 'KNN Confusion Matrix Correlation-Coefficient'

In [14]:
# Earlier runs used short file names such as "rfds_0_with_chi.csv",
# "rfds_1_with_chi.csv", "rfds_2_with_chi.csv", "rfds_4_with_chi.csv",
# "rfds_8_with_chi.csv", "rfds_12_with_chi.csv".
import os

# os.listdir returns entries in arbitrary order and may include non-dataset
# entries (e.g. ".ipynb_checkpoints"); sort for reproducible runs and keep
# only CSV files so nothing else is handed to pd.read_csv.
full_datasets = sorted(
    entry for entry in os.listdir('DatasetAfterFeatureSelection/')
    if entry.lower().endswith('.csv')
)

dataframes_results = []
results_dict = {}


def _rf_prediction_for_current_split():
    """Run RF_prediction, capping max_features when fewer than 3 features exist."""
    n_features = x_train.shape[1]
    if n_features < 3:
        return RF_prediction(n_features)
    return RF_prediction()


# (label, callable) pairs evaluated for every dataset/threshold combination.
# Each callable reads the module-level x_train/x_test/y_train/y_test and
# returns (accuracy, report_dict, confusion_matrix_dataframe).
# The GNB and LR models are currently disabled.
model_runners = [
    ('RF', _rf_prediction_for_current_split),
    ('DT', DT_prediction),
    ('SVC', SVC_prediction),
    ('KNN', knn_prediction),
]

for dataset_name in full_datasets:
    tipo = 1
    print(dataset_name+'\n')
    # NOTE(review): for names like "rfds_4_with_chi.csv" the second token is
    # the numeric comparison threshold; for longer names such as
    # "user_fake_authentic_..._threshold_4_..." it is not - confirm which
    # token is intended.
    if tipo != 2:
        comparison_thr = dataset_name.split("_")[1]
    else:
        comparison_thr = -1

    for thr in [0.1, 0.2, 0.3, 0.4, 0.5]:
        x_train, x_test, y_train, y_test = initialize_data(dataset_name, tipo, thr)

        for model_name, run_model in model_runners:
            accuracy_value, report, df_cm = run_model()
            #draw_confusion(df_cm, '{} Confusion Matrix Correlation-Coefficient'.format(model_name), "{}_thr_{}".format(model_name, thr))

            result_key = '{}_{}_{}_{}'.format(model_name, dataset_name, str(thr).replace(".","-"), tipo)
            results_dict[result_key] = report

            # Annotate the report in place with the run configuration.
            report['dataset'] = dataset_name
            report['comparison_thr'] = comparison_thr
            report['model'] = model_name
            report['thr'] = thr
            report['tipo'] = tipo
            report['accuracy_value'] = accuracy_value

            # df_cm[c][r] selects column c (predicted label) and then row r
            # (true label) of the confusion matrix.
            # NOTE(review): this naming treats encoded class 0 as the
            # positive class - confirm against the label encoding.
            report['TP'] = df_cm[0][0]
            report['FP'] = df_cm[0][1]
            report['FN'] = df_cm[1][0]
            report['TN'] = df_cm[1][1]

user_fake_authentic_2class_50000_threshold_4_EXTENDED_0.3_with_chi.csv

   Attr0  Attr1   Attr2  Attr3  Attr4  Attr5  Attr6     Attr7  Attr8  \
0  173.0  602.0  2300.0   20.0    1.0    0.0  133.0  0.500000  0.222   
1    0.0  384.0  2000.0    0.0    1.0    0.0    0.0  0.000000  0.000   
2   34.0  125.0   359.0    0.0    1.0    0.0   15.0  0.388889  0.056   
3   48.0  383.0   732.0    0.0    1.0    0.0   50.0  0.166667  0.111   
4    8.0   71.0  7500.0   21.0    1.0    0.0    4.0  0.750000  0.375   

       Attr9  ...    2926    2927    2928    2929    2930    2931    2932  \
0   1.280000  ...  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000   
1   0.000000  ...  0.8796  0.9810  0.8392  0.9144  0.9092  0.9202  0.9096   
2  16.270000  ...  0.9726  0.9954  0.9772  0.9998  0.9952  0.9998  0.9968   
3   5.270000  ...  0.9822  0.9986  0.9884  1.0000  0.9924  0.9998  0.9954   
4  38.560001  ...  0.8700  0.9646  0.8730  1.0000  0.9958  1.0000  0.9964   

     2933    2934    2935  
0  1


KeyboardInterrupt



In [None]:
# Display the collected reports; keys have the form "<model>_<dataset>_<thr>_<tipo>".
results_dict

In [None]:
# Flatten every (possibly nested) report dict into its own DataFrame,
# in the insertion order of results_dict.
dfs = [pd.json_normalize(report) for report in results_dict.values()]

In [None]:
# Stack the per-run frames row-wise into a single results table and display it.
# The original index of each frame is kept (no ignore_index), so row labels may repeat.
dftmp = pd.concat(dfs)
dftmp

In [None]:
# Export the results table as a ';'-separated CSV. The commented-out lines are
# alternative output file names used for other experiment configurations.
# NOTE(review): the active file name says "tipo2" while the experiment loop in
# this notebook sets tipo = 1 - confirm the name matches the run that produced dftmp.
# NOTE(review): index=None is falsy, so the index column is presumably omitted;
# index=False is the documented spelling - confirm.
# dftmp.to_csv("Results_instagram_bilanciato_RFD+9_tipo1.csv",index=None,sep=";")
# dftmp.to_csv("Results_instagram_non_bilanciato_RFD+9_tipo0.csv",index=None,sep=";")
# dftmp.to_csv("Results_instagram_non_bilanciato_tipo2.csv",index=None,sep=";")
dftmp.to_csv("Results_instagram_bilanciato_tipo2.csv",index=None,sep=";")