In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from xgboost.sklearn import XGBClassifier

from NiaPy.algorithms.basic import GreyWolfOptimizer
from NiaPy.task import Task, StoppingTask, OptimizationType
from NiaPy.benchmarks import Benchmark

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from pprint import pprint
import time

# Directory prefix that is prepended to the CSV file names when the feature tables are loaded.
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere
# without editing this line; consider a path relative to the project.
model = "C:/Users/miska/OneDrive/Dokumenty/pripoc-projekt/model/"

In [2]:
# Read the 'social' and the 'text' feature tables (in that order) from the model directory.
social, text = (
    pd.read_csv(model + csv_name, sep=',')
    for csv_name in ('social_model_all_features.csv', 'text_model_all_features.csv')
)

In [3]:
# The social table already carries the 'label' column, so the copy in the
# text table is removed before the two tables are joined.
text = text.drop(columns=['label'])

# Model with all social and all text features

In [4]:
# Join the social and the text features of each user on the shared 'userId' key.
combined = pd.merge(social, text, on='userId')

In [5]:
# Display the merged frame to inspect the result of the join.
combined

Unnamed: 0,userId,louvain,labelPropagation,triangles,clustCoeff,scc,unionFind,betweeness_out,betweeness_both,closeness,...,http://www.addictinginfo.org,http://www.chicksontheright.com,http://www.ifyouonlynews.com,http://www.opposingviews.com,http://www.proudcons.com,http://www.thepoliticalinsider.com,http://www.yesimright.com,https://goo.gl,https://ihavethetruth.com,https://www.washingtonpost.com
0,1,15169,0,2,333,0,0,0,27,368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,9809,32,292,69,1,0,49276,4786,388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,15169,21,0,0,2,0,0,0,321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,10574,32,10780,31,1,0,315057,55039,393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,15169,9238,0,0,4,0,227,215,296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15252,15253,15169,32,42,166,1,0,150,7001,434,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15253,15254,10574,32,7,106,1,0,5448,280,345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15254,15255,15169,15285,4,400,15254,0,0,5,380,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15255,15256,15169,82,7,194,1,0,802,117,387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Splitting into train and test

In [6]:
# Columns that must not end up in the feature matrix 'X'.
excluded_columns = ['userId', 'label', 'followingList', 'followersList', 'source']
X_columns = combined.columns.difference(excluded_columns)

# Feature matrix and target vector.
X, y = combined[X_columns], combined['label']

# Number of feature columns; reused later as the dimension of the optimisation problem.
number_of_cols = len(X_columns)

# 70 % / 30 % split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train

Unnamed: 0,authors_len,avg_authors_len,avg_numbers_cnt_text,avg_numbers_cnt_title,avg_text_len,avg_text_special,avg_title_len,avg_title_special,avg_wcount_text,avg_wcount_title,...,special_text,special_title,text_len,title_len,triangles,unionFind,uppercount_text,uppercount_title,wordcount_text,wordcount_title
777,6,6.0,45.0,0.0,19923.0,12.0,63.0,0.0,3400.0,11.0,...,12,0,19923,63,235,0,482,12,3400,11
11042,7,7.0,0.0,0.0,2236.0,3.0,52.0,0.0,379.0,8.0,...,3,0,2236,52,16,0,79,8,379,8
12724,2,2.0,1.0,0.0,1024.0,0.0,73.0,0.0,174.0,12.0,...,0,0,1024,73,16,0,45,3,174,12
6138,0,0.0,3.0,0.0,2028.0,2.0,57.0,0.0,378.0,11.0,...,2,0,2028,57,27,0,60,14,378,11
4019,1,1.0,0.0,0.0,1986.0,0.0,70.0,0.0,326.0,12.0,...,0,0,1986,70,570,0,38,12,326,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,0,0.0,9.5,0.0,1436.5,6.0,54.0,0.0,260.5,9.0,...,12,0,2873,108,229,0,115,21,521,18
13418,2,2.0,8.0,0.0,1627.0,1.0,68.0,0.0,261.0,11.0,...,1,0,1627,68,8,0,60,3,261,11
5390,3,3.0,19.0,0.0,3346.0,1.0,60.0,0.0,532.0,9.0,...,1,0,3346,60,0,0,129,9,532,9
860,4,4.0,20.0,0.0,4150.0,7.0,66.0,0.0,688.0,10.0,...,7,0,4150,66,0,0,163,12,688,10


## XGBoost Classifier

In [7]:
# Baseline: an XGBoost classifier with default settings trained on every feature column.
xg = XGBClassifier()
xg.fit(X_train, y_train)
y_pred = xg.predict(X_test)

# Evaluation on the held-out split.
print(classification_report(y_test, y_pred))
print("macro f1-score =", f1_score(y_test, y_pred, average='macro'))
print(confusion_matrix(y_test, y_pred))

# (column name, importance) pairs, highest importance first.
sorted(
    zip(X_columns, xg.feature_importances_),
    key=lambda name_and_importance: name_and_importance[1],
    reverse=True,
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2215
           1       1.00      1.00      1.00      2363

    accuracy                           1.00      4578
   macro avg       1.00      1.00      1.00      4578
weighted avg       1.00      1.00      1.00      4578

macro f1-score = 0.9982508762334465
[[2215    0]
 [   8 2355]]


[('http://politi.co', 0.25043917),
 ('avg_title_len', 0.24002571),
 ('http://abcn.ws', 0.08784225),
 ('http://cnn.it', 0.06295278),
 ('title_len', 0.033833005),
 ('ratio_case_title', 0.03373019),
 ('uppercount_title', 0.027973246),
 ('avg_wcount_text', 0.019595912),
 ('http://conservativetribune.com', 0.015160506),
 ('communityCentroidLabel_scc', 0.01496413),
 ('http://rightwingnews.com', 0.014865936),
 ('http://addictinginfo.org', 0.014710754),
 ('http://www.addictinginfo.org', 0.013784424),
 ('http://www.opposingviews.com', 0.013694886),
 ('http://www.thepoliticalinsider.com', 0.012159074),
 ('http://occupydemocrats.com', 0.011781814),
 ('http://allenwestrepublic.com', 0.010868457),
 ('lowercount_title', 0.0104935365),
 ('avg_authors_len', 0.009561247),
 ('avg_text_special', 0.008063219),
 ('http://www.ifyouonlynews.com', 0.007911399),
 ('http://author.groopspeak.com', 0.007327153),
 ('http://freedomdaily.com', 0.007308717),
 ('lowercount_text', 0.0060934406),
 ('ratio_following', 0.

# Applying GWO to the classification problem

In [8]:
# Number of feature columns in X_columns; passed to the optimisation task as its dimension D.
number_of_cols

88

In [9]:
def get_scores_under_threshold(values, thr):
    """Return the positions of all entries in ``values`` that are strictly below ``thr``.

    Args:
        values: iterable of comparable scores.
        thr: threshold; an entry equal to ``thr`` is NOT selected.

    Returns:
        list of int positions, in ascending order.
    """
    return [position for position, score in enumerate(values) if score < thr]

In [10]:
class FeatureSelectionBenchmark(Benchmark):
    """Benchmark whose fitness is the classification error of an XGBoost model on a feature subset.

    A candidate solution is a vector of values within [Lower, Upper] = [0, 1].
    Position ``i`` of the vector refers to column ``i`` of ``X_train`` / ``X_test``;
    a column is kept when its value is strictly below ``threshold``.

    The class reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """

    def __init__(self, threshold):
        """Store the selection threshold and register the search bounds 0..1."""
        self.Lower = 0
        self.Upper = 1
        self.threshold = threshold
        super().__init__(self.Lower, self.Upper)

    def function(self):
        """Return the fitness callable ``evaluate(D, solution)`` (lower is better)."""
        def evaluate(D, solution):
            keep_index = get_scores_under_threshold(solution, self.threshold)

            X_train_new = X_train.iloc[:, keep_index]
            X_test_new = X_test.iloc[:, keep_index]

            if X_train_new.shape[1] > 0:  # at least one feature was selected
                classifier = XGBClassifier()
                classifier.fit(X_train_new, y_train)
                y_pred = classifier.predict(X_test_new)
                # TODO: improve the fitness following the binary GWO paper
                # NOTE(review): the score is computed on the test split, so the
                # selected subset is tuned against the same data it is reported on.
                fitness = f1_score(y_test, y_pred, average='macro')
                # The optimiser minimises, hence the inverted macro F1.
                inverted_fitness = (1 - fitness)
                return inverted_fitness
            else:
                # No feature selected: return the worst possible fitness.
                # float("inf") is used because the 'math' module is not imported.
                return float("inf")
        return evaluate

In [11]:
class FeatureSelectionOptimizer():
    """Repeats a Grey Wolf Optimizer search and keeps the best feature subset found.

    Args:
        benchmark: benchmark class (or factory) called as ``benchmark(threshold)``
            to build the fitness definition for every run.
        threshold: a feature is selected when its value in the solution vector
            is strictly below this number.
        num_iter: number of independent optimiser runs.
        num_eval: maximum number of fitness evaluations per run (nFES).
    """

    def __init__(self, benchmark = FeatureSelectionBenchmark, threshold = 0.5, num_iter=5, num_eval=10):
        self.benchmark = benchmark
        self.threshold = threshold
        self.num_iter = num_iter
        self.num_eval = num_eval

    def run(self):
        """Execute ``num_iter`` optimiser runs and report the best one.

        Reads the notebook-level ``number_of_cols`` and ``X_train``.

        Returns:
            tuple ``(best_f1, number_of_selected_columns, selected_column_names, elapsed_seconds)``.
            When no run scores above 0 the column list is empty and ``best_f1`` is 0.
        """
        # D (int): Dimension of the problem. - should be the number of features
        # nFES (int): Maximum number of function evaluations.
        # NP : population size
        # using inverted fitness function, because optType=OptimizationType.MAXIMIZATION is not working

        start = time.time()

        best_scores = []
        best_f1 = 0

        for i in range(self.num_iter):
            # Build the benchmark from the configured class instead of a
            # hard-coded one, so the 'benchmark' constructor argument is honoured.
            task = StoppingTask(D=number_of_cols, nFES=self.num_eval,
                                optType=OptimizationType.MINIMIZATION,
                                benchmark=self.benchmark(self.threshold))
            algo = GreyWolfOptimizer(NP=40)
            # The optimiser minimises (1 - F1); convert back to the real F1.
            scores, inverted_f1 = algo.run(task)
            f1 = (1 - inverted_f1)
            selected = get_scores_under_threshold(scores, self.threshold)
            print(i, '. run =', len(selected), 'features -', f1)

            if f1 > best_f1:
                best_f1 = f1
                best_scores = scores

        end = time.time()

        indexes = get_scores_under_threshold(best_scores, self.threshold)
        names_columns = list(X_train.iloc[:, indexes].columns.values)
        return best_f1, len(names_columns), names_columns, end - start

## Default number of runs

In [12]:
# Optimiser with its default settings (threshold=0.5, num_iter=5, num_eval=10).
opti = FeatureSelectionOptimizer()
# run() returns: best score, number of selected columns, their names, elapsed seconds.
f1, number_columns, names_columns, exe_time = opti.run()
print('---------------------------------------')
print('BEST =', number_columns, 'FEATURES - ', f1)
print('executed time =', exe_time, 'sec')
print('---------------------------------------')
# Names of the columns selected by the best run.
pprint(names_columns)

0 . run = 35 features - 0.9980322093543559
1 . run = 47 features - 0.9982508762334465
2 . run = 49 features - 0.9980319912945635
3 . run = 41 features - 0.9982507813310453
4 . run = 48 features - 0.9982508291176966
---------------------------------------
BEST = 47 FEATURES -  0.9982508762334465
executed time = 68.38285565376282 sec
---------------------------------------
['authors_len',
 'avg_numbers_cnt_text',
 'avg_text_len',
 'avg_title_len',
 'avg_wcount_text',
 'avg_wcount_title',
 'avg_word_len_text',
 'avg_word_len_title',
 'avg_word_title',
 'betweeness_both',
 'closeness',
 'communityCentroidLabel_labelPropagation',
 'communityCentroidLabel_louvain',
 'communityCentroidLabel_scc',
 'communityCentroidLabel_unionFind',
 'connectFollowersFakeCount',
 'connectFollowingFakeCount',
 'eigenVector',
 'followSum',
 'followers_quantile',
 'following',
 'following_quantile',
 'http://100percentfedup.com',
 'http://abcn.ws',
 'http://clashdaily.com',
 'http://conservativetribune.com',
 'h

## More runs

In [13]:
# Same experiment with more optimiser runs (10) and more fitness evaluations per run (20).
opti = FeatureSelectionOptimizer(num_iter=10, num_eval=20)
# run() returns: best score, number of selected columns, their names, elapsed seconds.
f1, number_columns, names_columns, exe_time = opti.run()
print('---------------------------------------')
print('BEST =', number_columns, 'FEATURES - ', f1)
print('executed time =', exe_time, 'sec')
print('---------------------------------------')
# Names of the columns selected by the best run.
pprint(names_columns)

0 . run = 50 features - 0.9978134766638067
1 . run = 55 features - 0.9982508291176966
2 . run = 51 features - 0.9980321559717704
3 . run = 45 features - 0.9975947911308842
4 . run = 49 features - 0.9980322093543559
5 . run = 52 features - 0.9984694546447104
6 . run = 44 features - 0.9982507813310453
7 . run = 51 features - 0.9982508291176966
8 . run = 54 features - 0.9982507328733833
9 . run = 45 features - 0.9986880496550374
---------------------------------------
BEST = 45 FEATURES -  0.9986880496550374
executed time = 295.2545795440674 sec
---------------------------------------
['avg_authors_len',
 'avg_numbers_cnt_title',
 'avg_title_len',
 'avg_word_title',
 'closeness',
 'clustCoeff',
 'communityCentroidLabel_cnm',
 'communityCentroidLabel_labelPropagation',
 'communityCentroidLabel_scc',
 'connectFollowingFakeCount',
 'connectFollowingRealCount',
 'eigenVector',
 'followSum',
 'followers_quantile',
 'http://100percentfedup.com',
 'http://abcn.ws',
 'http://author.addictinginfo.