# Porovnanie klasifikátorov strojového učenia

In [1]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

### Funkcia na výpočet priemeru

In [6]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    The divisor is clamped to a minimum of 1, so an empty input yields
    ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        numbers: A sized collection of numbers (it is passed to both
            ``sum`` and ``len``).

    Returns:
        float: ``sum(numbers) / len(numbers)``, or ``0.0`` when ``numbers``
        is empty.
    """
    total = float(sum(numbers))
    count = len(numbers)
    # Clamp the divisor so that an empty input divides by 1 rather than 0.
    divisor = count if count >= 1 else 1
    return total / divisor

### Funkcia, ktorá trénuje a testuje algoritmy strojového učenia

In [2]:
def run(k, labels, classifier):
    """Train and score ``classifier`` on ``k`` pre-split train/test CSV pairs.

    For every fold index ``j`` in ``range(k)`` this reads
    ``norm_sets/train/train_data<j>.csv`` and
    ``norm_sets/test/test_data<j>.csv``, fits ``classifier`` on the training
    rows and scores its predictions on the test rows. The same ``classifier``
    object is fitted again for each fold.

    Args:
        k: Number of train/test file pairs to process.
        labels: Column names used as the input features.
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        dict: Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1_score'``, each mapped to the mean of that metric over the
        ``k`` folds.
    """
    target_column = 'hateLabel'
    # (result key, scoring function) pairs, in the order they are evaluated.
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    per_fold = {key: [] for key, _ in scorers}

    for fold in range(k):
        train_frame = pd.read_csv('norm_sets/train/train_data%d.csv' % fold)
        test_frame = pd.read_csv('norm_sets/test/test_data%d.csv' % fold)

        # Select every column up front so a missing column fails before fitting.
        train_features = train_frame[labels]
        test_features = test_frame[labels]
        train_target = train_frame[target_column]
        expected = list(test_frame[target_column])

        classifier.fit(train_features, train_target)
        predicted = list(classifier.predict(test_features))

        for key, scorer in scorers:
            per_fold[key].append(scorer(expected, predicted))

    return {key: mean(per_fold[key]) for key, _ in scorers}

### Funkcia na otestovanie všetkých algoritmov

In [8]:
def test_all_classifiers(k, labels, random_state=None):
    """Evaluate a fixed set of scikit-learn classifiers and print the results.

    A header line is printed first, followed by one comma-separated line per
    classifier (sorted by its short key) holding the mean accuracy,
    precision, recall and F1 score returned by ``run``.

    Args:
        k: Number of train/test file pairs, forwarded to ``run``.
        labels: Feature column names, forwarded to ``run``.
        random_state: Seed (or ``RandomState`` instance) forwarded to every
            classifier built here that accepts a ``random_state`` argument.
            The default ``None`` keeps each estimator's own default, so
            results of the randomized estimators may differ between runs;
            pass an integer to make the comparison reproducible.

    Returns:
        None. Results are written to standard output only.
    """
    print('classifier,accuracy,precision,recall,f1\n')

    # NOTE: an 'abc' entry (AdaBoostClassifier boosting an ExtraTreesClassifier)
    # used to be commented out here; it is not part of the comparison.
    classifiers = {
        'dc': DummyClassifier(random_state=random_state),
        'rfc': RandomForestClassifier(
            n_estimators=40,
            n_jobs=-1,
            max_features='log2',
            random_state=random_state),
        'bc': BaggingClassifier(n_jobs=-1, random_state=random_state),
        'etc': ExtraTreesClassifier(
            n_estimators=200,
            n_jobs=-1,
            max_features='sqrt',
            min_samples_leaf=1,
            max_depth=5,
            random_state=random_state),
        'gbc': GradientBoostingClassifier(random_state=random_state),
        'lr': LogisticRegression(n_jobs=-1, random_state=random_state),
        'pac': PassiveAggressiveClassifier(
            n_jobs=-1,
            random_state=random_state),
        'rc': RidgeClassifier(random_state=random_state),
        # The naive Bayes and neighbour-based models take no random_state.
        'mnb': MultinomialNB(),
        'bnb': BernoulliNB(),
        'knc': KNeighborsClassifier(n_jobs=-1),
        'nc': NearestCentroid(),
        'mlp': MLPClassifier(
            hidden_layer_sizes=(100,),
            activation='relu',
            solver='adam',
            alpha=0.0002,
            learning_rate='invscaling',
            max_iter=300,
            random_state=random_state),
        'dtc': DecisionTreeClassifier(random_state=random_state),
        'etc2': ExtraTreeClassifier(random_state=random_state),
        'svc_rbf': SVC(random_state=random_state),
    }

    # Iterate in key order so the printed rows always appear in the same order.
    for classifier_key in sorted(classifiers):
        mean_eval = run(
            k=k,
            labels=labels,
            classifier=classifiers[classifier_key])

        print('%s,%s,%s,%s,%s\n' % (
            classifier_key,
            mean_eval['accuracy'],
            mean_eval['precision'],
            mean_eval['recall'],
            mean_eval['f1_score']))

### Voľba čŕt

In [4]:
# Feature columns used as classifier input. Entries that are commented out
# are excluded from the experiment but kept for reference.
# NOTE(review): the meaning of the '#H' marker on disabled entries is not
# documented in this notebook.
labels = [
    # --- columns without a 'diff' / 'thread' / 'user' prefix ---
    # 'after', #H
    'badWordsCount',
    'badWordsRatio',
    # 'before', #H
    'capitalLetterRatio',
    'capitalWordRatio',
    'diversityScore',
    'emoticonCount',
    'emoticonToWordRatio',
    'firstPronounsCount',
    'firstPronounsToWordRatio',
    'hatewordsCount',
    'hatewordsRatio',
    'insultsCount',
    'insultsRatio',
    # 'isComment', #H
    'negativeCoefficient',
    'neutralCoefficient',
    'positiveCoefficient',
    # 'profanityWindow2',
    # 'profanityWindow3',
    # 'profanityWindow4',
    # 'profanityWindow5',
    'punctuationRatio',
    'readabilityScore',
    'secondPronounsCount',
    'secondPronounsToWordRatio',
    'sentimentLabel',
    'textDisplayLength',
    'textDisplayProcessedLength',
    'textDisplayProcessedWordsCount',
    'textDisplayWordsCount',
    'totalReplyCount',

    # --- 'diff*' columns (all disabled) ---
    # 'diffBadWordsCount', #H
    # 'diffBadWordsRatio', #H
    # 'diffCapitalLetterRatio', #H
    # 'diffCapitalWordRatio', #H
    # 'diffEmoticonCount', #H
    # 'diffEmoticonToWordRatio', #H
    # 'diffFirstPronounsCount', #H
    # 'diffFirstPronounsToWordRatio', #H
    # 'diffHatewordsCount', #H
    # 'diffHatewordsRatio', #H
    # 'diffInsultsCount', #H
    # 'diffInsultsRatio', #H
    # 'diffNegativeCoefficient', #H
    # 'diffNeutralCoefficient', #H
    # 'diffPositiveCoefficient', #H
    # 'diffProfanityWindow2', #H
    # 'diffProfanityWindow3', #H
    # 'diffProfanityWindow4', #H
    # 'diffProfanityWindow5', #H
    # 'diffPunctuationRatio', #H
    # 'diffSecondPronounsCount', #H
    # 'diffSecondPronounsToWordRatio', #H
    # 'diffTextDisplayLength', #H
    # 'diffTextDisplayProcessedLength', #H
    # 'diffTextDisplayProcessedWordsCount', #H
    # 'diffTextDisplayWordsCount', #H

    # --- 'thread*' columns (all disabled) ---
    # 'threadAnger', #H
    # 'threadDisgust', #H
    # 'threadFear', #H
    # 'threadJoy', #H
    # 'threadSadness', #H
    # 'threadAnalytical', #H
    # 'threadConfident', #H
    # 'threadTentative', #H
    # 'threadOpennessBig5', #H
    # 'threadConscientiousnessBig5', #H
    # 'threadExtraversionBig5', #H
    # 'threadAgreeablenessBig5', #H
    # 'threadEmotionalRangeBig5', #H

    # --- emotion, tone and Big5 columns ---
    'anger',
    'disgust',
    'fear',
    'joy',
    'sadness',
    'analytical',
    'confident',
    'tentative',
    'opennessBig5',
    'conscientiousnessBig5',
    'extraversionBig5',
    'agreeablenessBig5',
    'emotionalRangeBig5',
    'likeCount',
    # 'diffLikeCount', #H

    # --- 'userAverage*' and 'userNumberOf*' columns ---
    'userAverageBadWordsCount',
    'userAverageBadWordsRatio',
    'userAverageCapitalLetterRatio',
    'userAverageCapitalWordRatio',
    'userAverageEmoticonCount',
    'userAverageEmoticonToWordRatio',
    'userAverageFirstPronounsCount',
    'userAverageFirstPronounsToWordRatio',
    'userAverageHatewordsCount',
    'userAverageHatewordsRatio',
    'userAverageInsultsCount',
    'userAverageInsultsRatio',
    'userAverageLikeCount',
    'userAverageNegativeCoefficient',
    'userAverageNeutralCoefficient',
    'userAveragePositiveCoefficient',
    'userAverageProfanityWindow2',
    # 'userAverageProfanityWindow3',
    # 'userAverageProfanityWindow4',
    # 'userAverageProfanityWindow5',
    'userAveragePunctuationRatio',
    'userAverageSecondPronounsCount',
    'userAverageSecondPronounsToWordRatio',
    'userAverageTextDisplayLength',
    'userAverageTextDisplayProcessedLength',
    'userAverageTextDisplayProcessedWordsCount',
    'userAverageTextDisplayWordsCount',
    'userNumberOfComments',
    'userNumberOfContent',
    'userNumberOfReplies',
]

### Zavolanie funkcie
Vyhodnotenie všetkých klasifikátorov môže trvať niekoľko minút.

In [9]:
# Compare all classifiers over 10 train/test file pairs, using the feature
# columns selected in `labels`; results are printed as comma-separated rows.
test_all_classifiers(k=10, labels=labels)

classifier,accuracy,precision,recall,f1

bc,0.7801038623823433,0.49269841269841275,0.3060344488672662,0.3703773497866451

bnb,0.6495456020772477,0.34472696652629764,0.6666412354415451,0.44970485927415976

dc,0.49870172022070747,0.21506241392944775,0.4944172932330827,0.29891261366044886

dtc,0.7417721518987342,0.39167671046591496,0.4002220625092142,0.39210757562351334

etc,0.7775073028237586,0.5077620513301628,0.6088762347044081,0.5431337197564968

etc2,0.7352482960077898,0.36786435786435784,0.3177427023440956,0.3299345737276772

gbc,0.800519311911717,0.5519329100850839,0.5361819991154356,0.5347536230299457

knc,0.60238558909445,0.2719005602240897,0.4965216963978573,0.3483383587031003

lr,0.7070431678026616,0.40739427552886626,0.6657214727996461,0.4969146321452791





mlp,0.7711619604024668,0.48239112710474946,0.5191291341097843,0.48956961650684805

mnb,0.704479065238559,0.3972890899949723,0.6414450955820924,0.48399478847155547

nc,0.646965271015904,0.3355098820251622,0.6275352597179222,0.43395181779343267

pac,0.5248296007789678,0.36282386415501616,0.6593522408963586,0.34857049706611104

rc,0.717315806556313,0.4170543208575663,0.6713260479630448,0.5068632508840326

rfc,0.8223141837065887,0.7137606837606837,0.35077122954444934,0.44834160785290056

svc_rbf,0.6532943849399545,0.34754536701924843,0.6472125165855817,0.4477720638137449

