# Ukážka testovania pomocou co-trainingu

In [18]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import pandas as pd
from classifiers import CoTrainingClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

### Funkcia na testovanie pomocou co-trainingu

In [22]:
def cotrain(labels_1, labels_2, name_1, name_2, classifier_1, classifier_2, iterations=1, u=75, p=1, n=1, k_i=30):
    """Evaluate a co-training classifier on 10 pre-split train/test folds.

    For every fold index 0..9 the function reads
    ``cotrain/train/cotrain_data_train_<idx>.csv`` and
    ``cotrain/test/cotrain_data_test_<idx>.csv``, fits a
    ``CoTrainingClassifier`` on the two feature views of the training file
    and scores its predictions for the test file against the ``hateLabel``
    column. Accuracy, precision, recall and F1 are averaged over all
    ``iterations * 10`` runs, printed, and returned.

    Args:
        labels_1: Column names selecting the first feature view.
        labels_2: Column names selecting the second feature view.
        name_1: Label for the first classifier; accepted but not used here.
        name_2: Label for the second classifier; accepted but not used here.
        classifier_1: First estimator handed to ``CoTrainingClassifier``.
        classifier_2: Second estimator handed to ``CoTrainingClassifier``.
        iterations: Number of fit/predict repetitions per fold.
        u: Forwarded to ``CoTrainingClassifier`` as ``u``.
        p: Forwarded to ``CoTrainingClassifier`` as ``p``.
        n: Forwarded to ``CoTrainingClassifier`` as ``n``.
        k_i: Forwarded to ``CoTrainingClassifier`` as ``k``.

    Returns:
        dict: The averaged metrics under the keys ``'accuracy'``,
        ``'precision'``, ``'recall'`` and ``'f1'``.

    Raises:
        ZeroDivisionError: If ``iterations`` is 0 (no runs to average).
    """
    accuracy = 0.0
    precision = 0.0
    recall = 0.0
    f1 = 0.0

    # Number of pre-generated fold files on disk (indices 0..k-1).
    k = 10

    for idx in range(k):
        train_file = 'cotrain/train/cotrain_data_train_%d.csv' % idx
        test_file = 'cotrain/test/cotrain_data_test_%d.csv' % idx

        train_data = pd.read_csv(train_file)
        test_data = pd.read_csv(test_file)

        for i in range(iterations):
            # NOTE(review): the arrays are re-extracted on every iteration on
            # purpose; CoTrainingClassifier.fit is not visible here and may
            # modify the label array in place -- confirm before hoisting.
            y_test = test_data['hateLabel'].values
            y = train_data['hateLabel'].values

            X1 = train_data[labels_1].values
            X2 = train_data[labels_2].values
            X1_test = test_data[labels_1].values
            X2_test = test_data[labels_2].values

            lg_co_clf = CoTrainingClassifier(classifier_1, classifier_2, u=u, p=p, n=n, k=k_i)
            lg_co_clf.fit(X1, X2, y)
            y_pred = lg_co_clf.predict(X1_test, X2_test)

            accuracy += accuracy_score(y_test, y_pred)
            precision += precision_score(y_test, y_pred)
            recall += recall_score(y_test, y_pred)
            f1 += f1_score(y_test, y_pred)

            # Progress indicator: index of the repetition just finished.
            print(i)

    runs = iterations * k
    results = {
        'accuracy': accuracy / runs,
        'precision': precision / runs,
        'recall': recall / runs,
        'f1': f1 / runs,
    }

    print('Accuracy:\t' + str(results['accuracy']) + '\n')
    print('Precision:\t' + str(results['precision']) + '\n')
    print('Recall:\t\t' + str(results['recall']) + '\n')
    print('F1 score:\t' + str(results['f1']) + '\n\n')

    return results

### Funkcia na zavolanie co-trainingu pre špecifikované algoritmy, parametre a črty

In [20]:
def cotrain_play(u, p, n, k_i):
    """Run ``cotrain`` with a fixed pair of feature views and classifiers.

    The first view is evaluated with an ``ExtraTreesClassifier`` and the
    second with an ``AdaBoostClassifier`` boosting an identically configured
    ``ExtraTreesClassifier``. ``cotrain`` is called with ``iterations=1``.

    Args:
        u: Forwarded to ``cotrain`` as ``u``.
        p: Forwarded to ``cotrain`` as ``p``.
        n: Forwarded to ``cotrain`` as ``n``.
        k_i: Forwarded to ``cotrain`` as ``k_i``.
    """
    # Hyper-parameters shared by both ExtraTrees instances.
    tree_params = dict(
        n_estimators=200,
        n_jobs=-1,
        max_features='sqrt',
        min_samples_leaf=1,
        max_depth=5,
    )

    # Columns of the first feature view.
    view_1_columns = [
        'badWordsCount', 'badWordsRatio',
        'capitalLetterRatio', 'capitalWordRatio',
        'diversityScore',
        'firstPronounsCount',
        'hatewordsCount', 'hatewordsRatio',
        'insultsCount', 'insultsRatio',
        'neutralCoefficient',
        'punctuationRatio',
        'readabilityScore',
        'secondPronounsCount', 'secondPronounsToWordRatio',
        'sentimentLabel',
        'textDisplayLength', 'textDisplayProcessedLength',
        'textDisplayProcessedWordsCount', 'textDisplayWordsCount',
        'anger', 'disgust', 'fear', 'joy', 'sadness',
        'analytical', 'confident', 'tentative',
    ]

    # Columns of the second feature view.
    view_2_columns = [
        'likeCount', 'totalReplyCount', 'diffLikeCount',
        'userAverageBadWordsCount', 'userAverageBadWordsRatio',
        'userAverageCapitalLetterRatio', 'userAverageCapitalWordRatio',
        'userAverageEmoticonCount', 'userAverageEmoticonToWordRatio',
        'userAverageFirstPronounsCount', 'userAverageFirstPronounsToWordRatio',
        'userAverageHatewordsCount', 'userAverageHatewordsRatio',
        'userAverageInsultsCount', 'userAverageInsultsRatio',
        'userAverageLikeCount',
        'userAverageNegativeCoefficient', 'userAverageNeutralCoefficient',
        'userAveragePositiveCoefficient',
        'userAverageProfanityWindow2',
        'userAveragePunctuationRatio',
        'userAverageSecondPronounsCount', 'userAverageSecondPronounsToWordRatio',
        'userAverageTextDisplayLength', 'userAverageTextDisplayProcessedLength',
        'userAverageTextDisplayProcessedWordsCount', 'userAverageTextDisplayWordsCount',
        'userNumberOfComments', 'userNumberOfContent', 'userNumberOfReplies',
    ]

    first_clf = ExtraTreesClassifier(**tree_params)

    # NOTE(review): recent scikit-learn releases renamed ``base_estimator`` to
    # ``estimator``; kept as-is because the installed version is not known here.
    second_clf = AdaBoostClassifier(
        base_estimator=ExtraTreesClassifier(**tree_params),
        n_estimators=200,
    )

    cotrain(
        view_1_columns,
        view_2_columns,
        'ExtraTreesClassifier',
        'AdaBoostClassifier - ExtraTreesClassifier',
        first_clf,
        second_clf,
        iterations=1,
        u=u,
        p=p,
        n=n,
        k_i=k_i,
    )

### Funkcia na výpočet confusion matrix

In [24]:
def perf_measure(y_actual, y_hat):
    """Count the binary confusion-matrix cells for a set of predictions.

    Labels are compared against the literal values 1 (positive) and
    0 (negative); a prediction that is neither 1 nor 0 is not counted in
    any cell.

    Args:
        y_actual: Indexable sequence of true labels, read at positions
            ``0 .. len(y_hat) - 1``.
        y_hat: Sequence of predicted labels.

    Returns:
        tuple: ``(TP, FP, TN, FN)`` as integers.

    Raises:
        IndexError: If ``y_actual`` is a list/array shorter than ``y_hat``.
    """
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    # Single pass over the predictions; y_actual is indexed by position so
    # that a too-short y_actual still fails instead of being truncated.
    for i, predicted in enumerate(y_hat):
        actual = y_actual[i]
        if predicted == 1:
            if actual == predicted:
                TP += 1
            else:
                FP += 1
        elif predicted == 0:
            if actual == predicted:
                TN += 1
            else:
                FN += 1

    return TP, FP, TN, FN

### Zavolanie funkcie
Výpočet môže trvať niekoľko minút. Výstupom sú hodnoty accuracy, precision, recall, F1.

In [25]:
# Run the co-training evaluation with u=1000, p=8, n=2 and k_i=1.
cotrain_play(u=1000, p=8, n=2, k_i=1)

0
0
0
0
0
0
0
0
0
0
Accuracy:	0.756762820513

Precision:	0.474974707396

Recall:		0.676470588235

F1 score:	0.553776148837


