In [57]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [58]:
models = ["lr", "svm", "rf", "nb"]
representations = ["character n-grams", "word unigrams"]
n_folds = 10

In [59]:
def get_data(fname):
    df = pd.read_csv("../preprocessing/WikiPages/wikipedia_preprocessed.csv", index_col="rev_id")
    textdata = df["comment"].fillna("").values
    labels = df["attack"].astype(int).values
    return textdata, labels

In [60]:
def get_model(name):
    if name == "lr":
        model = LogisticRegression()
    elif name == "svm":
        model = LinearSVC()
    elif name == "rf":
        model = RandomForestClassifier(n_jobs=-1)
    elif name == "nb":
        model = MultinomialNB()
    else:
        return None
    return model

def evaluate_model(modelname, features, labels):
    model = get_model(modelname)
    kfoldcv = KFold(n_splits = n_folds)
    scores = []
    n = 0
    for train_index, test_index in kfoldcv.split(features):
        n += 1
        print(n)
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model.fit(X=X_train, y=y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        scores.append(f1)
    return np.mean(scores)

In [61]:
textdata, labels = get_data("data/wiki_data.pkl")

results = []
for rep in representations:
    if rep == "word unigrams":
        transformer = CountVectorizer(analyzer="word", stop_words="english")
    else:
        transformer = CountVectorizer(analyzer = "char", ngram_range=(1,2))
    count_textdata = transformer.fit_transform(textdata)
    freq_transfomer = TfidfTransformer()
    freq_textdata = freq_transfomer.fit_transform(count_textdata)
    features = freq_textdata

    scores = []
    for modelname in models:
        print(rep, modelname)
        meanf1 = evaluate_model(modelname, features, labels)
        scores.append(meanf1)

    results.append(pd.Series(scores, index=models))

df = pd.concat(results, axis=0, keys=representations)

character n-grams lr
1
2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


character n-grams svm
1
2
3
4
5
6
7
8
9
10
character n-grams rf
1
2
3
4
5
6
7
8
9
10
character n-grams nb
1
2
3
4
5
6
7
8
9
10
word unigrams lr
1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


2
3
4
5
6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


7
8
9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


10
word unigrams svm
1
2
3
4
5
6
7
8
9
10
word unigrams rf
1
2
3
4
5
6
7
8
9
10
word unigrams nb
1
2
3
4
5
6
7
8
9
10


In [66]:
df.name = "F1_WikiPages"
df.to_frame().to_csv("results_wiki.csv", index_label=["processing", "model"])