In [7]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [8]:
def get_data(path, id_col):
    """Read the comment texts and attack labels from a CSV file.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    id_col : str
        Name of the column that becomes the DataFrame index.

    Returns
    -------
    tuple
        ``(x_text, labels)``: the values of the ``'comment'`` column with
        missing entries replaced by empty strings, and the values of the
        ``'attack'`` column.
    """
    frame = pd.read_csv(path, index_col=id_col)
    # Missing comments become "" so every entry is a string downstream.
    comments = frame['comment'].fillna("")
    return comments.values, frame['attack'].values

In [10]:
# CSV file to load, given relative to the notebook's working directory.
path = '../data/formspring_data_preprocessed.csv'
# Column that get_data uses as the DataFrame index.
id_col = 'userid'
# x_text: 'comment' values with missing entries replaced by "";
# labels: values of the 'attack' column.
x_text, labels = get_data(path, id_col)

In [26]:
# Text representations to compare. The experiment loop treats every value
# other than "word unigrams" as character 1-2 grams.
representations = ["character n-grams", "word unigrams"]
# Short keys understood by get_model.
models = ["lr", "svm", "rf", "nb"]
# Number of cross-validation splits read by train_eval.
n_folds = 10

In [27]:
def get_model(name):
    """Build a new, unfitted classifier for a short model key.

    Parameters
    ----------
    name : str
        ``"lr"`` (LogisticRegression), ``"svm"`` (LinearSVC),
        ``"rf"`` (RandomForestClassifier with ``n_jobs=-1``) or
        ``"nb"`` (MultinomialNB).

    Returns
    -------
    estimator or None
        The freshly constructed classifier, or ``None`` when ``name`` is
        not one of the keys above.
    """
    if name == "lr":
        return LogisticRegression()
    if name == "svm":
        return LinearSVC()
    if name == "rf":
        return RandomForestClassifier(n_jobs=-1)
    if name == "nb":
        return MultinomialNB()
    # Unrecognised key: callers receive None rather than an exception.
    return None

In [28]:
def train_eval(modelname, features, labels, n_splits=None):
    """Cross-validate one classifier and return its mean F1 score.

    Parameters
    ----------
    modelname : str
        Key handed to ``get_model`` to build the classifier.
    features : matrix-like
        Feature matrix with one row per sample; it must support indexing
        with an integer index array (``features[train_index]``).
    labels : array-like
        One label per row of ``features``. Converted with ``np.asarray`` so
        the fold indices are always applied positionally.
    n_splits : int, optional
        Number of folds. When omitted, the notebook-level ``n_folds``
        setting is used.

    Returns
    -------
    float
        Mean of the per-fold ``f1_score`` values.

    Raises
    ------
    ValueError
        If ``get_model`` does not recognise ``modelname``.
    """
    model = get_model(modelname)
    if model is None:
        # Fail here with a clear message instead of an AttributeError on
        # ``None.fit`` inside the fold loop.
        raise ValueError("unknown model name: {!r}".format(modelname))

    if n_splits is None:
        n_splits = n_folds

    labels = np.asarray(labels)
    # NOTE(review): KFold is used with its defaults and is not stratified;
    # consider StratifiedKFold if the positive class is rare.
    kfoldcv = KFold(n_splits=n_splits)

    scores = []
    for n, (train_index, test_index) in enumerate(kfoldcv.split(features), start=1):
        print(n)  # progress indicator: 1-based fold number
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        # The same estimator object is re-fitted on each fold's training part.
        model.fit(X=X_train, y=y_train)
        y_pred = model.predict(X_test)
        scores.append(f1_score(y_test, y_pred))
    return np.mean(scores)

In [30]:
# One pd.Series of mean F1 scores (indexed by model key) per representation.
results = []

for rep in representations:
    # Word unigrams drop English stop words; every other representation
    # name is vectorised as character 1-2 grams.
    transformer = (
        CountVectorizer(analyzer="word", stop_words="english")
        if rep == "word unigrams"
        else CountVectorizer(analyzer="char", ngram_range=(1, 2))
    )

    # Raw counts re-weighted by TF-IDF; both steps are fitted on all of x_text.
    features = TfidfTransformer().fit_transform(transformer.fit_transform(x_text))

    scores = []
    for modelname in models:
        print(rep, modelname)
        scores.append(train_eval(modelname, features, labels))

    results.append(pd.Series(scores, index=models))

# Stack the per-representation series under a (representation, model) index.
df = pd.concat(results, axis=0, keys=representations)

character n-grams lr
1
2
3
4
5
6
7
8
9
10
character n-grams svm
1
2
3
4
5
6
7
8
9
10
character n-grams rf
1
2
3
4
5
6
7
8
9
10
character n-grams nb
1
2
3
4
5
6
7
8
9
10
word unigrams lr
1
2
3
4
5
6
7
8
9
10
word unigrams svm
1
2
3
4
5
6
7
8
9
10
word unigrams rf
1
2
3
4
5
6
7
8
9
10
word unigrams nb
1
2
3
4
5
6
7
8
9
10


In [31]:
# Display the mean F1 score for every (representation, model) pair.
df

character n-grams  lr     0.061991
                   svm    0.145293
                   rf     0.006040
                   nb     0.009103
word unigrams      lr     0.124307
                   svm    0.293884
                   rf     0.076190
                   nb     0.001183
dtype: float64

In [None]:
# Name the series so the score column in the CSV is labelled "F1_Formspring".
df.name = "F1_Formspring"
# Write the Formspring scores to a Formspring-specific file. The previous
# target, "results_wiki.csv", did not match this dataset and would have
# clobbered another experiment's results.
df.to_frame().to_csv("results_formspring.csv", index_label=["processing", "model"])