In [2]:
print("hello world")

hello world


In [3]:
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.ensemble  import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import preprocessor as p

In [4]:
models = [ 'svm', 'naive', 'lr', 'random_forest']
NO_OF_FOLDS = 10
MODEL_TYPE = "all"
HASH_REMOVE = None

In [5]:
def load_data(filename):
    data = pickle.load(open(filename, 'rb'))
    x_text = []
    labels = []
    for i in range(len(data)):
        if(HASH_REMOVE):
            x_text.append(p.tokenize((data[i]['text']).encode('utf-8')))
        else:
            x_text.append(data[i]['text'])
        labels.append(data[i]['label'])
    return x_text,labels

def get_filename(dataset):
    global N_CLASS, HASH_REMOVE
    if(dataset=="twitter"):
        filename = "../data/twitter_data.pkl"
        N_CLASS = 3
        HASH_REMOVE = False
    elif(dataset=="formspring"):
        N_CLASS = 2
        filename = "../data/formspring_data.pkl"
        HASH_REMOVE = False
    elif(dataset=="wiki"):
        N_CLASS = 2
        filename = "../data/wiki_data.pkl"
        HASH_REMOVE = False
    return filename

In [6]:
def get_scores(y_true, y_pred):
#     if(data=="wiki"):
#         auc = roc_auc_score(y_true,y_pred)
#         print('Test ROC AUC: %.3f' %auc)
#     print(":: Confusion Matrix")
#     print(confusion_matrix(y_true, y_pred))
#     print(":: Classification Report")
#     print(classification_report(y_true, y_pred))
    return np.array([
            precision_score(y_true, y_pred, average=None),
            recall_score(y_true, y_pred,  average=None),
            f1_score(y_true, y_pred, average=None)])

def print_scores(scores):
    for i in range(N_CLASS):
        if(i!=0):
            print("Precision Class %d (avg): %0.3f (+/- %0.3f)" % (i,scores[:, i].mean(), scores[:, i].std() * 2))
            print("Recall Class %d (avg): %0.3f (+/- %0.3f)" % (i,scores[:,  N_CLASS+i].mean(), scores[:,N_CLASS+i].std() * 2))
            print("F1_score Class %d (avg): %0.3f (+/- %0.3f)" % (i,scores[:, N_CLASS*2+i].mean(), scores[:,  N_CLASS*2+i].std() * 2))


In [7]:
def classification_model(X, Y, model_type):
    X, Y = shuffle(X, Y, random_state=42)
    print("Model Type:", model_type)
    kf = KFold(n_splits=NO_OF_FOLDS)
    scores = []
    for train_index, test_index in kf.split(X):
        Y = np.asarray(Y)
        model = get_model(model_type)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        curr_scores = get_scores(y_test, y_pred)
        scores.append(np.hstack(curr_scores))
    print_scores(np.array(scores))
    return scores

In [8]:
def get_model(m_type):
    if m_type == 'lr':
        logreg = LogisticRegression(class_weight="balanced")
    elif m_type == 'naive':
        logreg =  MultinomialNB()
    elif m_type == "random_forest":
        logreg = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    elif m_type == "svm":
        logreg = LinearSVC(class_weight="balanced")
    else:
        print("ERROR: Please specify a correst model")
        return None
    return logreg

In [9]:
def train(x_text, labels, MODEL_TYPE):

    if(WORD):
        print("Using word based features")
        bow_transformer = CountVectorizer(analyzer="word",max_features = 10000,stop_words='english').fit(x_text)
        comments_bow = bow_transformer.transform(x_text)
        tfidf_transformer = TfidfTransformer(norm = 'l2').fit(comments_bow)
        comments_tfidf = tfidf_transformer.transform(comments_bow)
        features = comments_tfidf
    else:
        print("Using char n-grams based features")
        bow_transformer = CountVectorizer(max_features = 10000, ngram_range = (1,2)).fit(x_text)
        comments_bow = bow_transformer.transform(x_text)
        tfidf_transformer = TfidfTransformer(norm = 'l2').fit(comments_bow)
        comments_tfidf = tfidf_transformer.transform(comments_bow)
        features = comments_tfidf

    if(data == "twitter"):
        dict1 = {'racism':0,'sexism':1,'none':2}
        labels = np.array([dict1[b] for b in labels])

    from collections import Counter
    print(Counter(labels))

    allscores = []

    if(MODEL_TYPE != "all"):
        scores = classification_model(features, labels, MODEL_TYPE)
        allscores.append(scores)
    else:
        for model_type in models:
            scores = classification_model(features, labels, model_type)
            allscores.append(scores)
    return allscores

In [10]:
data = "formspring"
WORD =  False
x_text, labels = load_data(get_filename(data))
print ("Data loaded!")
formspring_ngram_scores = train(x_text, labels, MODEL_TYPE)

Data loaded!
Using char n-grams based features
Counter({0: 11997, 1: 776})
Model Type: svm
Precision Class 1 (avg): 0.466 (+/- 0.109)
Recall Class 1 (avg): 0.503 (+/- 0.122)
F1_score Class 1 (avg): 0.483 (+/- 0.104)
Model Type: naive
Precision Class 1 (avg): 0.850 (+/- 0.640)
Recall Class 1 (avg): 0.015 (+/- 0.015)
F1_score Class 1 (avg): 0.030 (+/- 0.028)
Model Type: lr


  _warn_prf(average, modifier, msg_start, len(result))


Precision Class 1 (avg): 0.410 (+/- 0.100)
Recall Class 1 (avg): 0.625 (+/- 0.135)
F1_score Class 1 (avg): 0.494 (+/- 0.106)
Model Type: random_forest
Precision Class 1 (avg): 0.777 (+/- 0.174)
Recall Class 1 (avg): 0.165 (+/- 0.073)
F1_score Class 1 (avg): 0.269 (+/- 0.100)


In [11]:
data = "formspring"
WORD = True
x_text, labels = load_data(get_filename(data))
print ("Data loaded!")
formspring_unigram_scores = train(x_text, labels, MODEL_TYPE)

Data loaded!
Using word based features
Counter({0: 11997, 1: 776})
Model Type: svm
Precision Class 1 (avg): 0.415 (+/- 0.089)
Recall Class 1 (avg): 0.525 (+/- 0.132)
F1_score Class 1 (avg): 0.463 (+/- 0.100)
Model Type: naive
Precision Class 1 (avg): 0.575 (+/- 0.950)
Recall Class 1 (avg): 0.013 (+/- 0.029)
F1_score Class 1 (avg): 0.025 (+/- 0.055)
Model Type: lr


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision Class 1 (avg): 0.407 (+/- 0.078)
Recall Class 1 (avg): 0.617 (+/- 0.127)
F1_score Class 1 (avg): 0.490 (+/- 0.083)
Model Type: random_forest
Precision Class 1 (avg): 0.733 (+/- 0.232)
Recall Class 1 (avg): 0.157 (+/- 0.070)
F1_score Class 1 (avg): 0.257 (+/- 0.102)


In [12]:
data = "twitter"
WORD = False
x_text, labels = load_data(get_filename(data))
print ("Data loaded!")
twitter_ngram_scores = train(x_text, labels, MODEL_TYPE)

Data loaded!
Using char n-grams based features
Counter({2: 11036, 1: 3117, 0: 1937})
Model Type: svm
Precision Class 1 (avg): 0.786 (+/- 0.055)
Recall Class 1 (avg): 0.736 (+/- 0.067)
F1_score Class 1 (avg): 0.759 (+/- 0.040)
Precision Class 2 (avg): 0.891 (+/- 0.026)
Recall Class 2 (avg): 0.896 (+/- 0.020)
F1_score Class 2 (avg): 0.894 (+/- 0.013)
Model Type: naive
Precision Class 1 (avg): 0.910 (+/- 0.056)
Recall Class 1 (avg): 0.455 (+/- 0.089)
F1_score Class 1 (avg): 0.605 (+/- 0.082)
Precision Class 2 (avg): 0.804 (+/- 0.025)
Recall Class 2 (avg): 0.963 (+/- 0.008)
F1_score Class 2 (avg): 0.876 (+/- 0.014)
Model Type: lr


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Precision Class 1 (avg): 0.738 (+/- 0.043)
Recall Class 1 (avg): 0.772 (+/- 0.075)
F1_score Class 1 (avg): 0.754 (+/- 0.043)
Precision Class 2 (avg): 0.910 (+/- 0.027)
Recall Class 2 (avg): 0.851 (+/- 0.018)
F1_score Class 2 (avg): 0.879 (+/- 0.012)
Model Type: random_forest
Precision Class 1 (avg): 0.893 (+/- 0.041)
Recall Class 1 (avg): 0.557 (+/- 0.085)
F1_score Class 1 (avg): 0.686 (+/- 0.073)
Precision Class 2 (avg): 0.842 (+/- 0.033)
Recall Class 2 (avg): 0.949 (+/- 0.009)
F1_score Class 2 (avg): 0.892 (+/- 0.019)


In [13]:
data = "twitter"
WORD = True
x_text, labels = load_data(get_filename(data))
print ("Data loaded!")
twitter_unigram_scores = train(x_text, labels, MODEL_TYPE)

Data loaded!
Using word based features
Counter({2: 11036, 1: 3117, 0: 1937})
Model Type: svm
Precision Class 1 (avg): 0.803 (+/- 0.044)
Recall Class 1 (avg): 0.744 (+/- 0.052)
F1_score Class 1 (avg): 0.772 (+/- 0.037)
Precision Class 2 (avg): 0.893 (+/- 0.023)
Recall Class 2 (avg): 0.901 (+/- 0.018)
F1_score Class 2 (avg): 0.897 (+/- 0.009)
Model Type: naive
Precision Class 1 (avg): 0.904 (+/- 0.035)
Recall Class 1 (avg): 0.469 (+/- 0.056)
F1_score Class 1 (avg): 0.617 (+/- 0.051)
Precision Class 2 (avg): 0.806 (+/- 0.022)
Recall Class 2 (avg): 0.963 (+/- 0.007)
F1_score Class 2 (avg): 0.877 (+/- 0.011)
Model Type: lr


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Precision Class 1 (avg): 0.752 (+/- 0.045)
Recall Class 1 (avg): 0.785 (+/- 0.070)
F1_score Class 1 (avg): 0.768 (+/- 0.049)
Precision Class 2 (avg): 0.913 (+/- 0.024)
Recall Class 2 (avg): 0.860 (+/- 0.015)
F1_score Class 2 (avg): 0.886 (+/- 0.008)
Model Type: random_forest
Precision Class 1 (avg): 0.869 (+/- 0.037)
Recall Class 1 (avg): 0.648 (+/- 0.077)
F1_score Class 1 (avg): 0.742 (+/- 0.058)
Precision Class 2 (avg): 0.868 (+/- 0.029)
Recall Class 2 (avg): 0.935 (+/- 0.009)
F1_score Class 2 (avg): 0.900 (+/- 0.014)


In [14]:
data = "wiki"
WORD = False
x_text, labels = load_data(get_filename(data))
print ("Data loaded!")
wiki_ngram_scores = train(x_text, labels, MODEL_TYPE)

Data loaded!
Using char n-grams based features
Counter({0: 102274, 1: 13590})
Model Type: svm
Precision Class 1 (avg): 0.591 (+/- 0.025)
Recall Class 1 (avg): 0.823 (+/- 0.020)
F1_score Class 1 (avg): 0.688 (+/- 0.017)
Model Type: naive
Precision Class 1 (avg): 0.839 (+/- 0.010)
Recall Class 1 (avg): 0.554 (+/- 0.028)
F1_score Class 1 (avg): 0.667 (+/- 0.021)
Model Type: lr


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Precision Class 1 (avg): 0.602 (+/- 0.023)
Recall Class 1 (avg): 0.845 (+/- 0.022)
F1_score Class 1 (avg): 0.703 (+/- 0.017)
Model Type: random_forest
Precision Class 1 (avg): 0.886 (+/- 0.013)
Recall Class 1 (avg): 0.548 (+/- 0.035)
F1_score Class 1 (avg): 0.677 (+/- 0.027)


In [15]:
data = "wiki"
WORD = True
x_text, labels = load_data(get_filename(data))
print ("Data loaded!")
wiki_unigram_scores = train(x_text, labels, MODEL_TYPE)

Data loaded!
Using word based features
Counter({0: 102274, 1: 13590})
Model Type: svm
Precision Class 1 (avg): 0.590 (+/- 0.022)
Recall Class 1 (avg): 0.818 (+/- 0.026)
F1_score Class 1 (avg): 0.686 (+/- 0.019)
Model Type: naive
Precision Class 1 (avg): 0.899 (+/- 0.016)
Recall Class 1 (avg): 0.522 (+/- 0.036)
F1_score Class 1 (avg): 0.660 (+/- 0.027)
Model Type: lr


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision Class 1 (avg): 0.620 (+/- 0.027)
Recall Class 1 (avg): 0.834 (+/- 0.024)
F1_score Class 1 (avg): 0.711 (+/- 0.021)
Model Type: random_forest
Precision Class 1 (avg): 0.811 (+/- 0.026)
Recall Class 1 (avg): 0.663 (+/- 0.022)
F1_score Class 1 (avg): 0.729 (+/- 0.021)


In [47]:
list_of_results = []
for res in [formspring_ngram_scores, formspring_unigram_scores,
            twitter_ngram_scores, twitter_unigram_scores,
            wiki_ngram_scores, wiki_unigram_scores]:
    results = {}
    for model_n, model_results in enumerate(res):
        model_name = models[model_n]
        if len(model_results[0]) == 6:
            results[model_name] = np.array(model_results)[:, 5].mean()  # == N_CLASS (2) * 2 + 1
        else:
            results[model_name + "_racism"] = np.array(model_results)[:, 7].mean() # == N_CLASS (3)*2 + 1
            results[model_name + "_sexism"] = np.array(model_results)[:, 8].mean() # == N_CLASS (3)*2 + 2
    list_of_results.append(results)

In [48]:
# getting things into shape
import pandas as pd
result = pd.DataFrame(list_of_results).stack()
index = [("Formspring", "Character n-grams", "SVM", "Bully"),
         ("Formspring", "Character n-grams", "NB", "Bully"),
         ("Formspring", "Character n-grams", "LR", "Bully"),
         ("Formspring", "Character n-grams", "RF", "Bully"),
         ("Formspring", "Word unigrams", "SVM", "Bully"),
         ("Formspring", "Word unigrams", "NB", "Bully"),
         ("Formspring", "Word unigrams", "LR", "Bully"),
         ("Formspring", "Word unigrams", "RF", "Bully"),
         ("Twitter", "Character n-grams", "SVM", "Racism"),
         ("Twitter", "Character n-grams", "NB", "Racism"),
         ("Twitter", "Character n-grams", "LR", "Racism"),
         ("Twitter", "Character n-grams", "RF", "Racism"),
         ("Twitter", "Word unigrams", "SVM", "Racism"),
         ("Twitter", "Word unigrams", "NB", "Racism"),
         ("Twitter", "Word unigrams", "LR", "Racism"),
         ("Twitter", "Word unigrams", "RF", "Racism"),
         ("Twitter", "Character n-grams", "SVM", "Sexism"),
         ("Twitter", "Character n-grams", "NB", "Sexism"),
         ("Twitter", "Character n-grams", "LR", "Sexism"),
         ("Twitter", "Character n-grams", "RF", "Sexism"),
         ("Twitter", "Word unigrams", "SVM", "Sexism"),
         ("Twitter", "Word unigrams", "NB", "Sexism"),
         ("Twitter", "Word unigrams", "LR", "Sexism"),
         ("Twitter", "Word unigrams", "RF", "Sexism"),
         ("Wiki", "Character n-grams", "SVM", "Attack"),
         ("Wiki", "Character n-grams", "NB", "Attack"),
         ("Wiki", "Character n-grams", "LR", "Attack"),
         ("Wiki", "Character n-grams", "RF", "Attack"),
         ("Wiki", "Word unigrams", "SVM", "Attack"),
         ("Wiki", "Word unigrams", "NB", "Attack"),
         ("Wiki", "Word unigrams", "LR", "Attack"),
         ("Wiki", "Word unigrams", "RF", "Attack")]
newindex = pd.MultiIndex.from_tuples(index)
result.index = newindex
idx = pd.IndexSlice
formatted_result = result.unstack(level = [1,2]).loc[:,idx[["Character n-grams", "Word unigrams"],["LR", "SVM", "RF", "NB"]]]

In [49]:
formatted_result.to_csv("tradml_results_with_their_code_and_data.csv", index_label=["Dataset", "Label"])

In [50]:
test = pd.read_csv("tradml_results_with_their_code_and_data.csv", header=[0,1], index_col=[0,1])
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Character n-grams,Character n-grams,Character n-grams,Character n-grams,Word unigrams,Word unigrams,Word unigrams,Word unigrams
Unnamed: 0_level_1,Unnamed: 1_level_1,LR,SVM,RF,NB,LR,SVM,RF,NB
Dataset,Label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Formspring,Bully,0.493913,0.482654,0.269361,0.029881,0.489553,0.462755,0.25693,0.025116
Twitter,Racism,0.60519,0.759149,0.87647,0.893575,0.685532,0.753991,0.892127,0.879111
Twitter,Sexism,0.617423,0.771979,0.877226,0.896889,0.741574,0.767808,0.900223,0.885721
Wiki,Attack,0.703035,0.68753,0.676726,0.667236,0.710819,0.68577,0.72937,0.659835
