In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [3]:
import string

def load_data(filename="../data/formspring_data_fixed.pkl"):
    """Load the pickled Formspring records and return their texts and labels.

    Parameters
    ----------
    filename : str, optional
        Path of the pickle file to read. The object stored in it must
        support ``len()`` and integer indexing, and every record must expose
        the keys ``'text'`` (a ``str``) and ``'label'``. Defaults to the
        location this notebook reads from.

    Returns
    -------
    tuple(list, list)
        ``(x_text, labels)`` where ``x_text`` holds each record's text
        encoded as UTF-8 bytes (punctuation is left untouched) and ``labels``
        holds the corresponding ``'label'`` values, both in record order.

    Raises
    ------
    OSError
        If ``filename`` cannot be opened.
    """
    print("Loading data from file: " + filename)
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point this at files from a trusted source.
    # The context manager guarantees the file handle is closed again.
    with open(filename, 'rb') as handle:
        data = pickle.load(handle)

    # Index-based access is kept so any container supporting len()/[i] works.
    records = [data[i] for i in range(len(data))]
    x_text = [record['text'].encode('utf-8') for record in records]
    labels = [record['label'] for record in records]
    return x_text, labels

In [4]:
# Read the corpus once: `x_text` receives the UTF-8 encoded texts and
# `labels` the matching label values returned by load_data().
x_text, labels = load_data()

Loading data from file: ../data/formspring_data_fixed.pkl


In [5]:
# Experiment grid: every text representation is paired with every model key.
representations = [
    "character n-grams",
    "word unigrams",
]
# Short keys understood by get_model(); the order here is the evaluation order.
models = [
    "lr",
    "svm",
    "rf",
    "nb",
]
# Number of cross-validation splits used for each (representation, model) pair.
n_folds = 10

In [6]:
def get_model(m_type):
    """Build a fresh, unfitted classifier for the given short model key.

    Parameters
    ----------
    m_type : str
        One of ``'lr'`` (LogisticRegression, balanced class weights),
        ``'nb'`` (MultinomialNB), ``'rf'`` (RandomForestClassifier with 100
        trees, all cores) or ``'svm'`` (LinearSVC, balanced class weights).

    Returns
    -------
    estimator or None
        A new estimator instance, or ``None`` (after printing an error
        message) when ``m_type`` is not a recognised key.
    """
    # Factories are wrapped in lambdas so only the requested estimator is
    # ever constructed.
    builders = {
        'lr': lambda: LogisticRegression(class_weight="balanced"),
        'nb': lambda: MultinomialNB(),
        'rf': lambda: RandomForestClassifier(n_estimators=100, n_jobs=-1),
        'svm': lambda: LinearSVC(class_weight="balanced"),
    }
    builder = builders.get(m_type)
    if builder is None:
        print("ERROR: Please specify a correct model (got %r, expected one of %s)"
              % (m_type, sorted(builders)))
        return None
    return builder()

In [12]:
def get_scores(y_true, y_pred):
    """Compute per-class precision, recall and F1 for one set of predictions.

    Relies on ``precision_score``, ``recall_score`` and ``f1_score`` from
    ``sklearn.metrics`` being imported at the top of the notebook.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    numpy.ndarray
        Three stacked rows in the order precision, recall, F1. Each row is
        what the metric returns with ``average=None``, i.e. one value per
        class rather than a single aggregate.
    """
    return np.array([
        precision_score(y_true, y_pred, average=None),
        recall_score(y_true, y_pred, average=None),
        f1_score(y_true, y_pred, average=None),
    ])

def print_scores(scores, n_class=None):
    """Print mean and spread (2 * std) of precision/recall/F1 per class.

    Class 0 is skipped; only classes ``1 .. n_class - 1`` are reported.

    Parameters
    ----------
    scores : array-like
        Either a 2-D array of shape ``(n_runs, 3 * n_class)`` whose columns
        are laid out as ``[precision per class | recall per class | F1 per
        class]``, or a 3-D array of shape ``(n_runs, 3, n_class)``, which is
        flattened row by row into that same column layout.
    n_class : int, optional
        Number of classes. When omitted it is inferred as a third of the
        column count, so the function does not depend on a notebook-level
        global being defined.
    """
    scores = np.asarray(scores)
    if scores.ndim == 3:
        # (runs, 3, classes) -> (runs, 3 * classes): precision block first,
        # then recall, then F1.
        scores = scores.reshape(scores.shape[0], -1)
    if n_class is None:
        n_class = scores.shape[1] // 3

    # (label, column offset of that metric's block)
    metrics = (("Precision", 0), ("Recall", n_class), ("F1_score", 2 * n_class))
    for i in range(1, n_class):
        for label, offset in metrics:
            column = scores[:, offset + i]
            print("%s Class %d (avg): %0.3f (+/- %0.3f)"
                  % (label, i, column.mean(), column.std() * 2))


In [13]:
def train_eval(modelname, features, labels):
    """Cross-validate one classifier and return its mean F1 score.

    Parameters
    ----------
    modelname : str
        Model key passed to ``get_model``.
    features : indexable matrix
        Feature matrix with one row per sample; must support row selection
        with an integer index array.
    labels : sequence
        Labels aligned with the rows of ``features``.

    Returns
    -------
    float
        ``np.mean`` over the F1 scores collected from all ``n_folds`` folds.

    Raises
    ------
    ValueError
        If ``modelname`` is not a key ``get_model`` recognises.

    Notes
    -----
    Uses the notebook-level ``n_folds`` and ``KFold`` with its default
    settings (no shuffling), and prints the 1-based fold number as progress.
    """
    model = get_model(modelname)
    if model is None:
        # Fail with a clear message instead of an AttributeError on .fit().
        raise ValueError("unknown model name: %r" % (modelname,))

    kfoldcv = KFold(n_splits=n_folds)
    labels = np.asarray(labels)  # converted once instead of twice per fold
    scores = []
    for fold, (train_index, test_index) in enumerate(kfoldcv.split(features), start=1):
        print(fold)
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model.fit(X=X_train, y=y_train)
        y_pred = model.predict(X_test)
        # Score against this fold's held-out labels.
        # NOTE(review): average=None yields one F1 per class, so the mean
        # below averages over classes as well as folds -- confirm that this
        # is the intended summary statistic.
        scores.append(f1_score(y_test, y_pred, average=None))
    return np.mean(scores)

In [11]:
# Benchmark every model on every text representation and collect the mean F1
# values in `df`, a Series indexed by (representation, model).
# NOTE(review): the vectorizer and the TF-IDF weights are fitted on the whole
# corpus before train_eval() splits it into folds.
results = []

for rep in representations:
    # Choose the raw count vectorizer for this representation.
    if rep != "word unigrams":
        transformer = CountVectorizer(analyzer="char", ngram_range=(1, 2))
    else:
        transformer = CountVectorizer(analyzer="word", stop_words="english")

    # Raw counts -> TF-IDF weighted features.
    count_textdata = transformer.fit_transform(x_text)
    features = TfidfTransformer().fit_transform(count_textdata)

    # One mean-F1 entry per model, in the order given by `models`.
    scores = []
    for modelname in models:
        print(rep, modelname)
        scores.append(train_eval(modelname, features, labels))

    results.append(pd.Series(scores, index=models))

df = pd.concat(results, axis=0, keys=representations)

character n-grams lr
1
0.5186567164179106
2
0.13740458015267176
3
0.378125
4
0.2877192982456141
5
0.06217616580310881
6
0.23448275862068968
7
0.21818181818181817
8
0.3186440677966102
9
0.175
10
0.10457516339869281
character n-grams svm
1
0.5352112676056338
2
0.140625
3
0.39672131147540984
4
0.29739776951672864
5
0.053763440860215055
6
0.22535211267605632
7
0.21021021021021022
8
0.30136986301369867
9
0.19909502262443438
10
0.08536585365853658
character n-grams rf
1
0.0
2
0.0
3
0.011904761904761906
4
0.034782608695652174
5
0.0
6
0.0
7
0.0
8
0.031746031746031744
9
0.0
10
0.0
character n-grams nb
1
0.0
2
0.0
3
0.05050505050505051
4
0.033333333333333326
5
0.0
6
0.0
7
0.0
8
0.0
9
0.0
10
0.0
word unigrams lr
1
0.7032967032967032
2
0.2448979591836735
3
0.512396694214876
4
0.44816053511705684
5
0.057971014492753624
6
0.4067796610169491
7
0.3383084577114428
8
0.5652173913043479
9
0.34615384615384615
10
0.21818181818181817
word unigrams svm
1
0.6624203821656052
2
0.15789473684210525
3
0.437086092

In [9]:
# Show the mean F1 returned by train_eval() for each (representation, model) pair.
df

character n-grams  lr     0.243497
                   svm    0.244511
                   rf     0.009537
                   nb     0.008384
word unigrams      lr     0.384136
                   svm    0.353087
                   rf     0.086623
                   nb     0.001170
dtype: float64

In [None]:
# Name the score Series so the CSV value column gets a meaningful header,
# then write it out with both index levels labelled.
df.name = "F1_Formspring"
results_frame = df.to_frame()
results_frame.to_csv(
    "results_formspring.csv",
    index_label=["processing", "model"],
)