In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
data = pd.read_csv("data.csv")
data

Unnamed: 0,text,chad,dance,dank,funny,horny,monka,pain,pog,sage,weird,wholesome
0,ez clap pepelaugh trueing widepeeposad peepohe...,0,0,0,0,0,0,1,0,1,0,0
1,ez omegalul docarrive peepoglad catkiss pogu e...,1,0,0,0,0,0,0,0,0,0,0
2,poggers pogu pepelaugh pagman pogu pagchomp po...,0,0,0,0,0,0,0,1,0,0,0
3,modcheck peepers peepers omegalul d: d: peeper...,0,0,0,0,0,0,0,0,1,0,0
4,dankies pausechamp tensesmash man pepega pepeg...,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1717,weirdchamp bboomer weirdge ratjam weirdchamp p...,0,0,1,0,0,0,0,0,0,1,0
1718,weirdge docarrive weirdchamp d: weirdchamp pai...,0,0,1,0,0,0,0,0,0,1,0
1719,weirdchamp xqcditch wtff kkonaw weirdge sadge ...,0,0,1,0,0,0,0,0,0,1,0
1720,wtff waytoodank weirdchamp wtff wtff hyperclap...,0,0,1,0,0,0,0,0,0,1,0


In [2]:
label_cols = data.columns[2:]
def get_class_weight(data):
    class_weight = {}
    for num, col in enumerate(label_cols):
        if num not in class_weight:
            class_weight[col] = round((data[data[col] == 1][col].sum())/data.shape[0]*100,2)
    return class_weight
class_weight = get_class_weight(data)
print("Class weight total:", sum(class_weight.values()), "%\n\n", class_weight)

Class weight total: 120.8 %

 {'dance': 12.08, 'dank': 12.08, 'funny': 12.08, 'horny': 12.08, 'monka': 12.08, 'pain': 12.08, 'pog': 12.08, 'sage': 12.08, 'weird': 12.08, 'wholesome': 12.08}


In [3]:
X_data = data["text"]
y_data = data.loc[::, data.columns != "text"]
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=.25, train_size=.75)

In [4]:
X_train.shape

(1291,)

In [7]:
def train_classifier(X_train, y_train, C, reg):
    model = OneVsRestClassifier(LogisticRegression(penalty=reg, C=C, max_iter=10000)).fit(X_train, y_train)
    return model

In [8]:
def tfidf_features(X_train, X_test):
    tfidf = TfidfVectorizer()
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    return X_train_tfidf, X_test_tfidf, tfidf.vocabulary_

X_train_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_test)
tfidf_reversed = {i:word for word,i in tfidf_vocab.items()}


In [9]:

tfidf_classifier = train_classifier(X_train_tfidf, y_train, C=4, reg="l2")
y_pred_labels = tfidf_classifier.predict(X_test_tfidf)
y_pred_scores = tfidf_classifier.decision_function(X_test_tfidf)

def print_eval_scores(y_test, y_pred_labels):
    print(f"Accuracy: {accuracy_score(y_test, y_pred_labels)}")
    print(f"F1-score macro: {f1_score(y_test, y_pred_labels, average='macro')}")
    print(f"F1-score micro: {f1_score(y_test, y_pred_labels, average='micro')}")
    print(f"F1-score weighted: {f1_score(y_test, y_pred_labels, average='weighted')}")
    print(f"Precision macro: {average_precision_score(y_test, y_pred_labels, average='macro')}")
    print(f"Precision micro: {average_precision_score(y_test, y_pred_labels, average='micro')}")
    print(f"Precision weighted: {average_precision_score(y_test, y_pred_labels, average='weighted')}")

print_eval_scores(y_test, y_pred_labels)

Accuracy: 0.9767981438515081
F1-score macro: 0.9903671112182754
F1-score micro: 0.9903593339176162
F1-score weighted: 0.9902616094943085
Precision macro: 0.983426681879831
Precision micro: 0.9832229633926269
Precision weighted: 0.9832713005929362


In [10]:
test_preds = tfidf_classifier.predict(X_test_tfidf)
def get_pred_labels(data, y_pred):
    label_cols = list(data.columns[1:])
    label_dict = {}
    for k,v in enumerate(label_cols):
        label_dict[k] = v
    preds = []
    for pred in y_pred:
        label_pred = []
        
        for index, label in enumerate(list(pred)):
            if label != 0:
                label = label_dict[index]
            label_pred.append(label)
        preds.append(tuple([i for i in label_pred if i != 0]))
    return preds

test_pred_labels = get_pred_labels(data, test_preds)
test_labels = get_pred_labels(data, y_test.to_numpy())
for i in range(0, len(test_labels)):
    print(f"\ny_label: {test_labels[i]}, \ny_pred: {test_pred_labels[i]}")


y_label: ('pain', 'sage'), 
y_pred: ('pain', 'sage')

y_label: ('dance', 'pain'), 
y_pred: ('dance', 'pain')

y_label: ('dance', 'weird'), 
y_pred: ('dance', 'weird')

y_label: ('chad', 'sage'), 
y_pred: ('chad', 'sage')

y_label: ('pog', 'sage'), 
y_pred: ('pog', 'sage')

y_label: ('funny',), 
y_pred: ('funny',)

y_label: ('funny',), 
y_pred: ('funny',)

y_label: ('funny', 'wholesome'), 
y_pred: ('funny', 'wholesome')

y_label: ('funny',), 
y_pred: ('funny',)

y_label: ('pog',), 
y_pred: ('pog',)

y_label: ('horny', 'pain'), 
y_pred: ('horny', 'pain')

y_label: ('pain', 'pog'), 
y_pred: ('pain', 'pog')

y_label: ('dank',), 
y_pred: ('dank',)

y_label: ('horny', 'monka'), 
y_pred: ('horny', 'monka')

y_label: ('dank', 'funny'), 
y_pred: ('dank', 'funny')

y_label: ('dance', 'pog'), 
y_pred: ('dance', 'pog')

y_label: ('monka',), 
y_pred: ('monka',)

y_label: ('weird',), 
y_pred: ('weird',)

y_label: ('monka',), 
y_pred: ('monka',)

y_label: ('pain',), 
y_pred: ('pain',)

y_label: ('da