In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

path = Path("../public_data/")
train_df = pd.read_csv(path / "train" / "track_a" / "sun.csv")
test_df = pd.read_csv(path / "dev" / "track_a" / "sun_a.csv")

train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [2]:
labels = sorted(train_df.columns[2:])
labels

['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from scipy.sparse import vstack
import numpy as np

models = {}
features = {"BoW": CountVectorizer, "TFIDF": TfidfVectorizer}

for feature_name, feature in zip(features.keys(), features.values()):
    vectorizer = feature(tokenizer=word_tokenize, token_pattern=None)
    vectorizer.fit(train_df.text)

    X_train = vectorizer.transform(train_df.text)
    X_valid = vectorizer.transform(valid_df.text)

    y_train, y_valid = train_df[labels], valid_df[labels]

    classifiers = {
        "nb": MultinomialNB(),
        "svm": SVC(),
        "lr": LogisticRegression(),
    }

    param_grids = {
        "nb": {"estimator__alpha": np.linspace(0.001, 1, 50)},
        "svm": {"estimator__C": [0.01, 0.1, 1, 10, 100], "estimator__kernel": ["rbf", "linear"]},
        "lr": {"estimator__C": [0.01, 0.1, 1, 10, 100]},
    }

    for c in classifiers:
        X = vstack([X_train, X_valid])
        y = vstack([y_train, y_valid])

        ps = PredefinedSplit([-1] * len(y_train) + [0] * len(y_valid))
        clf = GridSearchCV(OneVsRestClassifier(classifiers[c]), param_grids[c], cv=ps, n_jobs=-1)
        clf.fit(X, y)

        y_pred = clf.predict(X_valid)
        f1 = f1_score(y_valid, y_pred, average="macro")

        models[f"{c}_{feature_name}"] = {
            "model": clf,
            "vectorizer": vectorizer,
            "f1": f1,
        }

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [4]:
best_model_name = max(models, key=lambda x: models[x]["f1"])
best_model_dict = models[best_model_name]
best_model, best_vectorizer = best_model_dict["model"], best_model_dict["vectorizer"]

In [5]:
X_test = best_vectorizer.transform(test_df.text)
y_pred = best_model.predict(X_test)

In [10]:
test_df[labels] = y_pred.toarray()

In [11]:
test_df

Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise
0,sun_dev_track_a_00001,"Aa aa salam kenal nya,ti abi 🙋 sumpah kot pise...",0,0,0,1,0,0
1,sun_dev_track_a_00002,"#Fauzi88Chanel hadir mang ti bogor kulon,Leuwi...",0,0,0,1,0,0
2,sun_dev_track_a_00003,Aslina ngakak kang. Teruskeun part salanjutna 🔥🔥🔥,0,0,0,1,0,0
3,sun_dev_track_a_00004,"Artis,artis naon nu resep lauk? Jawab:kucinta ...",0,0,0,1,0,0
4,sun_dev_track_a_00005,Awas mang tong hilap nya ka Bogor. Mun hante u...,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
194,sun_dev_track_a_00195,Alah siah mantep kieu ditunggu kolab nya bang,0,0,0,1,0,0
195,sun_dev_track_a_00196,Apa cuma urang nu ningali video ieu kalah ge s...,0,0,0,1,0,0
196,sun_dev_track_a_00197,"Aduhh gamon ini mah,jadi ingat ka mantan, azmy...",0,0,0,1,0,0
197,sun_dev_track_a_00198,Aduh aduh masa Allah NU d antos antos a dana s...,0,0,0,1,0,0


In [12]:
test_df.drop(["text"], axis=1).to_csv("pred_sun_a.csv", index=False)

In [13]:
!rm pred_sun_a.csv.zip
!zip pred_sun_a.csv.zip pred_sun_a.csv

rm: pred_sun_a.csv.zip: No such file or directory
  adding: pred_sun_a.csv (deflated 91%)
