In [36]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [37]:
# The paths point at a Google Drive folder mounted in Colab;
# to run elsewhere, just change the two paths below.

train_path = "/content/drive/MyDrive/Tochka_TZ/train.parquet"
test_path  = "/content/drive/MyDrive/Tochka_TZ/test.parquet"

# Read both splits with the same loader, in (train, test) order.
train_df, test_df = (pd.read_parquet(path) for path in (train_path, test_path))

# Quick sanity check: frame shapes and the name of the index of each split.
print(train_df.shape, test_df.shape)
print("train index name:", train_df.index.name)
print("test index name:", test_df.index.name)
train_df.head(2)

(15000, 7) (5000, 1)
train index name: uuid
test index name: uuid


Unnamed: 0_level_0,text,integrity,integrity_reasoning,factuality,factuality_reasoning,truthfulness,truthfulness_reasoning
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dff0d182-5434-46e2-9183-be278f66667f,"Соцветия ромашки, которые продаются в аптеках,...",1.0,The content is an informative text about the u...,1.0,The content provides informative and fact-base...,1.0,The content provides credible information abou...
8268f315-03db-4f12-aa46-0b968c3b1b19,Кто из черниговцев сам будет убирать придомову...,1.0,The content is an informative text about a dec...,1.0,The content is a coherent and informative news...,1.0,The content provides a detailed and credible a...


In [46]:
# Class balance of every target, computed without the rows labelled 0.5

targets = ["integrity", "truthfulness", "factuality"]

for target in targets:
    labels = train_df[target]
    # Drop the 0.5 rows, then cast the remaining labels to integers.
    binary_labels = labels[labels != 0.5].astype(int)
    class_shares = binary_labels.value_counts(normalize=True)
    print(class_shares.sort_index())

integrity
0    0.237123
1    0.762877
Name: proportion, dtype: float64
truthfulness
0    0.099841
1    0.900159
Name: proportion, dtype: float64
factuality
0    0.297409
1    0.702591
Name: proportion, dtype: float64


In [62]:
# F1 of the trivial baseline: a model that always predicts 1

def f1_always_one(pos_rate):
    """Return the F1 score of a classifier that labels every sample as 1.

    Such a classifier has recall 1 and precision equal to ``pos_rate``
    (the share of positive samples), so
    F1 = 2 * precision * recall / (precision + recall)
       = 2 * pos_rate / (1 + pos_rate).
    """
    numerator = 2 * pos_rate
    denominator = 1 + pos_rate
    return numerator / denominator

for t in targets:
    labels = train_df[t].astype(float).to_numpy()
    # Share of 1s among the rows whose label is not 0.5.
    positive_share = labels[labels != 0.5].astype(int).mean()
    print(f"{t:12s} | F1(всегда 1) = {f1_always_one(positive_share)}")

integrity    | F1(всегда 1) = 0.8654907141192012
truthfulness | F1(всегда 1) = 0.9474565626962529
factuality   | F1(всегда 1) = 0.8253195374315276


In [65]:
# With imbalanced classes 0.5 is almost never the F1-optimal cut-off,
# so the threshold is tuned separately for every target on validation data.

def best_threshold(y_true, p, grid=None):
    """Pick the probability cut-off that maximises F1 on the given labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    p : array-like
        Predicted scores for the positive class, aligned with ``y_true``.
    grid : iterable of float, optional
        Candidate thresholds. Defaults to 0.05, 0.10, ..., 0.95.

    Returns
    -------
    (float, float)
        The best threshold and the F1 reached at it. On ties the first
        (lowest, for the default grid) threshold wins because the comparison
        is strict. An empty ``grid`` yields the initial ``(0.5, -1.0)``.
    """
    scores = np.asarray(p)
    if grid is None:
        # Rounding removes linspace float noise, so the result is a clean
        # 0.4 rather than 0.39999999999999997.
        grid = np.round(np.linspace(0.05, 0.95, 19), 2)

    best_th, best_f1 = 0.5, -1.0
    for th in grid:
        # zero_division=0 keeps the score at 0.0 (same value as the default)
        # but without a warning when a cut-off yields no positive predictions.
        f1 = f1_score(y_true, (scores >= th).astype(int), zero_division=0)
        if f1 > best_f1:
            best_f1, best_th = f1, th
    return float(best_th), float(best_f1)


def fit_target(X, y_float, random_state=42, test_size=0.15):
    """Fit a class-balanced logistic regression for one target and tune its threshold.

    Parameters
    ----------
    X : matrix supporting boolean row indexing
        Feature matrix, one row per sample.
    y_float : array-like of float
        Labels aligned with the rows of ``X``. Rows labelled 0.5 (and rows with
        a missing label) are excluded; the rest are cast to ``int``.
    random_state : int
        Seed for the train/validation split and for the classifier.
    test_size : float
        Share of the usable rows held out for threshold tuning.

    Returns
    -------
    (clf, th, f1)
        ``clf`` is refit on all usable rows, ``th`` is the threshold chosen on
        the hold-out, and ``f1`` is the hold-out F1 at that threshold, measured
        before the refit. Because ``th`` is tuned on the very same hold-out,
        ``f1`` is an optimistic estimate.
    """
    y_float = np.asarray(y_float, dtype=float)

    # Ignore 0.5 labels. NaN is dropped as well: it passes the "!= 0.5" test
    # but cannot be cast to a meaningful integer class.
    mask = (y_float != 0.5) & ~np.isnan(y_float)
    X2 = X[mask]
    y2 = y_float[mask].astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        X2, y2, test_size=test_size, random_state=random_state, stratify=y2
    )

    clf = LogisticRegression(
        max_iter=3000,
        solver="liblinear",
        class_weight="balanced",  # helps with class imbalance
        random_state=random_state,  # scikit-learn documents this as used by liblinear
    )
    clf.fit(X_train, y_train)

    p_val = clf.predict_proba(X_val)[:, 1]
    th, f1 = best_threshold(y_val, p_val)

    # Once the threshold is chosen, refit on every available 0/1 example.
    clf.fit(X2, y2)

    return clf, th, f1

In [66]:
# char_wb n-grams are chosen because they pick up templated web junk and
# repeated phrases, and are more robust to mixed languages and typos.
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=3, max_df=0.98)

X = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

# Per-target fitted model, tuned threshold and validation F1.
models, thresholds, val_f1s = {}, {}, {}

for t in targets:
    labels = train_df[t].astype(float).to_numpy()
    models[t], thresholds[t], val_f1s[t] = fit_target(X, labels, random_state=42)

    print(f"{t:12s} | val F1={val_f1s[t]:.4f} | threshold={thresholds[t]:.2f}")

mean_val_f1 = float(np.mean([val_f1s[t] for t in targets]))
print("Mean val F1:", round(mean_val_f1, 4))
print("Thresholds:", thresholds)

integrity     | val F1=0.8788 | threshold=0.30
truthfulness  | val F1=0.9599 | threshold=0.30
factuality    | val F1=0.8932 | threshold=0.40
Mean val F1: 0.9106
Thresholds: {'integrity': 0.3, 'truthfulness': 0.3, 'factuality': 0.39999999999999997}


In [67]:
# The uuid is stored in the index, so it is taken from test_df.index.
# Each target is binarised with its own tuned threshold.
predicted_labels = {
    t: (models[t].predict_proba(X_test)[:, 1] >= thresholds[t]).astype(int)
    for t in targets
}

submission = pd.DataFrame({"uuid": test_df.index.astype(str)}).assign(**predicted_labels)
submission = submission[["uuid", "integrity", "truthfulness", "factuality"]]
submission.to_csv("submission.csv", index=False)

submission.head()

Unnamed: 0,uuid,integrity,truthfulness,factuality
0,61f42863-9b53-4e78-a952-714da49f0c7a,1,1,1
1,f07bb53f-5639-46fe-9741-9465d516b8d4,1,1,1
2,f1d3d72b-3432-4604-8146-15a7d94b1598,1,1,1
3,935d8a6c-6d75-4321-a66d-f3061d267df7,0,0,0
4,c7f7f912-59f6-4db3-abc3-510f9318a3ff,1,1,1
