In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("dataset_news_cut_ready_v2.csv")

In [3]:
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [4]:
df.head(1)

Unnamed: 0,title,text,target,text_cnn,text_linear,title_cnn,title_linear
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0,donald trump just couldn t wish all americans ...,donald trump wish americans happy new year lea...,donald trump sends out embarrassing new year’...,donald trump sends embarrassing new year eve m...


In [5]:
df.dropna(how="any", inplace=True)

In [6]:
print(df.isna().sum())
print(df.isnull().sum())

title           0
text            0
target          0
text_cnn        0
text_linear     0
title_cnn       0
title_linear    0
dtype: int64
title           0
text            0
target          0
text_cnn        0
text_linear     0
title_cnn       0
title_linear    0
dtype: int64


In [7]:
print("Пустые строки по столбцам:")
for col in ['title_linear', 'text_linear', 'title_cnn', 'text_cnn']:
    empty_count = (df[col] == '').sum()
    print(f"{col}: {empty_count}")
print()

Пустые строки по столбцам:
title_linear: 0
text_linear: 0
title_cnn: 0
text_cnn: 0



In [8]:
#делим на выборки
X_temp, X_test, y_temp, y_test = train_test_split(
    df.drop('target', axis=1), df['target'], test_size=0.2, random_state=42, stratify=df['target']
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp # 0.25 * 0.8 = 0.2
)

In [9]:
print(f"Размеры выборок: Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Размеры выборок: Train: (26508, 6), Val: (8837, 6), Test: (8837, 6)


In [10]:
#Линейные модели
train_texts_linear = (X_train['title_linear'] + " " + X_train['text_linear']).tolist()
val_texts_linear = (X_val['title_linear'] + " " + X_val['text_linear']).tolist()
test_texts_linear = (X_test['title_linear'] + " " + X_test['text_linear']).tolist()

In [11]:
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,2), stop_words='english')
X_train_tfidf = vectorizer.fit_transform(train_texts_linear)
X_val_tfidf = vectorizer.transform(val_texts_linear)
X_test_tfidf = vectorizer.transform(test_texts_linear)

In [12]:
# Обучение LogisticRegression
lr_model = LogisticRegression(C=1.0, random_state=42, n_jobs=-1, max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
lr_val_pred = lr_model.predict_proba(X_val_tfidf)[:, 1] # Берем вероятность класса 1

In [13]:
# Обучение LinearSVM
svm_model = LinearSVC(C=1.0, random_state=42, max_iter=2000)
svm_model.fit(X_train_tfidf, y_train)
# SVM из sklearn не имеет predict_proba по умолчанию, используем decision_function и нормализуем
svm_val_dec = svm_model.decision_function(X_val_tfidf)
svm_val_pred = (svm_val_dec - svm_val_dec.min()) / (svm_val_dec.max() - svm_val_dec.min())

In [14]:
# Параметры для текстов
max_words = 50000
max_len_title = 30
max_len_text = 200

# Подготовка данных для CNN
train_title_cnn = X_train['title_cnn'].astype(str).tolist()
train_text_cnn = X_train['text_cnn'].astype(str).tolist()
val_title_cnn = X_val['title_cnn'].astype(str).tolist()
val_text_cnn = X_val['text_cnn'].astype(str).tolist()
test_title_cnn = X_test['title_cnn'].astype(str).tolist()
test_text_cnn = X_test['text_cnn'].astype(str).tolist()

# Токенизация
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
# Обучаем на объединенных данных заголовков и текстов для единого словаря
all_texts_cnn = train_title_cnn + train_text_cnn
tokenizer.fit_on_texts(all_texts_cnn)

# Преобразование в последовательности и паддинг
def preprocess_sequences(texts, maxlen):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

X_train_title_seq = preprocess_sequences(train_title_cnn, max_len_title)
X_train_text_seq = preprocess_sequences(train_text_cnn, max_len_text)
X_val_title_seq = preprocess_sequences(val_title_cnn, max_len_title)
X_val_text_seq = preprocess_sequences(val_text_cnn, max_len_text)
X_test_title_seq = preprocess_sequences(test_title_cnn, max_len_title)
X_test_text_seq = preprocess_sequences(test_text_cnn, max_len_text)

# Создание модели CNN с двумя входами
def create_cnn_model():
    # Входы
    input_title = Input(shape=(max_len_title,), name='title_input')
    input_text = Input(shape=(max_len_text,), name='text_input')

    embedding_layer = Embedding(input_dim=max_words, output_dim=128)

    title_embedded = embedding_layer(input_title)
    text_embedded = embedding_layer(input_text)

    title_conv = Conv1D(64, 3, activation='relu')(title_embedded)
    title_pool = GlobalMaxPooling1D()(title_conv)

    text_conv = Conv1D(64, 5, activation='relu')(text_embedded)
    text_pool = GlobalMaxPooling1D()(text_conv)

    concatenated = Concatenate()([title_pool, text_pool])

    dropout1 = Dropout(0.3)(concatenated)
    dense = Dense(64, activation='relu')(dropout1)
    dropout2 = Dropout(0.5)(dense)
    output = Dense(1, activation='sigmoid')(dropout2)

    model = Model(inputs=[input_title, input_text], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

cnn_model = create_cnn_model()
# cnn_model.summary() # Раскомментируйте, чтобы посмотреть архитектуру

# обучение CNN
history = cnn_model.fit(
    [X_train_title_seq, X_train_text_seq],
    y_train,
    batch_size=128,
    epochs=5, # Количество эпох можно увеличить
    validation_data=([X_val_title_seq, X_val_text_seq], y_val),
    verbose=1
)

# получаем предсказания CNN на валидации
cnn_val_pred = cnn_model.predict([X_val_title_seq, X_val_text_seq]).flatten()

Epoch 1/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 34ms/step - accuracy: 0.7900 - loss: 0.4116 - val_accuracy: 0.9890 - val_loss: 0.0338
Epoch 2/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9934 - loss: 0.0216 - val_accuracy: 0.9930 - val_loss: 0.0223
Epoch 3/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.9991 - loss: 0.0044 - val_accuracy: 0.9929 - val_loss: 0.0261
Epoch 4/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9996 - loss: 0.0020 - val_accuracy: 0.9926 - val_loss: 0.0256
Epoch 5/5
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9997 - loss: 0.0014 - val_accuracy: 0.9932 - val_loss: 0.0266
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [15]:
# Собираем предсказания всех моделей на валидационной выборке в одну матрицу
val_predictions = np.column_stack([lr_val_pred, svm_val_pred, cnn_val_pred])

# Функция для взвешенного голосования, которую будем оптимизировать
def weighted_accuracy(weights):
    # Нормализуем веса, чтобы их сумма была равна 1
    final_weights = weights / np.sum(weights)
    # Взвешенное среднее предсказаний
    weighted_ensemble_pred = np.average(val_predictions, axis=1, weights=final_weights)
    # Преобразуем вероятности в бинарные предсказания (0 или 1)
    binary_predictions = (weighted_ensemble_pred > 0.5).astype(int)
    # Целевая метрика: 1 - accuracy (чтобы минимизировать ошибку)
    return 1 - accuracy_score(y_val, binary_predictions)

# Начальные веса (равномерное распределение)
initial_weights = np.array([1.0, 1.0, 1.0])

# Ограничения: веса должны быть неотрицательными
bounds = [(0, None)] * 3

# Оптимизация весов
result = minimize(weighted_accuracy, initial_weights, method='SLSQP', bounds=bounds)

# подбор весов
optimal_weights = result['x'] / np.sum(result['x'])
print(f"\nОптимальные веса для ансамбля")
print(f"LogisticRegression: {optimal_weights[0]:.4f}")
print(f"LinearSVM: {optimal_weights[1]:.4f}")
print(f"CNN: {optimal_weights[2]:.4f}")

# Получаем предсказания всех моделей на тестовой выборке
lr_test_pred = lr_model.predict_proba(X_test_tfidf)[:, 1]
svm_test_dec = svm_model.decision_function(X_test_tfidf)
svm_test_pred = (svm_test_dec - svm_test_dec.min()) / (svm_test_dec.max() - svm_test_dec.min())
cnn_test_pred = cnn_model.predict([X_test_title_seq, X_test_text_seq]).flatten()

# Собираем тестовые предсказания
test_predictions = np.column_stack([lr_test_pred, svm_test_pred, cnn_test_pred])

# оптимальные веса для финального предсказания
final_ensemble_pred = np.average(test_predictions, axis=1, weights=optimal_weights)
final_ensemble_binary = (final_ensemble_pred > 0.5).astype(int)

# оценка качества
test_accuracy = accuracy_score(y_test, final_ensemble_binary)
test_f1 = f1_score(y_test, final_ensemble_binary)

print(f"\nРезультаты на тестовой выборке")
print(f"Точность ансамбля (Accuracy): {test_accuracy:.4f}")
print(f"F1-Score ансамбля: {test_f1:.4f}")

# сравнение с индивидуальными моделями
print(f"\nСравнение с индивидуальными моделями (Accuracy)")
print(f"LogisticRegression: {accuracy_score(y_test, (lr_test_pred > 0.5).astype(int)):.4f}")
print(f"LinearSVM: {accuracy_score(y_test, (svm_test_pred > 0.5).astype(int)):.4f}")
print(f"CNN: {accuracy_score(y_test, (cnn_test_pred > 0.5).astype(int)):.4f}")


Оптимальные веса для ансамбля
LogisticRegression: 0.3333
LinearSVM: 0.3333
CNN: 0.3333
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Результаты на тестовой выборке
Точность ансамбля (Accuracy): 0.9964
F1-Score ансамбля: 0.9963

Сравнение с индивидуальными моделями (Accuracy)
LogisticRegression: 0.9782
LinearSVM: 0.9745
CNN: 0.9932
