In [1]:
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
import seaborn as sns
from numpy.typing import NDArray
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,title,text,label,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,Regional African body says ready to work close...,JOHANNESBURG (Reuters) - The Southern African ...,1,,,,,,,...,,,,,,,,,,
1,2,New Report Says Trump Destroyed Scaramucci’s ...,Donald Trump s new pick for Communications Dir...,0,,,,,,,...,,,,,,,,,,
2,3,Hamas chief in Gaza says Palestinian unity dea...,GAZA (Reuters) - Palestinian Islamist group Ha...,1,,,,,,,...,,,,,,,,,,
3,4,Police fire tear gas at Congo opposition leade...,KINSHASA (Reuters) - Police fired tear gas to ...,1,,,,,,,...,,,,,,,,,,
4,5,PRESIDENT TRUMP Hits Back At Activist Judge On...,Judge Orrick in California ruled against Presi...,0,,,,,,,...,,,,,,,,,,


In [3]:
test_df = pd.read_csv('train.csv')
test_df.head()

Unnamed: 0,id,title,text,label,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,Regional African body says ready to work close...,JOHANNESBURG (Reuters) - The Southern African ...,1,,,,,,,...,,,,,,,,,,
1,2,New Report Says Trump Destroyed Scaramucci’s ...,Donald Trump s new pick for Communications Dir...,0,,,,,,,...,,,,,,,,,,
2,3,Hamas chief in Gaza says Palestinian unity dea...,GAZA (Reuters) - Palestinian Islamist group Ha...,1,,,,,,,...,,,,,,,,,,
3,4,Police fire tear gas at Congo opposition leade...,KINSHASA (Reuters) - Police fired tear gas to ...,1,,,,,,,...,,,,,,,,,,
4,5,PRESIDENT TRUMP Hits Back At Activist Judge On...,Judge Orrick in California ruled against Presi...,0,,,,,,,...,,,,,,,,,,


In [4]:
cat_df = train_df.select_dtypes(include=['object'])
num_df = train_df.select_dtypes(exclude=['object'])

In [5]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24360 entries, 0 to 24359
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           24360 non-null  int64  
 1   title        24360 non-null  object 
 2   text         24356 non-null  object 
 3   label        24348 non-null  object 
 4   Unnamed: 4   1 non-null      object 
 5   Unnamed: 5   1 non-null      object 
 6   Unnamed: 6   1 non-null      object 
 7   Unnamed: 7   1 non-null      object 
 8   Unnamed: 8   1 non-null      object 
 9   Unnamed: 9   1 non-null      object 
 10  Unnamed: 10  1 non-null      object 
 11  Unnamed: 11  1 non-null      object 
 12  Unnamed: 12  1 non-null      object 
 13  Unnamed: 13  1 non-null      object 
 14  Unnamed: 14  1 non-null      object 
 15  Unnamed: 15  1 non-null      object 
 16  Unnamed: 16  1 non-null      object 
 17  Unnamed: 17  1 non-null      object 
 18  Unnamed: 18  1 non-null      object 
 19  Unna

In [6]:
nan_df = (train_df.isnull().mean() * 100).reset_index()
nan_df.columns = ['column_name', 'percentage']
nan_df.sort_values('percentage', ascending=False, inplace=True)
nan_df.head(20)

Unnamed: 0,column_name,percentage
4,Unnamed: 4,99.995895
5,Unnamed: 5,99.995895
9,Unnamed: 9,99.995895
8,Unnamed: 8,99.995895
7,Unnamed: 7,99.995895
6,Unnamed: 6,99.995895
14,Unnamed: 14,99.995895
15,Unnamed: 15,99.995895
16,Unnamed: 16,99.995895
17,Unnamed: 17,99.995895


In [7]:
columns_to_drop = nan_df[nan_df['percentage'] > 90]['column_name']
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

In [8]:
cat_df = train_df.select_dtypes(include=['object'])
num_df = train_df.select_dtypes(exclude=['object'])

In [9]:
cat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24360 entries, 0 to 24359
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   24360 non-null  object
 1   text    24356 non-null  object
 2   label   24348 non-null  object
dtypes: object(3)
memory usage: 571.1+ KB


In [10]:
mode_value = train_df['text'].mode()[0]
train_df['text'] = train_df['text'].fillna(mode_value)
test_df['text'] = test_df['text'].fillna(mode_value)

In [11]:
train_df['label'] = train_df['label'].fillna(0)
test_df['label'] = test_df['label'].fillna(0)

In [12]:
num_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24360 entries, 0 to 24359
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      24360 non-null  int64
dtypes: int64(1)
memory usage: 190.4 KB


In [13]:
print("Пропуски в train_df:", train_df.isnull().sum())
print("Пропуски в test_df:", test_df.isnull().sum())

Пропуски в train_df: id       0
title    0
text     0
label    0
dtype: int64
Пропуски в test_df: id       0
title    0
text     0
label    0
dtype: int64


In [14]:
train_df.head()

Unnamed: 0,id,title,text,label
0,1,Regional African body says ready to work close...,JOHANNESBURG (Reuters) - The Southern African ...,1
1,2,New Report Says Trump Destroyed Scaramucci’s ...,Donald Trump s new pick for Communications Dir...,0
2,3,Hamas chief in Gaza says Palestinian unity dea...,GAZA (Reuters) - Palestinian Islamist group Ha...,1
3,4,Police fire tear gas at Congo opposition leade...,KINSHASA (Reuters) - Police fired tear gas to ...,1
4,5,PRESIDENT TRUMP Hits Back At Activist Judge On...,Judge Orrick in California ruled against Presi...,0


In [59]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import time
import joblib

# Загрузка данных NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Инициализация
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
fake_keywords = ['shock', 'urgent', 'breaking', 'secret', 'hoax', 'fake', 'scandal', 'conspiracy']

# Функция предобработки текста
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Удаление URL
    text = re.sub(r'[^a-z\s]', '', text)  # Удаление пунктуации и цифр
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# Функция очистки меток
def clean_label(label):
    if pd.isna(label):
        return 0
    try:
        return int(float(label))
    except (ValueError, TypeError):
        label = str(label).strip().lower()
        return 1 if label in ['1', 'true', 'real', 'yes'] else 0

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Предобработка данных
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('').apply(preprocess_text)
    df['title'] = df['title'].fillna('').apply(preprocess_text)
    df['combined'] = df['title'] + ' ' + df['text']
    df['text_length'] = df['text'].apply(len)
    df['title_length'] = df['title'].apply(len)
    df['word_count'] = df['combined'].apply(lambda x: len(x.split()))
    df['fake_keyword_count'] = df['combined'].apply(lambda x: sum(x.lower().count(kw) for kw in fake_keywords))

train_df['label'] = train_df['label'].apply(clean_label).astype('int8')

# Разделение данных
X = train_df[['combined', 'text_length', 'title_length', 'word_count', 'fake_keyword_count']]
y = train_df['label']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Определение препроцессора
preprocessor = ColumnTransformer(
    transformers=[
        ('text_tfidf', TfidfVectorizer(max_features=15000, ngram_range=(1, 3), stop_words='english'), 'combined'),
        ('num', 'passthrough', ['text_length', 'title_length', 'word_count', 'fake_keyword_count'])
    ])

# Создание пайплайна
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss',
        scale_pos_weight=sum(y_train == 0) / sum(y_train == 1)  # Балансировка классов
    ))
])

# Обучение модели
print("Обучение модели...")
start_time = time.time()
pipeline.fit(X_train, y_train)
print(f"Время обучения: {time.time() - start_time:.2f} сек")

# Оценка на валидационной выборке
valid_pred = pipeline.predict(X_valid)
print("\nClassification Report:")
print(classification_report(y_valid, valid_pred))
print(f"F1-score (Validation): {f1_score(y_valid, valid_pred, average='weighted'):.4f}")

# Анализ ошибок
errors = X_valid[valid_pred != y_valid].copy()
errors['true_label'] = y_valid[valid_pred != y_valid]
errors['predicted_label'] = valid_pred[valid_pred != y_valid]
print("\nПримеры ошибок (первые 5):")
print(errors[['combined', 'true_label', 'predicted_label']].head())
print("\nСтатистика ошибок:")
print(f"Всего ошибок: {len(errors)}")
print(f"Ошибки для класса 0 (фейк): {len(errors[errors['true_label'] == 0])}")
print(f"Ошибки для класса 1 (реальная): {len(errors[errors['true_label'] == 1])}")

# Предсказание на тестовых данных
test_pred = pipeline.predict(test_df[['combined', 'text_length', 'title_length', 'word_count', 'fake_keyword_count']])
results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
#results.to_csv('improved_predictions.csv', index=False)
print("\nПредсказания сохранены в improved_predictions.csv")


KeyboardInterrupt: 

In [32]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
import nltk
from time import time

# Загрузка данных NLTK
nltk.download('punkt', quiet=True)

# Функция предобработки текста
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Удаление пунктуации
    return text

# Загрузка данных с правильными типами
try:
    train_df = pd.read_csv('train.csv', dtype={'id': 'int32', 'title': 'string', 'text': 'string'})
    test_df = pd.read_csv('test.csv', dtype={'id': 'int32', 'title': 'string', 'text': 'string'})
except UnicodeDecodeError:
    train_df = pd.read_csv('train.csv', dtype={'id': 'int32', 'title': 'string', 'text': 'string'}, encoding='latin1')
    test_df = pd.read_csv('test.csv', dtype={'id': 'int32', 'title': 'string', 'text': 'string'}, encoding='latin1')

# Предобработка меток
def clean_label(label):
    if pd.isna(label):
        return 0
    label = str(label).strip().lower()
    if label in ['1', 'true', 'real', 'yes']:
        return 1
    elif label in ['0', 'false', 'fake', 'no']:
        return 0
    return int(float(label)) if label.replace('.', '').isdigit() else 0

train_df['label'] = train_df['label'].apply(clean_label).astype('int8')

# Быстрая предобработка текста
train_df['combined'] = (train_df['title'].fillna('') + ' ' + train_df['text'].fillna('')).apply(preprocess_text)
test_df['combined'] = (test_df['title'].fillna('') + ' ' + test_df['text'].fillna('')).apply(preprocess_text)

# Разделение данных
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['combined'],
    train_df['label'],
    test_size=0.2,
    random_state=42
)

# Оптимизированный пайплайн
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        stop_words='english'
    )),
    ('model', LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ))
])

# Обучение модели
start_time = time()
pipeline.fit(X_train, y_train)
print(f"Время обучения: {time() - start_time:.2f} сек")

# Оценка
valid_pred = pipeline.predict(X_valid)
print("\nClassification Report:")
print(classification_report(y_valid, valid_pred))
print(f"F1-score: {f1_score(y_valid, valid_pred, average='weighted'):.4f}")

# Предсказание
test_pred = pipeline.predict(test_df['combined'])
results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
#results.to_csv('optimized_predictions.csv', index=False)
print("\nПредсказания сохранены в optimized_predictions.csv")

Время обучения: 32.93 сек





Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2249
           1       0.99      0.98      0.99      2623

    accuracy                           0.99      4872
   macro avg       0.99      0.99      0.99      4872
weighted avg       0.99      0.99      0.99      4872

F1-score: 0.9854

Предсказания сохранены в optimized_predictions.csv




In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report
import nltk
from sklearn.pipeline import Pipeline

# Загрузка данных NLTK (однократно)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Функция предобработки текста
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Удаление пунктуации
    return text

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Предобработка данных
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('').apply(preprocess_text)
    df['title'] = df['title'].fillna('').apply(preprocess_text)
    df['combined'] = df['title'] + ' ' + df['text']

# Преобразование меток
train_df['label'] = train_df['label'].apply(
    lambda x: 1 if str(x).strip().lower() in ['1', 'true', 'real', 'yes'] else 0
)

# Создание пайплайна
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 2),
        stop_words='english'
    )),
    ('model', LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        n_jobs=-1,
        verbose=-1  # Убираем вывод логов
    ))
])

# Разделение данных
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['combined'],
    train_df['label'],
    test_size=0.2,
    random_state=42
)

# Обучение
pipeline.fit(X_train, y_train)

# Оценка
valid_pred = pipeline.predict(X_valid)
print("Classification Report:")
print(classification_report(y_valid, valid_pred))
print(f"\nF1-score: {f1_score(y_valid, valid_pred, average='weighted'):.4f}")

# Предсказание
test_pred = pipeline.predict(test_df['combined'])
results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
#results.to_csv('final_predictions.csv', index=False)
print("\nПредсказания сохранены в final_predictions.csv")



Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2249
           1       0.99      0.98      0.99      2623

    accuracy                           0.99      4872
   macro avg       0.99      0.99      0.99      4872
weighted avg       0.99      0.99      0.99      4872


F1-score: 0.9860

Предсказания сохранены в final_predictions.csv




In [60]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import nltk
import os

# Решение проблемы с NLTK данными
def setup_nltk():
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
    
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet', quiet=True)
    
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords', quiet=True)

# Вызываем функцию настройки NLTK
setup_nltk()

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Инициализация инструментов NLTK с обработкой ошибок
try:
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
except:
    # Если что-то пошло не так, используем упрощенную обработку
    lemmatizer = None
    stop_words = set()

# Улучшенная функция предобработки текста с защитой от ошибок
def enhanced_preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Удаление пунктуации
    text = re.sub(r'\d+', '', text)      # Удаление чисел
    
    if lemmatizer is not None:
        try:
            tokens = word_tokenize(text)
            tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
            return ' '.join(tokens)
        except:
            pass
    
    # Возвращаем упрощенную версию, если возникли ошибки
    return ' '.join([word for word in text.split() if word not in stop_words])

# Функция добавления новых признаков
def add_features(df):
    df['text_len'] = df['text'].apply(len)
    df['title_len'] = df['title'].apply(len)
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['title_word_count'] = df['title'].apply(lambda x: len(x.split()))
    df['word_density'] = df['word_count'] / (df['text_len'] + 1e-6)  # Добавляем малое число для избежания деления на 0
    return df

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Предобработка данных
print("Начало предобработки текста...")
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('').apply(enhanced_preprocess)
    df['title'] = df['title'].fillna('').apply(enhanced_preprocess)
    df['combined'] = df['title'] + ' ' + df['text']
print("Предобработка текста завершена.")

# Добавление новых признаков
print("Добавление новых признаков...")
train_df = add_features(train_df)
test_df = add_features(test_df)

# Преобразование меток
train_df['label'] = train_df['label'].apply(
    lambda x: 1 if str(x).strip().lower() in ['1', 'true', 'real', 'yes'] else 0
)

# Создание пайплайна
print("Создание пайплайна...")
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(
            max_features=30000,
            ngram_range=(1, 3),
            min_df=3,
            max_df=0.9,
            sublinear_tf=True,
            analyzer='word'
        ), 'combined'),
        ('num', StandardScaler(), ['text_len', 'title_len', 'word_count', 'title_word_count', 'word_density'])
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMClassifier(
        n_estimators=500,
        learning_rate=0.03,
        max_depth=9,
        num_leaves=31,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ))
])

# Разделение данных
print("Разделение данных...")
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df,
    train_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=train_df['label']
)

# Обучение модели
print("Начало обучения модели...")
pipeline.fit(X_train, y_train)
print("Обучение завершено.")

# Оптимизация порога классификации
print("Оптимизация порога классификации...")
valid_probs = pipeline.predict_proba(X_valid)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_valid, valid_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
best_threshold = thresholds[np.argmax(f1_scores)]

# Оценка модели
valid_pred = (valid_probs >= best_threshold).astype(int)
print("\nClassification Report:")
print(classification_report(y_valid, valid_pred))
print(f"\nBest F1-score: {f1_score(y_valid, valid_pred, average='weighted'):.4f}")

# Кросс-валидация
print("\nКросс-валидация...")
cv_scores = cross_val_score(pipeline, train_df, train_df['label'], 
                          cv=3, scoring='f1_weighted', n_jobs=-1)
print(f"Cross-validation F1: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Предсказание на тестовых данных
print("\nПредсказание на тестовых данных...")
test_probs = pipeline.predict_proba(test_df)[:, 1]
test_pred = (test_probs >= best_threshold).astype(int)

# Сохранение результатов
results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
#results.to_csv('final_predictions.csv', index=False)
print("\nФинальные предсказания сохранены в final_predictions.csv")

Начало предобработки текста...
Предобработка текста завершена.
Добавление новых признаков...
Создание пайплайна...
Разделение данных...
Начало обучения модели...
Обучение завершено.
Оптимизация порога классификации...

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2223
           1       0.99      0.98      0.99      2649

    accuracy                           0.98      4872
   macro avg       0.98      0.98      0.98      4872
weighted avg       0.98      0.98      0.98      4872


Best F1-score: 0.9838

Кросс-валидация...
Cross-validation F1: 0.9841 (±0.0023)

Предсказание на тестовых данных...

Финальные предсказания сохранены в final_predictions.csv


In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import time

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Функция очистки текста
def clean_text(text):
    return str(text).lower().strip()[:2000]  # Ограничение длины текста

# Функция очистки меток
def clean_label(label):
    try:
        label = str(label).strip()
        if label.lower() in ['0', 'false', 'fake', 'no']:
            return 0
        elif label.lower() in ['1', 'true', 'real', 'yes']:
            return 1
        return int(float(label))
    except:
        return 0  # По умолчанию считаем фейковой новостью

# Предобработка данных
train_df['text'] = train_df['text'].fillna('').apply(clean_text)
train_df['title'] = train_df['title'].fillna('').apply(clean_text)
train_df['label'] = train_df['label'].apply(clean_label)
test_df['text'] = test_df['text'].fillna('').apply(clean_text)
test_df['title'] = test_df['title'].fillna('').apply(clean_text)

# Объединение текстовых полей
train_df['combined'] = train_df['title'] + " " + train_df['text']
test_df['combined'] = test_df['title'] + " " + test_df['text']

# Векторизация текста
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df['combined'])
X_test = vectorizer.transform(test_df['combined'])
y_train = train_df['label'].values

# Разделение на тренировочную и валидационную выборки
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

# Модель XGBoost с правильными параметрами
model = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=None  # Убрали проблемный параметр
)

# Обучение с ручной проверкой на валидации
print("Начало обучения XGBoost...")
start_time = time.time()
model.fit(X_train, y_train)
print(f"\nОбучение завершено за {(time.time() - start_time)/60:.1f} минут")

# Оценка на валидации
valid_pred = model.predict(X_valid)
print("\nF1-score на валидации:", f1_score(y_valid, valid_pred, average='weighted'))

model.fit(X_train, y_train)

# Предсказание
test_pred = model.predict(X_test)
results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
results.to_csv('predictions3.csv', index=False)
print("\nПредсказания сохранены в final_predictions.csv")

Начало обучения XGBoost...

Обучение завершено за 0.7 минут

F1-score на валидации: 0.9839979829970033

Предсказания сохранены в final_predictions.csv


In [61]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, classification_report
import nltk
from sklearn.pipeline import Pipeline

# Загрузка данных NLTK
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Функция предобработки текста
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Удаление пунктуации
    return text

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Предобработка данных
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('').apply(preprocess_text)
    df['title'] = df['title'].fillna('').apply(preprocess_text)
    df['combined'] = df['title'] + ' ' + df['text']

# Преобразование меток
train_df['label'] = train_df['label'].apply(
    lambda x: 1 if str(x).strip().lower() in ['1', 'true', 'real', 'yes'] else 0
)

# Разделение данных
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['combined'],
    train_df['label'],
    test_size=0.2,
    random_state=42
)

# Инициализация моделей
models = {
    'LightGBM': LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),
    'XGBoost': XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss',
        use_label_encoder=False
    ),
    'CatBoost': CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=7,
        random_state=42,
        verbose=0
    )
}

# Создание ансамбля
ensemble = VotingClassifier(
    estimators=[
        ('LightGBM', models['LightGBM']),
        ('XGBoost', models['XGBoost']),
        ('CatBoost', models['CatBoost'])
    ],
    voting='soft',  # Используем soft voting для вероятностей
    n_jobs=-1
)

# Создание пайплайна с ансамблем
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 2),
        stop_words='english'
    )),
    ('ensemble', ensemble)
])

# Обучение
print("Обучение ансамбля...")
pipeline.fit(X_train, y_train)

# Оценка
valid_pred = pipeline.predict(X_valid)
print("\nClassification Report:")
print(classification_report(y_valid, valid_pred))
print(f"\nF1-score: {f1_score(y_valid, valid_pred, average='weighted'):.4f}")

# Предсказание
test_pred = pipeline.predict(test_df['combined'])
results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
# results.to_csv('ensemble_predictions.csv', index=False)
print("\nПредсказания сохранены в ensemble_predictions.csv")

Обучение ансамбля...

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2249
           1       0.99      0.98      0.99      2623

    accuracy                           0.99      4872
   macro avg       0.99      0.99      0.99      4872
weighted avg       0.99      0.99      0.99      4872


F1-score: 0.9856

Предсказания сохранены в ensemble_predictions.csv


In [72]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import time

# Загрузка данных NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Инициализация
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
fake_keywords = ['shock', 'urgent', 'breaking', 'secret', 'hoax', 'fake', 'scandal', 'conspiracy']
political_keywords = ['trump', 'biden', 'election', 'poll', 'democrat', 'republican', 'congress', 'senate']

# Функция предобработки текста
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-z\s!]', '', text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# Функция для подсчёта доли заглавных букв
def uppercase_ratio(text):
    if not text or len(text) == 0:
        return 0.0
    letters = [c for c in text if c.isalpha()]
    if not letters:
        return 0.0
    return sum(1 for c in letters if c.isupper()) / len(letters)

# Функция для подсчёта восклицательных знаков
def exclamation_count(text):
    return text.count('!')

# Функция для доли уникальных слов
def unique_word_ratio(text):
    if not text:
        return 0.0
    words = text.split()
    if not words:
        return 0.0
    return len(set(words)) / len(words)

# Функция для оценки тональности
def sentiment_score(text):
    return TextBlob(text).sentiment.polarity

# Функция для подсчёта политических ключевых слов
def political_keyword_count(text):
    return sum(text.lower().count(kw) for kw in political_keywords)

# Функция очистки меток
def clean_label(label):
    if pd.isna(label):
        return 0
    label = str(label).strip().lower()
    return 1 if label in ['1', 'true', 'real', 'yes'] else 0

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Предобработка данных
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('')
    df['title'] = df['title'].fillna('')
    df['text_clean'] = df['text'].apply(preprocess_text)
    df['title_clean'] = df['title'].apply(preprocess_text)
    df['combined'] = df['title_clean'] + ' ' + df['text_clean']
    df['text_length'] = df['text_clean'].apply(len)
    df['title_length'] = df['title_clean'].apply(len)
    df['word_count'] = df['combined'].apply(lambda x: len(x.split()))
    df['title_word_count'] = df['title_clean'].apply(lambda x: len(x.split()))
    df['fake_keyword_count'] = df['combined'].apply(lambda x: sum(x.lower().count(kw) for kw in fake_keywords))
    df['political_keyword_count'] = df['combined'].apply(political_keyword_count)
    df['title_uppercase_ratio'] = df['title'].apply(uppercase_ratio)
    df['exclamation_count'] = df['title'].apply(exclamation_count)
    df['unique_word_ratio'] = df['combined'].apply(unique_word_ratio)
    df['title_sentiment'] = df['title'].apply(sentiment_score)

train_df['label'] = train_df['label'].apply(clean_label).astype('int8')

# Разделение данных
X = train_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']]
y = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Определение препроцессора
preprocessor = ColumnTransformer(
    transformers=[
        ('text_tfidf', TfidfVectorizer(max_features=25000, ngram_range=(1, 3), stop_words='english', min_df=2), 'combined'),
        ('num', 'passthrough', ['text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment'])
    ])

# Инициализация моделей
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
class_weights = {0: 1.0, 1: scale_pos_weight}

models = {
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=3,
            reg_lambda=25.0,
            reg_alpha=25.0,
            colsample_bytree=0.6,
            random_state=42,
            verbose=-1,
            class_weight=class_weights
        ))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=3,
            colsample_bytree=0.6,
            reg_lambda=25.0,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        ))
    ]),
    'GradientBoosting': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        ))
    ])
}

# 1. Оценка индивидуальных моделей
best_f1 = 0.0
best_model = None
best_model_name = None

for name, model in models.items():
    print(f"\n=== Обучение {name} ===")
    start_time = time.time()
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    f1_train = f1_score(y_train, train_pred, average='weighted')
    f1_val = f1_score(y_val, val_pred, average='weighted')
    print(f"F1 на train: {f1_train:.5f}")
    print(f"F1 на валидации: {f1_val:.5f}")
    print(f"Время обучения: {time.time() - start_time:.2f} сек")
    if f1_val > best_f1:
        best_f1 = f1_val
        best_model = model
        best_model_name = name
        print(f"Новая лучшая модель: {name}")

# 2. Блендинг
print("\n=== Обучение блендинга ===")
val_preds = np.zeros((X_val.shape[0], len(models)))
train_preds = np.zeros((X_train.shape[0], len(models)))
for i, (name, model) in enumerate(models.items()):
    model.fit(X_train, y_train)
    val_preds[:, i] = model.predict_proba(X_val)[:, 1]
    train_preds[:, i] = model.predict_proba(X_train)[:, 1]

param_grid_meta = {
    'C': [1.0, 10.0, 100.0],
    'class_weight': [None, 'balanced'],
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l2']
}
meta_model = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid_meta, cv=5, scoring='f1_weighted', n_jobs=-1)
meta_model.fit(train_preds, y_train)
print(f"Лучшие параметры мета-модели: {meta_model.best_params_}")
blending_pred = meta_model.predict(val_preds)
blending_f1 = f1_score(y_val, blending_pred, average='weighted')
print(f"Блендинг F1 на валидации: {blending_f1:.5f}")
if blending_f1 > best_f1:
    best_f1 = blending_f1
    best_model = (models, meta_model.best_estimator_)
    best_model_name = "Blending"

# 3. Анализ ошибок
if best_model_name == "Blending":
    errors = X_val[blending_pred != y_val].copy()
    errors['true_label'] = y_val[blending_pred != y_val]
    errors['predicted_label'] = blending_pred[blending_pred != y_val]
else:
    val_pred = best_model.predict(X_val)
    errors = X_val[val_pred != y_val].copy()
    errors['true_label'] = y_val[val_pred != y_val]
    errors['predicted_label'] = val_pred[val_pred != y_val]

print("\nПримеры ошибок (первые 5):")
print(errors[['combined', 'true_label', 'predicted_label']].head())
print("\nСтатистика ошибок:")
print(f"Всего ошибок: {len(errors)}")
print(f"Ошибки для класса 0 (фейк): {len(errors[errors['true_label'] == 0])}")
print(f"Ошибки для класса 1 (реальная): {len(errors[errors['true_label'] == 1])}")

# 4. Кросс-валидация лучшей модели
print("\n=== Кросс-валидация лучшей модели ===")
if best_model_name != "Blending":
    cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='f1_weighted')
    print(f"Cross-Validation F1: {cv_scores.mean():.5f} ± {cv_scores.std():.5f}")
else:
    print("Кросс-валидация для блендинга не проводится.")

# 5. Предсказание на тестовых данных
print("\n=== Предсказание на тестовых данных ===")
if best_model_name == "Blending":
    test_preds = np.zeros((test_df.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        model.fit(X_train, y_train)
        test_preds[:, i] = model.predict_proba(test_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']])[:, 1]
    test_pred = meta_model.predict(test_preds)
else:
    test_pred = best_model.predict(test_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']])

results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
#results.to_csv('final_predictions_v2.csv', index=False)
print("\nПредсказания сохранены в final_predictions_v2.csv")


=== Обучение LightGBM ===
F1 на train: 0.98216
F1 на валидации: 0.98134
Время обучения: 35.71 сек
Новая лучшая модель: LightGBM

=== Обучение XGBoost ===
F1 на train: 0.99063
F1 на валидации: 0.98513
Время обучения: 43.98 сек
Новая лучшая модель: XGBoost

=== Обучение GradientBoosting ===
F1 на train: 0.99521
F1 на валидации: 0.98533
Время обучения: 182.37 сек
Новая лучшая модель: GradientBoosting

=== Обучение блендинга ===
Лучшие параметры мета-модели: {'C': 100.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Блендинг F1 на валидации: 0.98646

Примеры ошибок (первые 5):
                                                combined  true_label  \
8139   pin drop speech father daughter kidnapped kill...           1   
13321  dont believe myth weightlifting slow home mont...           1   
10014  thing learned general contractor donald trump ...           1   
9296   saoule tout monde avec son analyse politique d...           0   
9293   independent tilt decisively trump ne

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import time

# Загрузка данных NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Инициализация
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
fake_keywords = ['shock', 'urgent', 'breaking', 'secret', 'hoax', 'fake', 'scandal', 'conspiracy']
political_keywords = ['trump', 'biden', 'election', 'poll', 'democrat', 'republican', 'congress', 'senate']

# Функция предобработки текста
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-z\s!]', '', text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# Функция для подсчёта доли заглавных букв
def uppercase_ratio(text):
    if not text or len(text) == 0:
        return 0.0
    letters = [c for c in text if c.isalpha()]
    if not letters:
        return 0.0
    return sum(1 for c in letters if c.isupper()) / len(letters)

# Функция для подсчёта восклицательных знаков
def exclamation_count(text):
    return text.count('!')

# Функция для доли уникальных слов
def unique_word_ratio(text):
    if not text:
        return 0.0
    words = text.split()
    if not words:
        return 0.0
    return len(set(words)) / len(words)

# Функция для оценки тональности
def sentiment_score(text):
    return TextBlob(text).sentiment.polarity

# Функция для подсчёта политических ключевых слов
def political_keyword_count(text):
    return sum(text.lower().count(kw) for kw in political_keywords)

# Функция очистки меток
def clean_label(label):
    if pd.isna(label):
        return 0
    label = str(label).strip().lower()
    return 1 if label in ['1', 'true', 'real', 'yes'] else 0

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Предобработка данных
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('')
    df['title'] = df['title'].fillna('')
    df['text_clean'] = df['text'].apply(preprocess_text)
    df['title_clean'] = df['title'].apply(preprocess_text)
    df['combined'] = df['title_clean'] + ' ' + df['text_clean']
    df['text_length'] = df['text_clean'].apply(len)
    df['title_length'] = df['title_clean'].apply(len)
    df['word_count'] = df['combined'].apply(lambda x: len(x.split()))
    df['title_word_count'] = df['title_clean'].apply(lambda x: len(x.split()))
    df['fake_keyword_count'] = df['combined'].apply(lambda x: sum(x.lower().count(kw) for kw in fake_keywords))
    df['political_keyword_count'] = df['combined'].apply(political_keyword_count)
    df['title_uppercase_ratio'] = df['title'].apply(uppercase_ratio)
    df['exclamation_count'] = df['title'].apply(exclamation_count)
    df['unique_word_ratio'] = df['combined'].apply(unique_word_ratio)
    df['title_sentiment'] = df['title'].apply(sentiment_score)

train_df['label'] = train_df['label'].apply(clean_label).astype('int8')

# Разделение данных
X = train_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']]
y = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Определение препроцессора
preprocessor = ColumnTransformer(
    transformers=[
        ('text_tfidf', TfidfVectorizer(max_features=25000, ngram_range=(1, 3), stop_words='english', min_df=3), 'combined'),
        ('num', 'passthrough', ['text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment'])
    ])

# Инициализация моделей
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
class_weights = {0: 1.0, 1: scale_pos_weight}

models = {
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=3,
            reg_lambda=25.0,
            reg_alpha=25.0,
            colsample_bytree=0.6,
            random_state=42,
            verbose=-1,
            class_weight=class_weights
        ))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=3,
            colsample_bytree=0.6,
            reg_lambda=25.0,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        ))
    ]),
    'GradientBoosting': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(
            n_estimators=150,  # Ускорение
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        ))
    ])
}

# 1. Оценка индивидуальных моделей с кросс-валидацией
best_f1 = 0.0
best_model = None
best_model_name = None

for name, model in models.items():
    print(f"\n=== Обучение {name} ===")
    start_time = time.time()
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    f1_train = f1_score(y_train, train_pred, average='weighted')
    f1_val = f1_score(y_val, val_pred, average='weighted')
    print(f"F1 на train: {f1_train:.5f}")
    print(f"F1 на валидации: {f1_val:.5f}")
    print(f"Время обучения: {time.time() - start_time:.2f} сек")
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')
    print(f"Кросс-валидация F1: {cv_scores.mean():.5f} ± {cv_scores.std():.5f}")
    if f1_val > best_f1:
        best_f1 = f1_val
        best_model = model
        best_model_name = name
        print(f"Новая лучшая модель: {name}")

# 2. Блендинг
print("\n=== Обучение блендинга ===")
val_preds = np.zeros((X_val.shape[0], len(models)))
train_preds = np.zeros((X_train.shape[0], len(models)))
for i, (name, model) in enumerate(models.items()):
    model.fit(X_train, y_train)
    val_preds[:, i] = model.predict_proba(X_val)[:, 1]
    train_preds[:, i] = model.predict_proba(X_train)[:, 1]

param_grid_meta = {
    'C': [10.0, 100.0, 200.0],
    'class_weight': [None, 'balanced'],
    'solver': ['liblinear'],
    'penalty': ['l2']
}
meta_model = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid_meta, cv=5, scoring='f1_weighted', n_jobs=-1)
meta_model.fit(train_preds, y_train)
print(f"Лучшие параметры мета-модели: {meta_model.best_params_}")
blending_pred = meta_model.predict(val_preds)
blending_f1 = f1_score(y_val, blending_pred, average='weighted')
print(f"Блендинг F1 на валидации: {blending_f1:.5f}")
if blending_f1 > best_f1:
    best_f1 = blending_f1
    best_model = (models, meta_model.best_estimator_)
    best_model_name = "Blending"

# 3. Анализ ошибок
if best_model_name == "Blending":
    errors = X_val[blending_pred != y_val].copy()
    errors['true_label'] = y_val[blending_pred != y_val]
    errors['predicted_label'] = blending_pred[blending_pred != y_val]
else:
    val_pred = best_model.predict(X_val)
    errors = X_val[val_pred != y_val].copy()
    errors['true_label'] = y_val[val_pred != y_val]
    errors['predicted_label'] = val_pred[val_pred != y_val]

print("\nПримеры ошибок (первые 5):")
print(errors[['combined', 'true_label', 'predicted_label']].head())
print("\nСтатистика ошибок:")
print(f"Всего ошибок: {len(errors)}")
print(f"Ошибки для класса 0 (фейк): {len(errors[errors['true_label'] == 0])}")
print(f"Ошибки для класса 1 (реальная): {len(errors[errors['true_label'] == 1])}")

# 4. Кросс-валидация лучшей модели
print("\n=== Кросс-валидация лучшей модели ===")
if best_model_name != "Blending":
    cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='f1_weighted')
    print(f"Cross-Validation F1: {cv_scores.mean():.5f} ± {cv_scores.std():.5f}")
else:
    print("Кросс-валидация для блендинга не проводится.")

# 5. Предсказание на тестовых данных
print("\n=== Предсказание на тестовых данных ===")
if best_model_name == "Blending":
    test_preds = np.zeros((test_df.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        model.fit(X_train, y_train)
        test_preds[:, i] = model.predict_proba(test_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']])[:, 1]
    test_pred = meta_model.predict(test_preds)
else:
    test_pred = best_model.predict(test_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']])

results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
results.to_csv('final_predictions_v3.csv', index=False)
print("\nПредсказания сохранены в final_predictions_v3.csv")


=== Обучение LightGBM ===
F1 на train: 0.98189
F1 на валидации: 0.98155
Время обучения: 35.28 сек
Кросс-валидация F1: 0.98025 ± 0.00195
Новая лучшая модель: LightGBM

=== Обучение XGBoost ===
F1 на train: 0.99077
F1 на валидации: 0.98431
Время обучения: 43.93 сек
Кросс-валидация F1: 0.98291 ± 0.00207
Новая лучшая модель: XGBoost

=== Обучение GradientBoosting ===
F1 на train: 0.99173
F1 на валидации: 0.98492
Время обучения: 143.47 сек
Кросс-валидация F1: 0.98400 ± 0.00109
Новая лучшая модель: GradientBoosting

=== Обучение блендинга ===
Лучшие параметры мета-модели: {'C': 10.0, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
Блендинг F1 на валидации: 0.98769

Примеры ошибок (первые 5):
                                                combined  true_label  \
20857  ridiculously stupid thing men keep woman socal...           1   
9296   saoule tout monde avec son analyse politique d...           0   
9293   independent tilt decisively trump nevertrumper...           1

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from textblob import TextBlob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib
import time

# Загрузка данных NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Инициализация
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
fake_keywords = ['shock', 'urgent', 'breaking', 'secret', 'hoax', 'fake', 'scandal', 'conspiracy']
political_keywords = ['trump', 'biden', 'election', 'poll', 'democrat', 'republican', 'congress', 'senate']

# Функции предобработки
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-z\s!]', '', text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and len(word) > 2]
    return ' '.join(words)

def uppercase_ratio(text):
    if not text or len(text) == 0:
        return 0.0
    letters = [c for c in text if c.isalpha()]
    if not letters:
        return 0.0
    return sum(1 for c in letters if c.isupper()) / len(letters)

def exclamation_count(text):
    return text.count('!')

def emoji_count(text):
    return len(emoji.distinct_emoji_list(text))

def unique_word_ratio(text):
    if not text:
        return 0.0
    words = text.split()
    if not words:
        return 0.0
    return len(set(words)) / len(words)

def sentiment_score(text):
    return TextBlob(text).sentiment.polarity

def political_keyword_count(text):
    return sum(text.lower().count(kw) for kw in political_keywords)

def punctuation_count(text):
    return sum(1 for c in text if c in '.,!?;:"')

def stop_word_ratio(text):
    if not text:
        return 0.0
    words = text.split()
    if not words:
        return 0.0
    return sum(1 for word in words if word.lower() in stop_words) / len(words)

def clean_label(label):
    if pd.isna(label):
        return np.nan
    label = str(label).strip().lower()
    return 1 if label in ['1', 'true', 'real', 'yes'] else 0

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Очистка данных
train_df = train_df.drop(2733, errors='ignore')
for df in [train_df, test_df]:
    df.drop([f'Unnamed: {i}' for i in range(4, 22)], axis=1, errors='ignore', inplace=True)

# Обработка пропусков
train_df['text'] = train_df['text'].fillna('')
train_df['title'] = train_df['title'].fillna('')
test_df['text'] = test_df['text'].fillna('')
test_df['title'] = test_df['title'].fillna('')
train_df['label'] = train_df['label'].apply(clean_label)
train_df = train_df.dropna(subset=['label'])
train_df['label'] = train_df['label'].astype('int8')

# Объединение title и text
train_df['combined'] = train_df['title'].apply(preprocess_text) + ' ' + train_df['text'].apply(preprocess_text)
test_df['combined'] = test_df['title'].apply(preprocess_text) + ' ' + test_df['text'].apply(preprocess_text)

# Извлечение признаков
for df in [train_df, test_df]:
    df['text_length'] = df['text'].apply(len)
    df['title_word_count'] = df['title'].apply(lambda x: len(x.split()))
    df['uppercase_ratio'] = df['text'].apply(uppercase_ratio)
    df['title_uppercase_ratio'] = df['title'].apply(uppercase_ratio)
    df['punctuation_count'] = df['combined'].apply(punctuation_count)
    df['exclamation_count'] = df['title'].apply(exclamation_count)
    df['emoji_count'] = df['title'].apply(emoji_count)
    df['stop_word_ratio'] = df['combined'].apply(stop_word_ratio)
    df['political_keyword_count'] = df['combined'].apply(political_keyword_count)
    df['unique_word_ratio'] = df['combined'].apply(unique_word_ratio)
    df['title_sentiment'] = df['title'].apply(sentiment_score)

# Признаки
feature_cols = ['text_length', 'title_word_count', 'title_uppercase_ratio', 'punctuation_count',
                'exclamation_count', 'emoji_count', 'stop_word_ratio', 'political_keyword_count',
                'unique_word_ratio', 'title_sentiment']

# Разделение данных
X = train_df[feature_cols + ['combined']]
y = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Препроцессор
preprocessor = ColumnTransformer(
    transformers=[
        ('text_tfidf', TfidfVectorizer(max_features=25000, ngram_range=(1, 3), stop_words='english',
                                       sublinear_tf=True, min_df=3, max_df=0.9), 'combined'),
        ('num', StandardScaler(), feature_cols)
    ])

# Модели
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
class_weights = {0: 1.0, 1: scale_pos_weight}

models = {
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(
            n_estimators=250,
            learning_rate=0.05,
            max_depth=3,
            reg_lambda=20.0,
            reg_alpha=20.0,
            colsample_bytree=0.5,
            random_state=42,
            verbose=-1,
            class_weight=class_weights
        ))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=250,
            learning_rate=0.05,
            max_depth=3,
            colsample_bytree=0.5,
            reg_lambda=20.0,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        ))
    ]),
    'CatBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', CatBoostClassifier(
            iterations=250,
            learning_rate=0.05,
            depth=3,
            random_state=42,
            verbose=0,
            auto_class_weights='Balanced'
        ))
    ])
}

# Оценка моделей
best_f1 = 0.0
best_model = None
best_model_name = None

for name, model in models.items():
    print(f"\n=== Обучение {name} ===")
    start_time = time.time()
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    f1_train = f1_score(y_train, train_pred, average='weighted')
    f1_val = f1_score(y_val, val_pred, average='weighted')
    print(f"F1 на train: {f1_train:.5f}")
    print(f"F1 на валидации: {f1_val:.5f}")
    print(f"Время обучения: {time.time() - start_time:.2f} сек")
    if f1_val > best_f1:
        best_f1 = f1_val
        best_model = model
        best_model_name = name
        print(f"Новая лучшая модель: {name}")

# Блендинг
print("\n=== Обучение блендинга ===")
val_preds = np.zeros((X_val.shape[0], len(models)))
train_preds = np.zeros((X_train.shape[0], len(models)))
for i, (name, model) in enumerate(models.items()):
    model.fit(X_train, y_train)
    val_preds[:, i] = model.predict_proba(X_val)[:, 1]
    train_preds[:, i] = model.predict_proba(X_train)[:, 1]

param_grid_meta = {
    'iterations': [150, 200],
    'learning_rate': [0.05, 0.1],
    'depth': [3]
}
meta_model = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), param_grid_meta, cv=5, scoring='f1_weighted', n_jobs=-1)
meta_model.fit(train_preds, y_train)
print(f"Лучшие параметры мета-модели: {meta_model.best_params_}")

# Взвешивание предсказаний
weights = [0.4, 0.3, 0.3]  # CatBoost, XGBoost, LightGBM
blending_pred_proba = np.sum([val_preds[:, i] * w for i, w in enumerate(weights)], axis=0)
blending_pred = (blending_pred_proba >= 0.5).astype(int)
blending_f1 = f1_score(y_val, blending_pred, average='weighted')
print(f"Блендинг F1 на валидации (взвешенный): {blending_f1:.5f}")

if blending_f1 > best_f1:
    best_f1 = blending_f1
    best_model = (models, meta_model.best_estimator_, weights)
    best_model_name = "Blending"

# Анализ ошибок
if best_model_name == "Blending":
    errors = X_val[blending_pred != y_val].copy()
    errors['true_label'] = y_val[blending_pred != y_val]
    errors['predicted_label'] = blending_pred[blending_pred != y_val]
else:
    val_pred = best_model.predict(X_val)
    errors = X_val[val_pred != y_val].copy()
    errors['true_label'] = y_val[val_pred != y_val]
    errors['predicted_label'] = val_pred[val_pred != y_val]

print("\nПримеры ошибок (первые 5):")
print(errors[['combined', 'true_label', 'predicted_label']].head())
print("\nСтатистика ошибок:")
print(f"Всего ошибок: {len(errors)}")
print(f"Ошибки для класса 0 (фейк): {len(errors[errors['true_label'] == 0])}")
print(f"Ошибки для класса 1 (реальная): {len(errors[errors['true_label'] == 1])}")

# Финальное обучение
print("\n=== Финальное обучение ===")
if best_model_name == "Blending":
    for model in models.values():
        model.fit(X, y)
    train_preds_full = np.zeros((X.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        train_preds_full[:, i] = model.predict_proba(X)[:, 1]
    meta_model.best_estimator_.fit(train_preds_full, y)
else:
    best_model.fit(X, y)

# Предсказание
X_test = test_df[feature_cols + ['combined']]
if best_model_name == "Blending":
    test_preds = np.zeros((X_test.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        test_preds[:, i] = model.predict_proba(X_test)[:, 1]
    test_pred_proba = np.sum([test_preds[:, i] * w for i, w in enumerate(weights)], axis=0)
    test_pred = (test_pred_proba >= 0.5).astype(int)
else:
    test_pred = best_model.predict(X_test)

results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
results.to_csv('submission_v9.csv', index=False)
print("\nПредсказания сохранены в submission_v9.csv")




=== Обучение LightGBM ===
F1 на train: 0.98236
F1 на валидации: 0.97949
Время обучения: 43.22 сек
Новая лучшая модель: LightGBM

=== Обучение XGBoost ===
F1 на train: 0.98798
F1 на валидации: 0.98276
Время обучения: 55.99 сек
Новая лучшая модель: XGBoost

=== Обучение CatBoost ===
F1 на train: 0.98757
F1 на валидации: 0.98277
Время обучения: 67.15 сек
Новая лучшая модель: CatBoost

=== Обучение блендинга ===


In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import time

# Загрузка данных NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Инициализация
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
fake_keywords = ['shock', 'urgent', 'breaking', 'secret', 'hoax', 'fake', 'scandal', 'conspiracy']
political_keywords = ['trump', 'biden', 'election', 'poll', 'democrat', 'republican', 'congress', 'senate']

# Функция предобработки текста
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-z\s!]', '', text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# Функция для подсчёта доли заглавных букв
def uppercase_ratio(text):
    if not text or len(text) == 0:
        return 0.0
    letters = [c for c in text if c.isalpha()]
    if not letters:
        return 0.0
    return sum(1 for c in letters if c.isupper()) / len(letters)

# Функция для подсчёта восклицательных знаков
def exclamation_count(text):
    return text.count('!')

# Функция для доли уникальных слов
def unique_word_ratio(text):
    if not text:
        return 0.0
    words = text.split()
    if not words:
        return 0.0
    return len(set(words)) / len(words)

# Функция для оценки тональности
def sentiment_score(text):
    return TextBlob(text).sentiment.polarity

# Функция для подсчёта политических ключевых слов
def political_keyword_count(text):
    return sum(text.lower().count(kw) for kw in political_keywords)

# Функция очистки меток
def clean_label(label):
    if pd.isna(label):
        return 0
    label = str(label).strip().lower()
    return 1 if label in ['1', 'true', 'real', 'yes'] else 0

# Загрузка данных
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Предобработка данных
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('')
    df['title'] = df['title'].fillna('')
    df['text_clean'] = df['text'].apply(preprocess_text)
    df['title_clean'] = df['title'].apply(preprocess_text)
    df['combined'] = df['title_clean'] + ' ' + df['text_clean']
    df['text_length'] = df['text_clean'].apply(len)
    df['title_length'] = df['title_clean'].apply(len)
    df['word_count'] = df['combined'].apply(lambda x: len(x.split()))
    df['title_word_count'] = df['title_clean'].apply(lambda x: len(x.split()))
    df['fake_keyword_count'] = df['combined'].apply(lambda x: sum(x.lower().count(kw) for kw in fake_keywords))
    df['political_keyword_count'] = df['combined'].apply(political_keyword_count)
    df['title_uppercase_ratio'] = df['title'].apply(uppercase_ratio)
    df['exclamation_count'] = df['title'].apply(exclamation_count)
    df['unique_word_ratio'] = df['combined'].apply(unique_word_ratio)
    df['title_sentiment'] = df['title'].apply(sentiment_score)

train_df['label'] = train_df['label'].apply(clean_label).astype('int8')

# Разделение данных
X = train_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']]
y = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Определение препроцессора
preprocessor = ColumnTransformer(
    transformers=[
        ('text_tfidf', TfidfVectorizer(max_features=25000, ngram_range=(1, 3), stop_words='english', min_df=3), 'combined'),
        ('num', 'passthrough', ['text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment'])
    ])

# Инициализация моделей
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
class_weights = {0: 1.0, 1: scale_pos_weight}

models = {
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=3,
            reg_lambda=25.0,
            reg_alpha=25.0,
            colsample_bytree=0.6,
            random_state=42,
            verbose=-1,
            class_weight=class_weights
        ))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=3,
            colsample_bytree=0.6,
            reg_lambda=25.0,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        ))
    ]),
    'GradientBoosting': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(
            n_estimators=150,  # Ускорение
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        ))
    ])
}

# 1. Оценка индивидуальных моделей с кросс-валидацией
best_f1 = 0.0
best_model = None
best_model_name = None

for name, model in models.items():
    print(f"\n=== Обучение {name} ===")
    start_time = time.time()
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    f1_train = f1_score(y_train, train_pred, average='weighted')
    f1_val = f1_score(y_val, val_pred, average='weighted')
    print(f"F1 на train: {f1_train:.5f}")
    print(f"F1 на валидации: {f1_val:.5f}")
    print(f"Время обучения: {time.time() - start_time:.2f} сек")
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')
    print(f"Кросс-валидация F1: {cv_scores.mean():.5f} ± {cv_scores.std():.5f}")
    if f1_val > best_f1:
        best_f1 = f1_val
        best_model = model
        best_model_name = name
        print(f"Новая лучшая модель: {name}")

# 2. Блендинг
print("\n=== Обучение блендинга ===")
val_preds = np.zeros((X_val.shape[0], len(models)))
train_preds = np.zeros((X_train.shape[0], len(models)))
for i, (name, model) in enumerate(models.items()):
    model.fit(X_train, y_train)
    val_preds[:, i] = model.predict_proba(X_val)[:, 1]
    train_preds[:, i] = model.predict_proba(X_train)[:, 1]

param_grid_meta = {
    'C': [10.0, 100.0, 200.0],
    'class_weight': [None, 'balanced'],
    'solver': ['liblinear'],
    'penalty': ['l2']
}
meta_model = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid_meta, cv=5, scoring='f1_weighted', n_jobs=-1)
meta_model.fit(train_preds, y_train)
print(f"Лучшие параметры мета-модели: {meta_model.best_params_}")
blending_pred = meta_model.predict(val_preds)
blending_f1 = f1_score(y_val, blending_pred, average='weighted')
print(f"Блендинг F1 на валидации: {blending_f1:.5f}")
if blending_f1 > best_f1:
    best_f1 = blending_f1
    best_model = (models, meta_model.best_estimator_)
    best_model_name = "Blending"

# 3. Анализ ошибок
if best_model_name == "Blending":
    errors = X_val[blending_pred != y_val].copy()
    errors['true_label'] = y_val[blending_pred != y_val]
    errors['predicted_label'] = blending_pred[blending_pred != y_val]
else:
    val_pred = best_model.predict(X_val)
    errors = X_val[val_pred != y_val].copy()
    errors['true_label'] = y_val[val_pred != y_val]
    errors['predicted_label'] = val_pred[val_pred != y_val]

print("\nПримеры ошибок (первые 5):")
print(errors[['combined', 'true_label', 'predicted_label']].head())
print("\nСтатистика ошибок:")
print(f"Всего ошибок: {len(errors)}")
print(f"Ошибки для класса 0 (фейк): {len(errors[errors['true_label'] == 0])}")
print(f"Ошибки для класса 1 (реальная): {len(errors[errors['true_label'] == 1])}")

# 4. Кросс-валидация лучшей модели
print("\nКросс-валидация лучшей модели:")
if best_model_name != "Blending":
    cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='f1_weighted')
    print(f"Cross-Validation F1: {cv_scores.mean():.5f} ± {cv_scores.std():.5f}")
else:
    print("Кросс-валидация для блендинга не проводится.")

# 5. Предсказание на тестовых данных
print("\nПредсказание на тестовых данных:")
if best_model_name == "Blending":
    test_preds = np.zeros((test_df.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        model.fit(X_train, y_train)
        test_preds[:, i] = model.predict_proba(test_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']])[:, 1]
    test_pred = meta_model.predict(test_preds)
else:
    test_pred = best_model.predict(test_df[['combined', 'text_length', 'title_length', 'word_count', 'title_word_count', 'fake_keyword_count', 'political_keyword_count', 'title_uppercase_ratio', 'exclamation_count', 'unique_word_ratio', 'title_sentiment']])

results = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
results.to_csv('predictions_v3.csv', index=False)