# Importing libraries

In [None]:
!pip install -q -U accelerate --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U bitsandbytes --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U transformers --no-index --find-links ../input/llm-detect-pip/


In [None]:
!pip install /kaggle/input/llm-science-exam-lib-ds/keras_core-0.1.7-py3-none-any.whl --no-deps
!pip install /kaggle/input/llm-science-exam-lib-ds/keras_nlp-0.6.2-py3-none-any.whl --no-deps

In [None]:
import sys
import gc

import ctypes
libc = ctypes.CDLL("libc.so.6")

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [None]:
import numpy as np
import pandas as pd
import time 
import os
import itertools
import regex as re
from tqdm import tqdm


from sklearn.metrics import make_scorer, accuracy_score
from scipy.sparse import csr_matrix, vstack, hstack

from concurrent.futures import ProcessPoolExecutor


# Setting hyperparameters
seed = 202 # set the seed for reproducibility  2023->202
isFixTestLeakage = True # TODO!!! reference: https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/455701
isCorrectSentence = True

In [None]:
os.environ["KERAS_BACKEND"] = "torch"
import keras_nlp
import keras_core as keras 
import keras_core.backend as K
import jax
import tensorflow as tf
from glob import glob

# Seed Everything & Correct Sentence

In [None]:
import random
def seed_everything(seed=2023):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    Args:
        seed: Value handed unchanged to `random.seed` and `np.random.seed`.
    """
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(seed)



# Importing files and Feature Engineering

In [None]:
# Re-seed with the configured `seed`. A bare seed_everything() would fall back to
# its default of 2023 and silently override the value chosen in the config cell.
seed_everything(seed)

In [None]:
import sys
import pickle
import torch

# Run-mode switches and input loading.
# FAKE_SUBMISSION is only consulted when IS_LIVE_SUBMISSION is False (see the else-branch below).
FAKE_SUBMISSION = True


# Official competition test essays; used as `test` in live mode.
test_essays_csv_df = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')


# Local validation split; its 'label' column is read later into y_val.
daigt2_val = pd.read_csv("/kaggle/input/daigt2-wo-official-with-prompt-id/validation.csv")
daigt2_val.reset_index(drop=True, inplace=True)



IS_LIVE_SUBMISSION = True
# Weight given to the heuristic-feature regressor when it is averaged with the pseudo labels.
FOLLOWER_MODEL_WEIGHT = 1.1

# When these stay None, the Mistral heuristics and DeBERTa predictions are computed
# from scratch by later cells; otherwise the cached values are used instead.
mistral_heuristic_cache = None
ertugrul_deberta_cache = None

if IS_LIVE_SUBMISSION:
    # Live mode: train on the de-duplicated typo_intro set and predict the official test essays.
    daigt2_train = pd.read_csv("/kaggle/input/typo-intro/typo_intro.csv", sep=',')
    daigt2_train = daigt2_train.drop_duplicates(subset=['text'])
    daigt2_train.reset_index(drop=True, inplace=True)
    test = test_essays_csv_df
    # NOTE(review): MAX_EXAMPLES_TO_PREDICT and SVC_BAGGING_FRACTION are not read
    # anywhere in the visible notebook — confirm whether they are still needed.
    MAX_EXAMPLES_TO_PREDICT = 1000000
    # Scales the LightGBM / CatBoost iteration counts (1 = full training budget).
    GBOOST_ITERATION_FRACTION = 1
    SVC_BAGGING_FRACTION = .3

else:    
    if FAKE_SUBMISSION:
        # Write an empty placeholder submission and stop the notebook here.
        # The CUDA check runs first, so a missing GPU fails loudly instead of
        # producing the placeholder.
        if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available.")
        sub = pd.DataFrame({'id':[],'generated':[]})
        sub.to_csv('submission.csv', index=False)
        sys.exit()
        
    # Offline mode: same training file, but without the drop_duplicates applied in live mode
    # (a later cell de-duplicates on 'text' in both modes).
    daigt2_train = pd.read_csv("/kaggle/input/typo-intro/typo_intro.csv", sep=',')
    daigt2_train.reset_index(drop=True, inplace=True)

    # Precomputed features for the offline test split.
    # NOTE(review): pickle.load executes arbitrary code from the file — only use trusted datasets.
    mistral_heuristic_cache = {}
    with open("/kaggle/input/jan13-quantile-string-mistral-cache/jan_13_test_quantiles.pkl", 'rb') as quantiles_pickle_file:
        mistral_heuristic_cache["test"] = pickle.load(quantiles_pickle_file)
        
    with open("/kaggle/input/erturgrul-deberta-preds-cache-jan16/ertugrul_deberta_predictions.pkl", 'rb') as deberta_preds_file:
        ertugrul_deberta_cache = pickle.load(deberta_preds_file)

    test = pd.read_csv("/kaggle/input/daigt2-wo-official-with-prompt-id/test_no_label.csv")
    MAX_EXAMPLES_TO_PREDICT = 1200
    # Offline runs use a tenth of the boosting iterations.
    GBOOST_ITERATION_FRACTION = .1
    SVC_BAGGING_FRACTION = .1


In [None]:
# Prompts whose essays are removed from the training set.
excluded_prompt_name_list = ['Distance learning','Grades for extracurricular activities','Summer projects']

# Drop the excluded prompts, keep one row per distinct text, then renumber rows from 0.
daigt2_train = (
    daigt2_train
    .loc[lambda frame: ~frame['prompt_name'].isin(excluded_prompt_name_list)]
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [None]:
def safe_logodds(prob_arr):
    """Convert probabilities to log-odds, keeping the result finite.

    Args:
        prob_arr: Array-like of probabilities.

    Returns:
        NumPy array of log(p / (1 - p)), with p first clipped to [0.001, 0.999].
    """
    bounded = np.clip(prob_arr, .001, .999)
    odds = bounded / (1 - bounded)
    return np.log(odds)

In [None]:
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

torch.cuda.empty_cache()


if ertugrul_deberta_cache is None:
    # No cached predictions: run the fine-tuned DeBERTa-v3-large classifier over `test`.

    @dataclass
    class ertugrul_debert_cfg:
        # Plain class attributes (no annotations), read directly off the class below.
        transformer_name = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-large"
        batch_size = 8
        max_len = 1024
        n_classes = 1


    ertugrul_debert_tokenizer = AutoTokenizer.from_pretrained(ertugrul_debert_cfg.transformer_name)


    def ertugrul_deberta_prepare_input(text):
        """Tokenize one essay into fixed-length (max_len) long tensors."""
        inputs = ertugrul_debert_tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=ertugrul_debert_cfg.max_len,
            # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs


    # Subclass the PyTorch dataset explicitly. The bare name `Dataset` in this notebook is
    # the Hugging Face `datasets.Dataset`, which expects an Arrow table and provides its own
    # batched accessors that would run against an uninitialised object.
    class ErtugrulDebertaTrainDataset(torch.utils.data.Dataset):
        """Map-style dataset yielding tokenized inputs for each row's 'text'."""

        def __init__(self, df):
            self.texts = df['text'].values

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, item):
            inputs = ertugrul_deberta_prepare_input(self.texts[item])
            return inputs


    def ertugrul_deberta_collate(inputs):
        """Trim an already-batched dict of tensors to the longest unpadded sequence in the batch."""
        mask_len = int(inputs["attention_mask"].sum(axis=1).max())
        for k, v in inputs.items():
            inputs[k] = inputs[k][:,:mask_len]
        return inputs


    ertugrul_deberta_test_dataset = ErtugrulDebertaTrainDataset(test)


    # Load onto the CPU: the weights are copied into the model below, so there is no
    # reason to pin a second copy of them in GPU memory.
    ertugrul_deberta_checkpoint = torch.load("/kaggle/input/deblarge-f0-999/last.ckpt", map_location="cpu")

    class ClassicFeed(torch.nn.Module):
        """Transformer backbone plus a single linear head on the first token's hidden state."""

        def __init__(self):
            super(ClassicFeed, self).__init__()

            self.transformer = AutoModel.from_pretrained(ertugrul_debert_cfg.transformer_name)
            self.classifier = torch.nn.Linear(self.transformer.config.hidden_size, 1)

        def forward(self, input_ids, attention_mask, token_type_ids):
            output = self.transformer(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            last_layer_hidden_states = output.last_hidden_state[:, 0, :]
            outputs = self.classifier(last_layer_hidden_states)
            return outputs

    classic_model = ClassicFeed()
    classic_model.load_state_dict(ertugrul_deberta_checkpoint['state_dict'])

    classic_model.eval()
    trained_model = classic_model.to('cuda')
    ertugrul_deberta_dataloader = DataLoader(ertugrul_deberta_test_dataset, batch_size=ertugrul_debert_cfg.batch_size, shuffle=False)

    ertugrul_deberta_predictions = []

    for batch in tqdm(ertugrul_deberta_dataloader, position=0, leave=True):

        batch = ertugrul_deberta_collate(batch)

        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        token_type_ids = batch['token_type_ids'].to('cuda')

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            deberta_pred = trained_model(input_ids, attention_mask, token_type_ids)

        # Move each batch of probabilities to the CPU straight away so predictions
        # do not accumulate in GPU memory.
        ertugrul_deberta_predictions.append(torch.sigmoid(deberta_pred).cpu())
        del batch

    # One probability per test row, in `test` order (the DataLoader does not shuffle).
    ertugrul_deberta_predictions = torch.cat(ertugrul_deberta_predictions).numpy()[:,0]

    del trained_model
    del classic_model
    gc.collect()
    torch.cuda.empty_cache()

else:
    ertugrul_deberta_predictions = ertugrul_deberta_cache

In [None]:
from transformers import AutoModelForSequenceClassification

# Sequence classifier whose class log-odds are later used as stacking features.
chunk_model = AutoModelForSequenceClassification.from_pretrained(
    "/kaggle/input/longformer-predict-model-best-check",
)

chunk_tokenizer = AutoTokenizer.from_pretrained(
    "/kaggle/input/longformer-predict-model-best-check"
)

chunk_model.to("cuda")

print("starting")
predict_model_features = []
for test_text in tqdm(test["text"].values):

    # truncation=True caps the input at the tokenizer's model_max_length, so an unusually
    # long essay cannot overflow the model's position embeddings and abort the whole run.
    tokenized_test_text = chunk_tokenizer(test_text, return_tensors="pt", truncation=True)

    input_ids = tokenized_test_text['input_ids'].to('cuda')
    attention_mask = tokenized_test_text['attention_mask'].to('cuda')

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Class probabilities for the single essay, converted to clipped log-odds.
        chunk_pred = safe_logodds(torch.softmax(chunk_model(input_ids, attention_mask).logits[0], dim=-1).cpu().numpy())
        predict_model_features.append(chunk_pred)

    del chunk_pred


# Free the GPU before the next model is loaded.
del chunk_model
gc.collect()
torch.cuda.empty_cache()

# One row per test essay, one column per class.
predict_model_features = np.array(predict_model_features)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers

# 4-bit NF4 quantisation with double quantisation for the Mistral-7B scoring model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model_name = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"


tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# The model is only loaded when the heuristics are not served from the cache.
model = None
if mistral_heuristic_cache is None:
    # Quantisation is requested through `quantization_config` alone. Also passing
    # `load_in_4bit=True` as a keyword is redundant (bnb_config already sets it) and is
    # rejected by newer transformers releases.
    model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )

    model.config.pretraining_tp = 1
    model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# @misc{bao2023fast,
#     url = {https://arxiv.org/abs/2310.05130},
#     author = {Bao, Guangsheng and Zhao, Yanbin and Teng, Zhiyang and Yang, Linyi and Zhang, Yue},
#     title = {Fast-DetectGPT: Efficient Zero-Shot Detection of Machine-Generated Text via Conditional Probability Curvature},
#     publisher = {arXiv},
#     year = {2023},
# }
# All code in this cell was originally copied from https://github.com/baoguangsheng/fast-detect-gpt .
# I then modified some of it for our purposes.
def get_samples(logits, labels):
    """Draw 10000 token samples per position from the distribution given by `logits`.

    Args:
        logits: Tensor whose first dimension must be 1; the last dimension holds the
            per-token scores.
        labels: Tensor whose first dimension must be 1 (only its batch size is checked).

    Returns:
        Integer tensor of sampled token ids with the sample axis moved last.
    """
    assert logits.shape[0] == 1
    assert labels.shape[0] == 1
    num_draws = 10000
    token_log_probs = torch.log_softmax(logits, dim=-1)
    sampler = torch.distributions.categorical.Categorical(logits=token_log_probs)
    draws = sampler.sample([num_draws])
    # sample() puts the draw axis first; move it behind (batch, position).
    return draws.permute([1, 2, 0])

def get_likelihood(logits, labels):
    """Mean log-probability, over positions, of the given token ids.

    Args:
        logits: Tensor with batch size 1 and the vocabulary on the last axis.
        labels: Token ids with batch size 1. A trailing axis is added when `labels`
            has one dimension fewer than `logits`; otherwise it is used as-is.

    Returns:
        Log-probabilities gathered at `labels`, averaged over dimension 1.
    """
    assert logits.shape[0] == 1
    assert labels.shape[0] == 1
    if labels.ndim == logits.ndim - 1:
        labels = labels.unsqueeze(-1)
    token_log_probs = torch.log_softmax(logits, dim=-1)
    picked = token_log_probs.gather(dim=-1, index=labels)
    return picked.mean(dim=1)

def get_sampling_discrepancy(logits_ref, logits_score, labels):
    """Sampling-based discrepancy between the observed tokens and tokens sampled from the reference.

    Args:
        logits_ref: Reference-model logits, batch size 1.
        logits_score: Scoring-model logits, batch size 1.
        labels: Observed token ids, batch size 1, same sequence length as `logits_ref`.

    Returns:
        Tuple of (discrepancy as a Python float, observed mean log-likelihood,
        mean over sampled log-likelihoods, standard deviation over sampled log-likelihoods).
    """
    assert logits_ref.shape[0] == 1
    assert logits_score.shape[0] == 1
    assert labels.shape[0] == 1
    assert logits_ref.shape[1] == labels.shape[1]

    ref_vocab = logits_ref.size(-1)
    score_vocab = logits_score.size(-1)
    if ref_vocab != score_vocab:
        # Vocabulary sizes differ: compare only the shared leading slice.
        vocab_size = min(ref_vocab, score_vocab)
        logits_ref = logits_ref[:, :, :vocab_size]
        logits_score = logits_score[:, :, :vocab_size]

    sampled_tokens = get_samples(logits_ref, labels)
    observed_ll = get_likelihood(logits_score, labels)
    sampled_ll = get_likelihood(logits_score, sampled_tokens)
    miu_tilde = sampled_ll.mean(dim=-1)
    sigma_tilde = sampled_ll.std(dim=-1)
    observed_flat = observed_ll.squeeze(-1)
    discrepancy = (observed_flat - miu_tilde) / sigma_tilde
    return discrepancy.item(), observed_ll.squeeze(-1), miu_tilde, sigma_tilde

def get_sampling_discrepancy_analytic(logits_ref, logits_score, labels):
    """Closed-form discrepancy between observed log-likelihood and its expectation under the reference.

    Args:
        logits_ref: Reference-model logits, shape (1, seq, vocab).
        logits_score: Scoring-model logits, shape (1, seq, vocab); the vocabulary size
            must match `logits_ref`.
        labels: Observed token ids, shape (1, seq).

    Returns:
        Tuple of three Python floats:
        (discrepancy,
         mean over positions of the expected log-probability under the reference,
         square root of the summed per-position variance divided by the number of positions).

    Raises:
        ValueError: If the two logit tensors have different vocabulary sizes.
    """
    assert logits_ref.shape[0] == 1
    assert logits_score.shape[0] == 1
    assert labels.shape[0] == 1
    assert logits_ref.shape[1] == labels.shape[1]
    if logits_ref.size(-1) != logits_score.size(-1):
        # ValueError is still caught by `except Exception`, but now says what went wrong.
        raise ValueError(
            f"vocabulary size mismatch: reference has {logits_ref.size(-1)} logits, "
            f"scoring has {logits_score.size(-1)}"
        )

    assert labels.ndim == logits_score.ndim - 1
    labels = labels.unsqueeze(-1)
    lprobs_score = torch.log_softmax(logits_score, dim=-1)
    probs_ref = torch.softmax(logits_ref, dim=-1)
    # Log-probability the scoring model assigns to each observed token.
    log_likelihood = lprobs_score.gather(dim=-1, index=labels).squeeze(-1)
    # Per-position mean and variance of the scoring log-probability when the token
    # is drawn from the reference distribution.
    mean_ref = (probs_ref * lprobs_score).sum(dim=-1)
    var_ref = (probs_ref * torch.square(lprobs_score)).sum(dim=-1) - torch.square(mean_ref)
    discrepancy = (log_likelihood.sum(dim=-1) - mean_ref.sum(dim=-1)) / var_ref.sum(dim=-1).sqrt()
    discrepancy = discrepancy.mean()
    return discrepancy.item(), mean_ref.mean(dim=-1).item(), (var_ref.sum(dim=-1).sqrt() / mean_ref.shape[-1]).item()

In [None]:
import scipy.stats
class ImpliedTempScorer():
    """Scores tokens by the expected softmax temperature implied by their probability.

    A fixed grid of candidate temperatures is built once in `__init__`;
    `get_ev_of_temp` then weighs each candidate by how probable it makes the
    observed token and converts the resulting expected temperature into a
    log right-tail score under a standard normal.
    """

    def __init__(self, device="cuda"):
        """Build the temperature grid.

        Args:
            device: Torch device on which the temperature grid and all scoring tensors live.
        """
        self.norm = scipy.stats.norm()
        self.num_samples = 60
        # Rows are processed in sub-batches of this size inside get_ev_of_temp.
        self.batch_size = 20
        # num_samples evenly spaced quantiles of the standard normal (endpoints excluded) ...
        self.fake_normal_samples = torch.tensor(self.norm.ppf(np.linspace(1/(self.num_samples+1), 1-1/(self.num_samples + 1), self.num_samples)))
        # ... exponentiated to give the candidate temperatures, stored in float16 on `device`.
        self.fake_temp_samples = torch.exp(self.fake_normal_samples).to(torch.float16).to(device)
        self.device = device

    def get_ev_of_temp(self, logits_batch, indices):
        """Return one log right-tail score per row of `logits_batch`.

        Args:
            logits_batch: Tensor of shape (batch_size, num_logits).
            indices: Tensor of shape (batch_size) holding, per row, the index of the
                logit whose probability is evaluated.

        Returns:
            NumPy array with one score per row: log(1 - Phi(log(expected temperature))),
            where both logarithms are clipped from below at 1e-10.
        """
        # logits_batch should be a tensor of shape (batch_size, num_logits)
        # indices should be a tensor of shape (batch_size)
        
        logits_batch = logits_batch.to(self.device).to(torch.float16)
        indices = indices.to(self.device)

        curr_subbatch_start = 0
        all_final_temp_scores = []
        while curr_subbatch_start < logits_batch.shape[0]:

            curr_subbatch_end = min(curr_subbatch_start + self.batch_size, logits_batch.shape[0])
            curr_subbatch = logits_batch[curr_subbatch_start:curr_subbatch_end]
            curr_indices = indices[curr_subbatch_start:curr_subbatch_end]
                

            # Divide every logit by every candidate temperature via an outer product.
            # shape should be (batch_size, num_logits, num_temps)
            all_logit_sets = torch.matmul(curr_subbatch.unsqueeze(-1), (1/self.fake_temp_samples).unsqueeze(0))
            assert all_logit_sets.shape == (curr_subbatch.shape[0], logits_batch.shape[1], self.fake_temp_samples.shape[0])

            # Softmax over the logit axis (dim=1), independently for each temperature.
            probs = torch.exp(torch.nn.functional.log_softmax(all_logit_sets, dim=1))
            # Probability of the indexed logit under each temperature.
            prob_of_logit = probs[torch.arange(curr_subbatch.shape[0]), curr_indices]
            assert prob_of_logit.shape == (curr_subbatch.shape[0], self.fake_temp_samples.shape[0])

            # Normalise across temperatures, then take the expected temperature under those weights.
            prob_of_temp = prob_of_logit / prob_of_logit.sum(dim=-1, keepdim=True)
            temp_evs = torch.matmul(prob_of_temp, self.fake_temp_samples).cpu().numpy()
            
            # The grid is exp(standard-normal quantiles), so log(temperature) is compared
            # against the standard normal CDF; clipping keeps both logs finite.
            cdf_scores = self.norm.cdf(np.log(np.clip(temp_evs, 1e-10, None)))
            right_cdf_scores = 1 - cdf_scores
            log_right_cdf_scores = np.log(np.clip(right_cdf_scores, 1e-10, None))
            all_final_temp_scores.extend(log_right_cdf_scores)

            curr_subbatch_start = curr_subbatch_end

        return np.array(all_final_temp_scores)
    
# Module-level scorer instance used by get_heuristics.
implied_scorer = ImpliedTempScorer(device="cuda")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score



def batched(iterable, n):
    """Lazily yield consecutive tuples of up to `n` items from `iterable`.

    Example: batched('ABCDEFG', 3) --> ABC DEF G

    Raises:
        ValueError: If `n` is smaller than one (raised when iteration starts).
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    chunk = tuple(itertools.islice(source, n))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(source, n))

def get_heuristics(input_df):
    """Compute per-essay language-model heuristics keyed by essay id.

    Uses the module-level `tokenizer`, `model` and `implied_scorer`.

    Args:
        input_df: DataFrame with at least 'id' and 'text' columns.

    Returns:
        Dict mapping each essay id to a dict with the keys
        "analytical_disc", "negative_entropy", "sigma", "temp_score_mean",
        "temp_score_std", "logprobs_mean", "logprobs_std", "num_tokens",
        "logprob_percentiles", "actual_logprob_percentiles" and
        "tempscore_percentiles" (the three percentile entries are lists of the
        10th..90th percentiles).

    Side effects:
        Re-seeds NumPy's global RNG with 0, so random draws made after this call
        no longer depend on any seed set earlier.
    """
    # NOTE(review): the seven lists below are never read or appended to in this
    # function — they look like leftovers and could probably be removed.
    generated_perplexity = []
    human_perplexity = []
    all_perplexities = []
    generated_discs = []
    human_discs = []
    generated_logprobs = []
    human_logprobs = []
    id_to_heuristics = {}

    # Longest texts first, so each batch of 10 holds texts of similar length.
    input_df = input_df.sort_values(by='text', key=lambda x: x.str.len(), ascending=False)
    
    # Fixed subset of 2000 distinct token ids (out of 32000) used for the
    # "logprob_percentiles" feature. NOTE(review): 32000 is presumably the model's
    # vocabulary size — confirm.
    np.random.seed(0)
    unique_random_token_indices = np.random.choice(32000, 2000, replace=False)
    

    
    for train_rows in tqdm(batched(input_df.iterrows(), n=10)):

        # iterrows yields (index, row) pairs; keep only the rows.
        train_rows = [train_row[1] for train_row in train_rows]
        texts = [train_row["text"] for train_row in train_rows]
        example_ids = [train_row["id"] for train_row in train_rows]

        tokenized = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=800)
        input_ids = tokenized.input_ids
        mask = tokenized.attention_mask
        with torch.no_grad():
            opt = model(**tokenized)
            
        del tokenized
        gc.collect()
        torch.cuda.empty_cache()

        logits = opt.logits
        for batch_idx in range(input_ids.shape[0]):

            
            curr_example_id = example_ids[batch_idx]
            # Pair the logits at position i with the token at position i + 1 (next-token
            # prediction). A position is skipped when it has no successor, when it or its
            # successor equals the pad token id, or when it is masked out.
            selected_logit_indices = []
            selected_input_id_indices = []
            for candidate_logit_idx in range(logits[batch_idx].shape[0]):
                if candidate_logit_idx + 1 >= input_ids[batch_idx].shape[0]:
                    continue
                
                if input_ids[batch_idx][candidate_logit_idx] == model.config.pad_token_id:
                    continue

                if input_ids[batch_idx][candidate_logit_idx + 1] == model.config.pad_token_id:
                    continue
                    
                if mask[batch_idx][candidate_logit_idx] == 0:
                    continue
                
                selected_logit_indices.append(candidate_logit_idx)
                selected_input_id_indices.append(candidate_logit_idx + 1)

            
            
            
            # Restore a leading batch axis of size 1, as the discrepancy helper asserts it.
            selected_logits = logits[batch_idx][selected_logit_indices].unsqueeze(0)
            selected_input_ids = input_ids[batch_idx][selected_input_id_indices].unsqueeze(0)
            

            # One implied-temperature score per selected token.
            temp_scores = implied_scorer.get_ev_of_temp(selected_logits[0], selected_input_ids[0])
            
            
            # The same logits serve as both reference and scoring distribution.
            disc, miu, sigma = get_sampling_discrepancy_analytic(selected_logits,selected_logits, selected_input_ids)
            
            
            
            logprobs = torch.log_softmax(selected_logits, dim=-1)
            # Log-probability assigned to each token that actually follows.
            actual_logprobs = logprobs[0][torch.arange(logprobs[0].size(0)), selected_input_ids[0]].cpu().numpy()
            
            # Percentiles over all positions of the log-probabilities of the fixed random token subset.
            # NOTE(review): np.percentile is given a torch tensor here, which assumes the
            # logits are on the CPU in a NumPy-compatible dtype — confirm.
            percentiles = np.percentile(logprobs[:, :, unique_random_token_indices], [10,20,30,40,50,60,70,80,90])
            
            
            actual_percentiles = np.percentile(actual_logprobs, [10,20,30,40,50,60,70,80,90])
            temp_score_percentiles = np.percentile(temp_scores, [10,20,30,40,50,60,70,80,90])

            
            id_to_heuristics[curr_example_id] = {
                "analytical_disc": disc,
                # Second value returned by get_sampling_discrepancy_analytic.
                "negative_entropy": miu,
                "sigma": sigma,
                "temp_score_mean": np.mean(temp_scores),
                "temp_score_std": np.std(temp_scores),
                "logprobs_mean": np.mean(actual_logprobs),
                "logprobs_std": np.std(actual_logprobs),
                "num_tokens": selected_logits[0].shape[0],
                "logprob_percentiles": percentiles.tolist(),
                "actual_logprob_percentiles": actual_percentiles.tolist(),
                "tempscore_percentiles": temp_score_percentiles.tolist()
            }
    
    return id_to_heuristics

# Compute the heuristics with the language model unless a cache was loaded.
test_quantiles = (
    get_heuristics(test)
    if mistral_heuristic_cache is None
    else mistral_heuristic_cache["test"]
)


In [None]:
# Drop the language model and the temperature scorer, then release cached GPU memory.
# `model` is always bound (it is None when the heuristics came from the cache), so the del is safe.
del model
del implied_scorer
gc.collect()
torch.cuda.empty_cache()

In [None]:
def unpack_dict(data_dict):
    """Flatten one heuristics dict into a flat feature list.

    Args:
        data_dict: Mapping with the eight scalar entries and three percentile lists
            named below.

    Returns:
        New list: the eight scalars in fixed order, followed by the values of
        "logprob_percentiles", "tempscore_percentiles" and
        "actual_logprob_percentiles", in that order.
    """
    percentile_keys = (
        "logprob_percentiles",
        "tempscore_percentiles",
        "actual_logprob_percentiles",
    )
    scalar_keys = (
        "analytical_disc",
        "negative_entropy",
        "sigma",
        "temp_score_mean",
        "temp_score_std",
        "logprobs_mean",
        "logprobs_std",
        "num_tokens",
    )

    # Percentile lists are read first, scalars second.
    percentile_values = [value for key in percentile_keys for value in data_dict[key]]
    scalar_values = [data_dict[key] for key in scalar_keys]
    return scalar_values + percentile_values


In [None]:
y_train = daigt2_train['label'].values
y_val = daigt2_val['label'].values

# Extract Feature

In [None]:
LOWERCASE = False
VOCAB_SIZE = 30522

In [None]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# Build the normalizer list explicitly. Written as `[a, b, c] + [Lowercase()] if LOWERCASE else []`,
# the conditional binds looser than `+`, so with LOWERCASE = False the whole list collapsed
# to [] and NFKC / StripAccents / Strip were silently skipped.
normalizer_steps = [normalizers.NFKC(), normalizers.StripAccents(), normalizers.Strip()]
if LOWERCASE:
    normalizer_steps.append(normalizers.Lowercase())
raw_tokenizer.normalizer = normalizers.Sequence(normalizer_steps)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
# The vocabulary is learned from the test essays themselves.
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter():
    """Yield the test texts in chunks of 1000 for tokenizer training."""
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
# Wrap the trained tokenizer so it exposes the transformers tokenizer API.
custom_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
# Token lists for every test essay, then every training essay, using the custom BPE tokenizer.
tokenized_texts_test = [
    custom_tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

tokenized_texts_train = [
    custom_tokenizer.tokenize(essay) for essay in tqdm(daigt2_train['text'].tolist())
]


In [None]:
def dummy(text):
    """Identity function: the inputs are already token lists, so nothing is re-tokenized."""
    return text

# Settings shared by both vectorizers: word 3- to 7-grams over pre-tokenized input.
_shared_tfidf_kwargs = dict(
    ngram_range=(3, 7),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
)

# Pass 1: fit on the test essays only, to learn which n-grams occur there.
vectorizer = TfidfVectorizer(max_df=0.99, max_features=5000000, **_shared_tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

del vectorizer
gc.collect()
libc.malloc_trim(0)

# Pass 2: restrict to that vocabulary, fit the IDF weights on train, transform both sets.
vectorizer = TfidfVectorizer(vocabulary=vocab, **_shared_tfidf_kwargs)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-feature standard deviation of the training TF-IDF matrix (no centering, so it stays sparse).
scalar = StandardScaler(with_mean=False).fit(tf_train)
feature_stds = np.sqrt(scalar.var_)

In [None]:
# Base classifiers trained on the TF-IDF n-gram features.
clf = MultinomialNB(alpha=0.02)
#     clf2 = MultinomialNB(alpha=0.01)
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 
# LightGBM parameters; the iteration count is scaled by GBOOST_ITERATION_FRACTION.
p6={'n_iter': int(2500 * GBOOST_ITERATION_FRACTION),
    'verbose': -1,
    'objective': 'cross_entropy',
    'learning_rate': 0.003, 
    'colsample_bytree': 0.8,
    'colsample_bynode': 0.4}

lgb=LGBMClassifier(**p6)
cat=CatBoostClassifier(iterations=int(350*GBOOST_ITERATION_FRACTION),
   verbose=0,
   l2_leaf_reg=6.6591278779517808,
   learning_rate=0.01,
   subsample = 0.4,
   allow_const_label=True,loss_function = 'CrossEntropy')


print("fitting naive bayes")
clf.fit(tf_train, y_train)
print("done fitting naive bayes")
print("fitting sgd")
sgd_model.fit(tf_train, y_train)
print("done fitting sgd")

print("predicting with SGD and MNB")
# Probability of the positive class from each linear model.
clf_preds = clf.predict_proba(tf_test)[:,1]
sgd_preds = sgd_model.predict_proba(tf_test)[:,1]

print("done predicting with SGD and MNB")

# kept_feature_indices = []
# for i in range(len(sgd_model.coef_[0])):
#     if abs(sgd_model.coef_[0][i]* feature_stds[i]) > .001:
#         kept_feature_indices.append(i)

# print(f"kept {len(kept_feature_indices)} out of {len(sgd_model.coef_[0])} features")
# Importance proxy: |SGD coefficient| scaled by the feature's standard deviation.
feature_importances = np.abs(sgd_model.coef_[0] * feature_stds)

# Indices of the 6000 features with the largest importance.
# NOTE(review): kept_feature_indices is not used below — feature trimming is effectively disabled.
kept_feature_indices = np.argsort(feature_importances)[-6000:]

# The "trimmed" names are plain aliases of the full matrices; no columns are dropped.
tf_train_trimmed = tf_train
tf_test_trimmed = tf_test

# print("fitting catboost")
# cat.fit(tf_train_trimmed, y_train)
# print("done fitting catboost")
# Both boosted models are fitted here, although the messages mention only lgbm.
print("fitting lgbm")
lgb.fit(tf_train_trimmed, y_train)
cat.fit(tf_train_trimmed, y_train)
print("done fitting lgbm")
gc.collect()

# Despite the name, lgb_preds is a blend of LightGBM and CatBoost probabilities
# (the weights sum to 0.999 rather than 1).
lgb_preds = .666 * lgb.predict_proba(tf_test_trimmed)[:,1] + .333 * cat.predict_proba(tf_test_trimmed)[:,1]
# cat_preds = cat.predict_proba(tf_test_trimmed)[:,1]


In [None]:
# Essay id -> [SGD, naive Bayes, boosted-blend] probabilities, in that order.
test_id_to_prob_pred = {
    example_id: [sgd_pred, mnb_pred, lgb_pred]
    for example_id, sgd_pred, mnb_pred, lgb_pred in zip(test["id"].values, sgd_preds, clf_preds, lgb_preds)
}

In [None]:
unique_prompt_ids = test["prompt_id"].unique()
num_prompt_ids = len(unique_prompt_ids)

# Map each prompt id to its position in range(num_prompt_ids), in order of first appearance.
prompt_id_to_idx = {
    prompt_id: position for position, prompt_id in enumerate(unique_prompt_ids)
}

In [None]:
test_id_to_pseudo_label = {}
test_id_to_heuristic_features = {}
for _, test_row in tqdm(test.iterrows()):
    test_id = test_row["id"]

    # Pseudo label: fixed-weight blend of the [SGD, naive Bayes, boosted] probabilities.
    test_id_to_pseudo_label[test_id] = np.dot(test_id_to_prob_pred[test_id], [0.33, 0.09, 0.57])

    # One-hot encoding of the essay's prompt id.
    prompt_one_hot = [0] * num_prompt_ids
    prompt_one_hot[prompt_id_to_idx[test_row["prompt_id"]]] = 1

    # Flattened language-model heuristics followed by the prompt one-hot.
    test_id_to_heuristic_features[test_id] = unpack_dict(test_quantiles[test_id]) + prompt_one_hot
    

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor

# Collect essay ids in `test` row order, then line up features and pseudo labels with them.
ordered_test_ids = [test_row["id"] for _, test_row in tqdm(test.iterrows())]

heuristic_feature_matrix = np.array(
    [test_id_to_heuristic_features[test_id] for test_id in ordered_test_ids]
)
# The regression target is the pseudo label in (clipped) log-odds space.
pseudo_labels = safe_logodds(
    [test_id_to_pseudo_label[test_id] for test_id in ordered_test_ids]
)

model2 = GradientBoostingRegressor()
model3 = RandomForestRegressor()
model4 = LinearRegression()

# Equal-weight average of gradient boosting and linear regression
# (the random forest is instantiated but not part of the ensemble).
pseudo_regressor = VotingRegressor(
    [('gbr', model2), ('lr', model4)],
    weights=[1,1]
)

print("cross val predicting")
# Fitted and evaluated on the same rows; no cross-validation is actually performed.
pseudo_regressor.fit(heuristic_feature_matrix, pseudo_labels)
pred_from_heuristics = pseudo_regressor.predict(heuristic_feature_matrix)
print("done cross val predicting")

# Weighted mean of the regressor output and the pseudo label, still in log-odds space.
final_preds = [
    (FOLLOWER_MODEL_WEIGHT * pred_from_heuristic + pseudo_label) / (1 + FOLLOWER_MODEL_WEIGHT)
    for pred_from_heuristic, pseudo_label in zip(pred_from_heuristics, pseudo_labels)
]

In [None]:
final_preds = np.array(final_preds)

In [None]:

# Refit the same ensemble on the sequence-classifier log-odds features, targeting the
# current blended predictions, and predict on those same rows.
pred_from_model_features = (
    pseudo_regressor
    .fit(predict_model_features, final_preds)
    .predict(predict_model_features)
)

In [None]:
# Mix the two log-odds predictions 80/20, squash to probabilities, then blend 60/40
# with the DeBERTa probabilities.
blended_logodds = .8 * final_preds + .2 * pred_from_model_features
blended_probs = torch.sigmoid(torch.as_tensor(blended_logodds)).numpy()
final_preds = blended_probs * .6 + ertugrul_deberta_predictions * .4

In [None]:
# Pair each prediction with its essay id and write the submission file.
submission_ids = test["id"].values[:len(final_preds)]
sub = pd.DataFrame({'id': submission_ids, 'generated': final_preds})
sub.to_csv('submission.csv', index=False)
sub.head(30)