In [1]:
!pip install sentence_transformers
!pip install peft
!pip install demoji
!pip install spacy
!pip install scikit-learn
!python -m spacy download en_core_web_sm

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.1
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl.metadata (9.2 kB)
Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import pandas as pd
# A trailing backslash here used to join this line with the next import,
# which is a syntax error; each import must be its own statement.
import re
import torch
import demoji
import spacy

# Compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Load shared NLP resources once at the top level ---

# spaCy English pipeline; provides the tokenization, lemmas and stop-word
# flags used by clear_columns.clean_text.
nlp = spacy.load('en_core_web_sm')
# Fetch the demoji emoji codes only if no previous download is recorded.
# NOTE(review): recent demoji releases reportedly bundle the codes and
# deprecate both calls below - confirm and drop this guard if so.
if not demoji.last_downloaded_timestamp():
    demoji.download_codes()

# --- Feature Extractor Transformer ---

class feature_extractor(BaseEstimator, TransformerMixin):
    """
    Column-selecting transformer for pandas DataFrames.

    Requested columns that are absent from the input are skipped silently
    instead of raising.

    Args:
        features: Names of the columns to keep, in the order they should appear.
    """
    def __init__(self, features: list[str]):
        self.features = features

    def fit(self, X, y=None):
        # Stateless: nothing is learned from X.
        return self

    def transform(self, X):
        # Intersect the requested names with the frame's columns, preserving
        # the order given in ``self.features``.
        available = X.columns
        selected = list(filter(lambda name: name in available, self.features))
        # Hand back an independent copy of the selection.
        return X[selected].copy()


# --- Clear Columns Transformer ---
class clear_columns(BaseEstimator, TransformerMixin):
    """
    Normalizes free-text columns of a DataFrame.

    Every non-null cell of the selected columns goes through ``clean_text``:
    - converted to a lowercase string;
    - emoji replaced with their textual description;
    - URLs replaced with the literal token ``url``;
    - HTML tags and character entities replaced with the literal token ``html``;
    - runs of spaces collapsed to a single space;
    - stop words removed and the remaining tokens lemmatized with the
      module-level spaCy pipeline ``nlp``.

    Args:
        features: List of column names to clean. Names that are not present
            in the input frame are skipped.
    """
    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        return self  # No fitting required

    def transform(self, data):
        """Return a cleaned copy of ``data``; the input frame is not modified."""
        # Work on a copy: assigning into ``data`` directly would rewrite the
        # caller's DataFrame as a side effect of calling transform().
        cleaned = data.copy()
        for feature in self.features:
            if feature in cleaned.columns:
                cleaned[feature] = cleaned[feature].apply(self.clean_text)
        return cleaned

    def clean_text(self, text):
        """Clean a single cell value; null values are returned unchanged."""
        if pd.isnull(text):
            return text  # Preserve missing values as-is

        text = str(text).lower()
        # Emoji -> textual description.
        text = demoji.replace_with_desc(text)
        # URLs (scheme optional before '://') -> literal token 'url'.
        text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b', 'url', text)
        # HTML tags and character entities -> literal token 'html'.
        text = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', 'html', text)
        # Collapse runs of spaces.
        text = re.sub(' +', ' ', text).strip()
        # Keep the lemma of every token that is not a stop word.
        text = ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])

        return text.strip()


# --- Merge Columns Transformer ---
class merge_columns(BaseEstimator, TransformerMixin):
    """
    Merges multiple text columns into one, with an option to drop the original columns.

    Args:
        features: List of column names to merge, in concatenation order.
        new_feature_name: The name of the new merged column.
        drop_original: If True, drops the original columns after merging,
            except a column whose name equals ``new_feature_name``.
    """
    def __init__(self, features, new_feature_name='merged_text', drop_original=True):
        self.features = features
        self.new_feature_name = new_feature_name
        self.drop_original = drop_original

    def fit(self, X, y=None):
        return self  # No fitting required

    def transform(self, X):
        """Return a copy of ``X`` with the merged column added; ``X`` itself is not modified."""
        # Work on a copy: adding the merged column to ``X`` directly would
        # alter the caller's DataFrame as a side effect.
        merged = X.copy()
        merged[self.new_feature_name] = merged[self.features].apply(self.merge_text, axis=1)

        # Drop the source columns if requested, but never the column that
        # now holds the merged text.
        if self.drop_original:
            features_to_drop = [f for f in self.features if f != self.new_feature_name]
            if features_to_drop:
                merged = merged.drop(columns=features_to_drop)

        return merged

    def merge_text(self, row):
        """Join the non-null values of ``self.features`` in ``row`` with single spaces."""
        return (' '.join([str(row[feature]) for feature in self.features if pd.notnull(row[feature])])).strip()

  from tqdm.autonotebook import tqdm, trange


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier shared by the classifier and its tokenizer.
model_name = 'microsoft/deberta-v3-large'

# Sequence-classification model; no label count is passed here, so the
# library/checkpoint default is used.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Tokenizer matching the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def read_datastore(
    train_path: str = '/kaggle/input/dataset/train.csv',
    test_path: str = '/kaggle/input/dataset/test.csv',
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read the train and test CSV files into DataFrames.

    Args:
        train_path: Location of the training CSV. The default is the path
            used when this notebook was run on Kaggle; pass another path
            when running elsewhere.
        test_path: Location of the test CSV. Same remark as ``train_path``.

    Returns:
        A ``(train_data, test_data)`` pair of DataFrames.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    return train_data, test_data

def tokenize_dataset(dataset):
    """Apply the module-level ``tokenizer`` to the ``'text'`` field of ``dataset`` and return its output."""
    texts = dataset['text']
    return tokenizer(texts)

In [5]:
# Load the train and test DataFrames.
df_train, df_test = read_datastore()

In [6]:
# Preview the raw training frame (rendered as this cell's output).
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [7]:
# Every column of df_train except the last one.
# NOTE(review): all_features and new_feature_name are not read in this cell -
# confirm they are still needed.
all_features = df_train.columns[:-1].tolist()
new_feature_name = 'merged_text'

# Training preprocessing: keep only the 'text' and 'target' columns.
# Disabled step, kept for reference:
#     ('clean_columns', clear_columns(features=['text'])),
pipeline = Pipeline(steps=[
    ('feature_extractor', feature_extractor(features=['text', 'target'])),
])

# Run the pipeline on the training frame.
transformed_df = pipeline.fit_transform(df_train)

In [8]:
# Preview the preprocessed training frame (rendered as this cell's output).
transformed_df

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [9]:
import datasets

# Convert the preprocessed frame to a datasets.Dataset.
train_dataset = datasets.Dataset.from_pandas(transformed_df)

# Single-split DatasetDict, then in order: rename 'target' to 'label',
# cast 'label' to a two-class ClassLabel, and tokenize in batches.
dataset = (
    datasets.DatasetDict({"train": train_dataset})
    .rename_column('target', 'label')
    .cast_column('label', datasets.ClassLabel(num_classes=2, names=['negative', 'positive']))
    .map(tokenize_dataset, batched=True)
)
dataset

Casting the dataset:   0%|          | 0/7613 [00:00<?, ? examples/s]

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7613
    })
})

In [10]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(p):
    """
    Binary precision, recall and F1 for a prediction object.

    Args:
        p: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the argmax along axis 1 is taken as the
            predicted class).

    Returns:
        Dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    true_labels = p.label_ids
    predicted_labels = p.predictions.argmax(axis=1)
    scores = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')
    # scores is (precision, recall, f1, support); zip stops after the three
    # named entries, so support is not reported.
    return dict(zip(('precision', 'recall', 'f1'), scores))

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import numpy as np

In [12]:
import wandb

# Credentials must not be stored in the notebook. Called without a key,
# wandb.login() uses the WANDB_API_KEY environment variable or previously
# stored credentials, and otherwise prompts for the key.
# NOTE(security): the API key that used to be hard-coded on this line has been
# exposed and should be revoked in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [13]:
## Training arguments
training_args = TrainingArguments(
    report_to='none',
    output_dir='./results',
    # No evaluation and no checkpointing during training.
    evaluation_strategy='no',
    save_strategy='no',
    logging_strategy='steps',
    logging_steps=10,
    # Mixed precision only when a CUDA device is present, so the cell also
    # runs on a CPU-only runtime instead of failing at start-up.
    fp16=torch.cuda.is_available(),
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # With a per-device batch of 1 this gives 8 samples per optimizer step.
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # NOTE(review): evaluation is disabled above, so this setting presumably
    # has no effect - confirm and remove, or enable evaluation.
    eval_steps=16,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
    # No evaluation split is provided; compute_metrics is only relevant when
    # labelled data is evaluated.
    eval_dataset=None,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Persist the fine-tuned model weights and config.
# NOTE(review): the directory is named "first_lora" although no LoRA/PEFT
# adapter is applied in this notebook - confirm the intended name.
model.save_pretrained("./models/first_lora")


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
10,0.6988
20,0.7
30,0.6725
40,0.6669
50,0.6037
60,0.6502
70,0.6657
80,0.6251
90,0.5526
100,0.4706


In [14]:
# Test preprocessing: keep only the 'id' and 'text' columns.
# Disabled steps, kept for reference:
#     ('merge_columns', merge_columns(features=['location', 'text'], new_feature_name='text')),
#     ('clean_columns', clear_columns(features=['text'])),
pipeline = Pipeline(steps=[
    ('feature_extractor', feature_extractor(features=['id', 'text'])),
])

# Run the pipeline on the test frame.
transformed_df_test = pipeline.fit_transform(df_test)

In [15]:
# Inspect the 'id' column of the preprocessed test frame.
transformed_df_test['id']

0           0
1           2
2           3
3           9
4          11
        ...  
3258    10861
3259    10865
3260    10868
3261    10874
3262    10875
Name: id, Length: 3263, dtype: int64

In [16]:
# Wrap the preprocessed test frame in a single-split DatasetDict and tokenize
# it with the same function and batching used for the training split.
test_dataset = datasets.Dataset.from_pandas(transformed_df_test)
dataset_test = datasets.DatasetDict({"test": test_dataset}).map(tokenize_dataset, batched=True)

# trainer.predict returns a three-element result; only the first element
# (the per-class scores) is used.
prediction_output = trainer.predict(dataset_test['test'])
predictions = prediction_output[0]

# One row per test example: its id and the arg-max class.
test_ids = dataset_test['test']['id']
predicted_targets = predictions.argmax(axis=1)
submission_df = pd.DataFrame({
    'id': test_ids,
    'target': predicted_targets
})
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [17]:
# Preview the submission frame (rendered as this cell's output).
submission_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
# model.save_pretrained("./new_models1/model")
# tokenizer.save_pretrained("./new_tokenizer1/tokenizer")