In [4]:
import pandas as pd

# Doğru dosya yolları
train_data_path = r'C:\Users\Lenovo\Desktop\Murat\Kaggle Competition\Contradictory, My Dear Watson\train.csv'
test_data_path = r'C:\Users\Lenovo\Desktop\Murat\Kaggle Competition\Contradictory, My Dear Watson\test.csv'

# Verileri pandas ile yükleme
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Eğitim ve test verilerine genel bakış
print("Training Data Overview:")
print(train_data.head())  # Eğitim verilerinin ilk birkaç satırını gösterir
print("\nTest Data Overview:")
print(test_data.head())  # Test verilerinin ilk birkaç satırını gösterir

# Eğitim verilerindeki eksik değerleri kontrol etme
print("\nMissing values in training data:")
print(train_data.isnull().sum())  # Eğitim setindeki eksik verileri gösterir

# Test verilerindeki eksik değerleri kontrol etme
print("\nMissing values in test data:")
print(test_data.isnull().sum())  # Test setindeki eksik verileri gösterir


Training Data Overview:

           id                                            premise  \

0  5130fd2cb5  and these comments were considered in formulat...   

1  5b72532a0b  These are issues that we wrestle with in pract...   

2  3931fbe82a  Des petites choses comme celles-là font une di...   

3  5622f0c60b  you know they can't really defend themselves l...   

4  86aaa48b45  ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...   



                                          hypothesis lang_abv language  label  

0  The rules developed in the interim were put to...       en  English      0  

1  Practice groups are not permitted to work on t...       en  English      2  

2              J'essayais d'accomplir quelque chose.       fr   French      0  

3  They can't defend themselves because of their ...       en  English      0  

4    เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร       th     Thai      1  



Test Data Overview:

           id                                     

In [8]:
# Eksik verileri kontrol etme
print(train_data.isnull().sum())


id            0

premise       0

hypothesis    0

lang_abv      0

language      0

label         0

dtype: int64


In [10]:
from sklearn.preprocessing import LabelEncoder

# Kategorik sütunları sayısal değerlere dönüştürme
label_encoder = LabelEncoder()

# Örneğin 'lang_abv' ve 'language' sütunlarını dönüştürelim
train_data['lang_abv'] = label_encoder.fit_transform(train_data['lang_abv'])
test_data['lang_abv'] = label_encoder.transform(test_data['lang_abv'])

train_data['language'] = label_encoder.fit_transform(train_data['language'])
test_data['language'] = label_encoder.transform(test_data['language'])


In [12]:
# Özellikler (X) ve hedef değişkeni (y) ayırma
X_train = train_data.drop(columns=['label', 'id'])  # 'label' ve 'id' sütunlarını çıkarıyoruz
y_train = train_data['label']  # Hedef değişken

# Test seti için özellikler
X_test = test_data.drop(columns=['id'])


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

# TF-IDF vektörleştirme için ayrı ayrı vektörleştirici oluşturma
tfidf_premise = TfidfVectorizer(max_features=5000)  # Premise için TF-IDF
tfidf_hypothesis = TfidfVectorizer(max_features=5000)  # Hypothesis için TF-IDF

# Premise ve hypothesis sütunlarını TF-IDF ile vektörleştiriyoruz
X_train_premise = tfidf_premise.fit_transform(train_data['premise'])
X_train_hypothesis = tfidf_hypothesis.fit_transform(train_data['hypothesis'])

X_test_premise = tfidf_premise.transform(test_data['premise'])
X_test_hypothesis = tfidf_hypothesis.transform(test_data['hypothesis'])

# Vektörleştirilmiş premise ve hypothesis verilerini birleştirme
from scipy.sparse import hstack

X_train_combined = hstack([X_train_premise, X_train_hypothesis])
X_test_combined = hstack([X_test_premise, X_test_hypothesis])

# Hedef değişken (label)
y_train = train_data['label']


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Eğitim ve doğrulama setlerine ayırma
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_combined, y_train, test_size=0.2, random_state=42)

# Logistic Regression modelini eğitme
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_train_split, y_train_split)

# Doğrulama seti üzerinde tahmin yapma
y_pred = log_reg.predict(X_val)

# Doğruluk skorunu hesaplama
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Validation Accuracy: 36.06%


In [22]:
# Test seti üzerinde tahmin yapma
test_predictions = log_reg.predict(X_test_combined)

# Sonuç dosyasını oluşturma
submission = pd.DataFrame({
    'id': test_data['id'],
    'prediction': test_predictions
})

# Sonuçları submission.csv dosyasına kaydetme
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")


Submission file created successfully!


In [24]:
!pip install transformers
!pip install torch


Collecting transformers

  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)

     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--

     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--

     ----------------- -------------------- 20.5/44.4 kB 330.3 kB/s eta 0:00:01

     ----------------------------------- -- 41.0/44.4 kB 326.8 kB/s eta 0:00:01

     -------------------------------------- 44.4/44.4 kB 310.3 kB/s eta 0:00:00


Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)

  Downloading huggingface_hub-0.25.1-py3-none-any.whl.metadata (13 kB)






Collecting safetensors>=0.4.1 (from transformers)

  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)

Collecting tokenizers<0.21,>=0.20 (from transformers)

  Downloading tokenizers-0.20.0-cp312-none-win_amd64.whl.metadata (6.9 kB)









Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)

   ---------------------------------------- 0.0/9.9 

In [26]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# PyTorch için veri kümesi sınıfı tanımlama
class NLIDataset(Dataset):
    def __init__(self, premises, hypotheses, labels, tokenizer, max_len):
        self.premises = premises
        self.hypotheses = hypotheses
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.premises)

    def __getitem__(self, index):
        premise = self.premises[index]
        hypothesis = self.hypotheses[index]
        label = self.labels[index]

        # Premise ve Hypothesis metinlerini BERT için tokenleştirme
        encoding = self.tokenizer.encode_plus(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'premise': premise,
            'hypothesis': hypothesis,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# BERT tokenizer'ı başlatma
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Modeli başlatma
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Eğitim ve doğrulama setine ayırma
X_train_split, X_val, y_train_split, y_val = train_test_split(
    list(zip(train_data['premise'], train_data['hypothesis'])),
    train_data['label'],
    test_size=0.2,
    random_state=42
)

# Dataset oluşturma
train_dataset = NLIDataset([x[0] for x in X_train_split], [x[1] for x in X_train_split], y_train_split, tokenizer, max_len=128)
val_dataset = NLIDataset([x[0] for x in X_val], [x[1] for x in X_val], y_val, tokenizer, max_len=128)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Modeli eğitme fonksiyonu
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    total_correct = 0
    total_samples = 0

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        total_correct += torch.sum(preds == labels)
        total_samples += len(labels)

        loss.backward()
        optimizer.step()

    accuracy = total_correct.double() / total_samples
    return accuracy

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Eğitim
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

for epoch in range(3):
    train_acc = train_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch + 1}, Training Accuracy: {train_acc:.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 4299

In [28]:
# Eğitim ve doğrulama veri setlerindeki indeksleri sıfırlayın
train_data = train_data.reset_index(drop=True)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    list(zip(train_data['premise'], train_data['hypothesis'])),
    train_data['label'],
    test_size=0.2,
    random_state=42
)


In [30]:
class NLIDataset(Dataset):
    def __getitem__(self, index):
        try:
            premise = self.premises[index]
            hypothesis = self.hypotheses[index]
            label = self.labels[index]
        except IndexError:
            print(f"Index {index} out of range for premises or labels")
            raise

        # Tokenleştirme işlemi
        encoding = self.tokenizer.encode_plus(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'premise': premise,
            'hypothesis': hypothesis,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [32]:
# Eğitim ve doğrulama veri setlerine ayırırken indeks sıfırlaması ekleyin
X_train_split, X_val, y_train_split, y_val = train_test_split(
    list(zip(train_data['premise'], train_data['hypothesis'])),
    train_data['label'],
    test_size=0.2,
    random_state=42
)

# Bu işlemi yaptıktan sonra, her bir split'te indekslerin sıfırlandığından emin olun
X_train_split = pd.DataFrame(X_train_split).reset_index(drop=True)
X_val = pd.DataFrame(X_val).reset_index(drop=True)
y_train_split = pd.Series(y_train_split).reset_index(drop=True)
y_val = pd.Series(y_val).reset_index(drop=True)


In [34]:
# Modelin eğitimi
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):  # Epoch sayısını artırabilirsiniz
    train_acc = train_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch + 1}, Training Accuracy: {train_acc:.4f}")


KeyError: 3353

In [36]:
# Eğitim ve doğrulama veri setlerinin indekslerini sıfırlama
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

X_train_split, X_val, y_train_split, y_val = train_test_split(
    list(zip(train_data['premise'], train_data['hypothesis'])),
    train_data['label'],
    test_size=0.2,
    random_state=42
)

# Split edilen verilerin de indekslerini sıfırlayın
X_train_split = pd.DataFrame(X_train_split).reset_index(drop=True)
X_val = pd.DataFrame(X_val).reset_index(drop=True)
y_train_split = pd.Series(y_train_split).reset_index(drop=True)
y_val = pd.Series(y_val).reset_index(drop=True)


In [38]:
print(f"Train labels length: {len(y_train_split)}")
print(f"Val labels length: {len(y_val)}")

# İndeks hatası olup olmadığını kontrol edin
try:
    print(f"Accessing label at index 3353: {y_train_split[3353]}")
except KeyError as e:
    print(f"Error: {e}")


Train labels length: 9696

Val labels length: 2424

Accessing label at index 3353: 1


In [40]:
class NLIDataset(Dataset):
    def __getitem__(self, index):
        # İndeksin mevcut olup olmadığını kontrol et
        if index >= len(self.labels):
            raise IndexError(f"Index {index} out of bounds for labels of length {len(self.labels)}")
        
        premise = self.premises[index]
        hypothesis = self.hypotheses[index]
        label = self.labels[index]

        # Premise ve Hypothesis metinlerini BERT için tokenleştirme
        encoding = self.tokenizer.encode_plus(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'premise': premise,
            'hypothesis': hypothesis,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [47]:
# Modeli eğitme ve doğruluk takibi
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Cihazı kontrol etme (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Eğitim döngüsü
for epoch in range(3):  # Daha fazla epoch sayısı artırılabilir
    train_acc = train_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch + 1}, Training Accuracy: {train_acc:.4f}")

    # Validation seti üzerinde doğruluk kontrolü
    val_acc = validate(model, val_loader, device)
    print(f"Epoch {epoch + 1}, Validation Accuracy: {val_acc:.4f}")


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


KeyError: 3378

In [49]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset

# DistilBERT tokenizer ve modelini yükleme
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Cihaz kontrolü (GPU veya CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
class NLIDataset(Dataset):
    def __init__(self, premises, hypotheses, labels, tokenizer, max_len):
        self.premises = premises
        self.hypotheses = hypotheses
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.premises)

    def __getitem__(self, index):
        premise = self.premises[index]
        hypothesis = self.hypotheses[index]
        label = self.labels[index]

        # Premise ve Hypothesis metinlerini DistilBERT için tokenleştirme
        encoding = self.tokenizer.encode_plus(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [53]:
from sklearn.model_selection import train_test_split

# Eğitim ve doğrulama setlerine ayırma
X_train_split, X_val, y_train_split, y_val = train_test_split(
    list(zip(train_data['premise'], train_data['hypothesis'])),
    train_data['label'],
    test_size=0.2,
    random_state=42
)

# Dataset oluşturma
train_dataset = NLIDataset([x[0] for x in X_train_split], [x[1] for x in X_train_split], y_train_split, tokenizer, max_len=128)
val_dataset = NLIDataset([x[0] for x in X_val], [x[1] for x in X_val], y_val, tokenizer, max_len=128)

# DataLoader ile veri yükleyici oluşturma
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [55]:
from transformers import AdamW

# Optimizer tanımlama
optimizer = AdamW(model.parameters(), lr=2e-5)

# Eğitim fonksiyonu
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    total_correct = 0
    total_samples = 0

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        total_correct += torch.sum(preds == labels)
        total_samples += len(labels)

        loss.backward()
        optimizer.step()

    accuracy = total_correct.double() / total_samples
    return accuracy

# Modeli eğitme döngüsü
for epoch in range(3):  # Epoch sayısını artırabilirsiniz
    train_acc = train_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch + 1}, Training Accuracy: {train_acc:.4f}")

    # Validation seti üzerinde doğruluk kontrolü
    val_acc = validate(model, val_loader, device)
    print(f"Epoch {epoch + 1}, Validation Accuracy: {val_acc:.4f}")





KeyError: 5528

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Premise ve Hypothesis sütunlarını birleştiriyoruz
train_data['combined_text'] = train_data['premise'] + ' ' + train_data['hypothesis']
test_data['combined_text'] = test_data['premise'] + ' ' + test_data['hypothesis']

# TF-IDF vektörleştirici
tfidf = TfidfVectorizer(max_features=5000)  # Maksimum 5000 özellik ile sınırladık

# Eğitim ve test setindeki metinleri TF-IDF ile dönüştürme
X_train_tfidf = tfidf.fit_transform(train_data['combined_text'])
X_test_tfidf = tfidf.transform(test_data['combined_text'])

# Hedef değişkeni (label)
y_train = train_data['label']


In [59]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# Eğitim ve doğrulama setlerine ayırma
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# LightGBM modelini başlatma
lgb_model = lgb.LGBMClassifier(random_state=42)

# Modeli eğitme
lgb_model.fit(X_train_split, y_train_split)

# Doğrulama seti üzerinde tahmin yapma
y_val_pred = lgb_model.predict(X_val)

# Doğruluk skorunu hesaplama
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033760 seconds.

You can set `force_col_wise=true` to remove the overhead.

[LightGBM] [Info] Total Bins 31437

[LightGBM] [Info] Number of data points in the train set: 9696, number of used features: 1223

[LightGBM] [Info] Start training from score -1.070244

[LightGBM] [Info] Start training from score -1.138056

[LightGBM] [Info] Start training from score -1.088760

Validation Accuracy: 37.62%


In [61]:
# Test seti üzerinde tahmin yapma
test_predictions = lgb_model.predict(X_test_tfidf)

# Sonuç dosyasını oluşturma
submission = pd.DataFrame({
    'id': test_data['id'],
    'prediction': test_predictions
})

# Sonuçları submission.csv dosyasına kaydetme
submission.to_csv('submission_lgb.csv', index=False)
print("LightGBM submission file created successfully!")


LightGBM submission file created successfully!


In [63]:
from sklearn.model_selection import GridSearchCV

# LightGBM için parametre aralığı belirleme
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1]
}

# GridSearchCV ile en iyi parametreleri bulma
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_split, y_train_split)

# En iyi parametreleri yazdırma
print(f"Best Parameters: {grid_search.best_params_}")

# En iyi modeli seçme
best_model = grid_search.best_estimator_

# Validation seti üzerinde en iyi modelin performansını kontrol etme
y_val_pred_best = best_model.predict(X_val)
best_val_accuracy = accuracy_score(y_val, y_val_pred_best)
print(f"Best Validation Accuracy: {best_val_accuracy * 100:.2f}%")


Fitting 3 folds for each of 27 candidates, totalling 81 fits

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032345 seconds.

You can set `force_row_wise=true` to remove the overhead.

And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Total Bins 31437

[LightGBM] [Info] Number of data points in the train set: 9696, number of used features: 1223

[LightGBM] [Info] Start training from score -1.070244

[LightGBM] [Info] Start training from score -1.138056

[LightGBM] [Info] Start training from score -1.088760



























































Best Parameters: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}

Best Validation Accuracy: 41.50%


In [65]:
from sklearn.model_selection import GridSearchCV

# LightGBM için daha geniş bir parametre aralığı
param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [10, 20, 30, 50],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'num_leaves': [20, 30, 40],
    'min_child_samples': [20, 30, 40],
    'subsample': [0.6, 0.8, 1.0]
}

# GridSearchCV ile en iyi parametreleri bulma
grid_search = GridSearchCV(estimator=lgb.LGBMClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_split, y_train_split)

# En iyi parametreleri yazdırma
print(f"Best Parameters: {grid_search.best_params_}")

# En iyi modeli seçme
best_model = grid_search.best_estimator_

# Validation seti üzerinde en iyi modelin performansını kontrol etme
y_val_pred_best = best_model.predict(X_val)
best_val_accuracy = accuracy_score(y_val, y_val_pred_best)
print(f"Best Validation Accuracy: {best_val_accuracy * 100:.2f}%")


Fitting 3 folds for each of 1728 candidates, totalling 5184 fits

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036590 seconds.

You can set `force_row_wise=true` to remove the overhead.

And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Total Bins 31437

[LightGBM] [Info] Number of data points in the train set: 9696, number of used features: 1223

[LightGBM] [Info] Start training from score -1.070244

[LightGBM] [Info] Start training from score -1.138056

[LightGBM] [Info] Start training from score -1.088760

Best Parameters: {'learning_rate': 0.001, 'max_depth': 30, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 30, 'subsample': 0.6}

Best Validation Accuracy: 41.91%


In [67]:
# En iyi parametrelerle modeli yeniden eğitme
best_params = grid_search.best_params_

# En iyi parametrelerle LightGBM modelini başlatma
best_lgb_model = lgb.LGBMClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    num_leaves=best_params['num_leaves'],
    min_child_samples=best_params['min_child_samples'],
    subsample=best_params['subsample'],
    random_state=42
)

# En iyi model ile eğitim setini yeniden eğitme
best_lgb_model.fit(X_train_split, y_train_split)

# Validation seti üzerinde en iyi modelin performansını kontrol etme
y_val_pred_best = best_lgb_model.predict(X_val)
best_val_accuracy = accuracy_score(y_val, y_val_pred_best)
print(f"Validation Accuracy with Best Parameters: {best_val_accuracy * 100:.2f}%")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037779 seconds.

You can set `force_row_wise=true` to remove the overhead.

And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Total Bins 31437

[LightGBM] [Info] Number of data points in the train set: 9696, number of used features: 1223

[LightGBM] [Info] Start training from score -1.070244

[LightGBM] [Info] Start training from score -1.138056

[LightGBM] [Info] Start training from score -1.088760

Validation Accuracy with Best Parameters: 41.91%


In [69]:
# Test seti üzerinde tahmin yapma
test_predictions_best = best_lgb_model.predict(X_test_tfidf)

# Sonuçları submission dosyasına kaydetme
submission = pd.DataFrame({
    'id': test_data['id'],
    'prediction': test_predictions_best
})

# Sonuçları submission.csv dosyasına kaydetme
submission.to_csv('submission_lgb_best.csv', index=False)
print("Submission file with best parameters created successfully!")


Submission file with best parameters created successfully!
