In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
from itertools import product
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
from transformers import pipeline
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [2]:
# Loading data from Processing notebook

data = np.load("combined_representations.npz", allow_pickle=True)
X_train_combined = data["X_train"]
X_val_combined = data["X_val"]
test_combined = data["test"]
y_train = data["y_train"]
y_val = data["y_val"]

In [3]:
test_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.26894767,
         0.28410659, -0.05457572],
       [ 0.        ,  0.        ,  0.        , ..., -0.18193519,
         0.56684933,  0.1033013 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.15412894,
         0.41844097, -0.1535055 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.20144454,
         0.56314219, -0.01820332],
       [ 0.        ,  0.        ,  0.11503044, ..., -0.09101021,
         0.50645848,  0.04244453],
       [ 0.        ,  0.        ,  0.        , ..., -0.29217856,
         0.44455622,  0.144129  ]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.12193654,
        -0.0445922 , -0.00576038],
       [ 0.        ,  0.        ,  0.        , ..., -0.1794887 ,
         0.44336581,  0.22292026],
       [ 0.        ,  0.08749985,  0.08937212, ..., -0.03605678,
         0.48175168,  0.20329235],
       ...,
       [ 0.        ,  0.        ,  0. 

In [4]:
X_val_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.28789246,
         0.48497257,  0.15152374],
       [ 0.        ,  0.        ,  0.        , ..., -0.06563767,
         0.40370935,  0.0725085 ],
       [ 0.        ,  0.        ,  0.13880652, ..., -0.16953171,
         0.3015066 ,  0.14860522],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.10403167,
        -0.09614274,  0.04147864],
       [ 0.        ,  0.        ,  0.        , ..., -0.16573102,
         0.38612467,  0.25377321],
       [ 0.        ,  0.        ,  0.        , ..., -0.121358  ,
         0.53756869,  0.1067788 ]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.36170898,
         0.30741972,  0.08949168],
       [ 0.        ,  0.        ,  0.        , ..., -0.11652934,
         0.34135978,  0.214483  ],
       [ 0.        ,  0.        ,  0.        , ..., -0.20559426,
         0.22731568,  0.1786795 ],
       ...,
       [ 0.        ,  0.        ,  0. 

In [5]:
X_train_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.18555058,
         0.44161261, -0.0627408 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.0345583 ,
         0.04763634, -0.02800948],
       [ 0.        ,  0.        ,  0.        , ..., -0.08426904,
         0.34936657,  0.0997809 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.21587891,
         0.39432905,  0.11549282],
       [ 0.        ,  0.        ,  0.        , ..., -0.23952458,
         0.445471  ,  0.19499586],
       [ 0.        ,  0.        ,  0.        , ..., -0.12781864,
         0.43982258,  0.14028625]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.28434646,
        -0.07843604, -0.0471223 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.0291326 ,
         0.04489934, -0.0054476 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.25649588,
         0.346195  ,  0.22571714],
       ...,
       [ 0.        ,  0.        ,  0. 

In [6]:
y_val.shape

(1562,)

In [7]:
y_train.shape

(4686,)

### Preparing data for the models

In [8]:
# For X_train_combined

train_dict = X_train_combined.item()

description_features_train = train_dict['description']
host_about_features_train = train_dict['host_about']
comments_features_train = train_dict['comments']

X_train_merged = np.concatenate((description_features_train, host_about_features_train, comments_features_train), axis=1)
print("Shape of merged feature matrix:", X_train_merged.shape)

Shape of merged feature matrix: (4686, 1800)


In [9]:
# For X_val_combined

val_dict = X_val_combined.item()

description_features_val = val_dict['description']
host_about_features_val = val_dict['host_about']
comments_features_val = val_dict['comments']

X_val_merged = np.concatenate((description_features_val, host_about_features_val, comments_features_val), axis=1)
print("Shape of merged feature matrix:", X_val_merged.shape)

Shape of merged feature matrix: (1562, 1800)


In [10]:
# For test_combined

test_dict = test_combined.item()

description_features_test = test_dict['description']
host_about_features_test = test_dict['host_about']
comments_features_test = test_dict['comments']

X_test_merged = np.concatenate((description_features_test, host_about_features_test, comments_features_test), axis=1)
print("Shape of merged feature matrix:", X_test_merged.shape)

Shape of merged feature matrix: (695, 1800)


# Models
##### Grid Searches are small because that's not the main point

### Logistic Regression

In [11]:
# Define values for grid search
parameters = {
    'C': [0.005, 0.01, 0.05],  
    'penalty': ['l1', 'l2'],  
    'solver': ['liblinear', 'saga'],      
}

best_f1 = 0
best_params = None

# Progress bar
total_combinations = len(parameters['C']) * len(parameters['penalty']) * len(parameters['solver'])
pbar = tqdm(total=total_combinations)

# Iterate over all combinations of parameters
for C, penalty, solver in product(parameters['C'], parameters['penalty'], parameters['solver']):

    pbar.update(1)
    lr = LogisticRegression(C=C, penalty=penalty, solver=solver, random_state=0)

    lr.fit(X_train_merged, y_train)
    f1 = f1_score(y_val, lr.predict(X_val_merged), average='weighted')
    
    # Check if score is the best
    if f1 > best_f1:
        best_f1 = f1
        best_params = {'C': C, 'penalty': penalty, 'solver': solver}

pbar.close()

# Use the best parameter combination found
print("Best parameters found:", best_params)
best_lr = LogisticRegression(**best_params, random_state=0)
best_lr.fit(X_train_merged, y_train)

# Predictions
lr_pred = best_lr.predict(X_val_merged)

print("F1 score on validation set with best parameters:", best_lr.score(X_val_merged, y_val))
print(classification_report(y_val, lr_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, lr_pred))

f1 = f1_score(y_val, lr_pred, average='weighted')
print("F1 score on validation set:", f1)

100%|██████████| 12/12 [00:25<00:00,  2.12s/it]


Best parameters found: {'C': 0.05, 'penalty': 'l2', 'solver': 'saga'}
F1 score on validation set with best parameters: 0.8597951344430218
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      1135
           1       0.70      0.85      0.77       427

    accuracy                           0.86      1562
   macro avg       0.82      0.86      0.83      1562
weighted avg       0.87      0.86      0.86      1562

Confusion Matrix:
[[981 154]
 [ 65 362]]
F1 score on validation set: 0.8635507089522677


In [12]:
# Best F1: 0.8635507089522677 ; for: {'C': 0.05, 'penalty': 'l2', 'solver': 'saga'}

### K Nearest Neighbors

In [13]:
# Values for grid search
parameters = {
    'n_neighbors': [3, 5, 7],    
    'weights': ['uniform', 'distance'],    
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],   
}

best_f1 = 0
best_params = None

# Progress bar
total_combinations = len(parameters['n_neighbors']) * len(parameters['weights']) * len(parameters['algorithm'])
pbar = tqdm(total=total_combinations)

# Iterate over all combinations of parameters
for n_neighbors, weights, algorithm in product(parameters['n_neighbors'], parameters['weights'], parameters['algorithm']):

    pbar.update(1)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
    
    knn.fit(X_train_merged, y_train)
    knn_pred = knn.predict(X_val_merged)
    
    f1 = f1_score(y_val, knn_pred, average='weighted')
    
    # Check if score is the best
    if f1 > best_f1:
        best_f1 = f1
        best_params = {'n_neighbors': n_neighbors, 'weights': weights, 'algorithm': algorithm}

pbar.close()

# Use the best model found
print("Best parameters found:", best_params)
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train_merged, y_train)

# Predictions
knn_pred = best_knn.predict(X_val_merged)

print("F1 score on validation set with best parameters:", best_knn.score(X_val_merged, y_val))
print(classification_report(y_val, knn_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, knn_pred))

f1 = f1_score(y_val, knn_pred, average='weighted')
print("F1 score on validation set:", f1)

100%|██████████| 24/24 [04:55<00:00, 12.30s/it]


Best parameters found: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree'}
F1 score on validation set with best parameters: 0.8623559539052497
              precision    recall  f1-score   support

           0       0.92      0.89      0.90      1135
           1       0.73      0.78      0.76       427

    accuracy                           0.86      1562
   macro avg       0.82      0.84      0.83      1562
weighted avg       0.87      0.86      0.86      1562

Confusion Matrix:
[[1014  121]
 [  94  333]]
F1 score on validation set: 0.8636367002481635


In [14]:
# Best F1: 0.8636367002481635 ; for: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree'}

### MLP

In [15]:
# Parameters for grid search
parameters = {
    'hidden_layer_sizes': [(2, 2), (5, 5), (2, 2, 2)],    
    'activation': ['logistic', 'relu', 'tanh'],    
    'solver': ['lbfgs', 'sgd'],     
}

best_f1 = 0
best_params = None

# Progress bar
total_combinations = len(parameters['hidden_layer_sizes']) * len(parameters['activation']) * len(parameters['solver'])
pbar = tqdm(total=total_combinations)

# Perform grid search
for hidden_layer_sizes, activation, solver in product(parameters['hidden_layer_sizes'], parameters['activation'], parameters['solver']):

    pbar.update(1)
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, random_state=0)

    mlp.fit(X_train_merged, y_train)
    mlp_pred = mlp.predict(X_val_merged)

    f1 = f1_score(y_val, mlp_pred, average='weighted')
    
    # Check if best score
    if f1 > best_f1:
        best_f1 = f1
        best_params = {'hidden_layer_sizes': hidden_layer_sizes, 'activation': activation, 'solver': solver}

pbar.close()

# Use the best model found by manual search
print("Best parameters found:", best_params)
best_mlp = MLPClassifier(**best_params, random_state=0)
best_mlp.fit(X_train_merged, y_train)

# Predictions
mlp_pred = best_mlp.predict(X_val_merged)

print("F1 score on validation set with best parameters:", best_mlp.score(X_val_merged, y_val))
print(classification_report(y_val, mlp_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, mlp_pred))

f1 = f1_score(y_val, mlp_pred, average='weighted')
print("F1 score on validation set:", f1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Best parameters found: {'hidden_layer_sizes': (2, 2), 'activation': 'relu', 'solver': 'lbfgs'}
F1 score on validation set with best parameters: 0.8719590268886044
              precision    recall  f1-score   support

           0       0.93      0.90      0.91      1135
           1       0.75      0.81      0.78       427

    accuracy                           0.87      1562
   macro avg       0.84      0.85      0.84      1562
weighted avg       0.88      0.87      0.87      1562

Confusion Matrix:
[[1017  118]
 [  82  345]]
F1 score on validation set: 0.8735169555405469


In [16]:
# Best F1: 0.8735169555405469 ; for: {'hidden_layer_sizes': (2, 2), 'activation': 'relu', 'solver': 'lbfgs'}

# Extra Models

### Random Forest

In [17]:
# Values for grid search
parameters_rf = {
    'n_estimators': [100, 200],     
    'max_depth': [None, 10, 20],      
    'min_samples_split': [2, 5, 10],      
    'min_samples_leaf': [1, 2]     
}

best_f1_rf = 0
best_params_rf = None

# Progression bar
total_combinations_rf = len(parameters_rf['n_estimators']) * len(parameters_rf['max_depth']) * len(parameters_rf['min_samples_split']) * len(parameters_rf['min_samples_leaf'])
pbar_rf = tqdm(total=total_combinations_rf)

# Perform grid search
for n_estimators, max_depth, min_samples_split, min_samples_leaf in product(parameters_rf['n_estimators'], parameters_rf['max_depth'], parameters_rf['min_samples_split'], parameters_rf['min_samples_leaf']):

    pbar_rf.update(1)

    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=0)
    rf.fit(X_train_merged, y_train)
    
    # Make predictions
    rf_pred = rf.predict(X_val_merged)
    f1_rf = f1_score(y_val, rf_pred, average='weighted')
    
    # Check if it's the best score
    if f1_rf > best_f1_rf:
        best_f1_rf = f1_rf
        best_params_rf = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}

pbar_rf.close()

# Use the best model found
print("Best parameters found for Random Forest:", best_params_rf)
best_rf = RandomForestClassifier(**best_params_rf, random_state=0)
best_rf.fit(X_train_merged, y_train)

# Predictions
rf_pred = best_rf.predict(X_val_merged)

print("F1 score on validation set with best parameters for Random Forest:", best_rf.score(X_val_merged, y_val))
print(classification_report(y_val, rf_pred))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_val, rf_pred))

f1_rf = f1_score(y_val, rf_pred, average='weighted')
print("F1 score on validation set for Random Forest:", f1_rf)

100%|██████████| 36/36 [15:46<00:00, 26.28s/it]


Best parameters found for Random Forest: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}
F1 score on validation set with best parameters for Random Forest: 0.885403329065301
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1135
           1       0.77      0.83      0.80       427

    accuracy                           0.89      1562
   macro avg       0.85      0.87      0.86      1562
weighted avg       0.89      0.89      0.89      1562

Confusion Matrix for Random Forest:
[[1028  107]
 [  72  355]]
F1 score on validation set for Random Forest: 0.8867618796745956


In [18]:
# Best F1: 0.8867618796745956 ; for: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}

### Transformer based Models

In [19]:
#pip install transformers[torch] accelerate

In [31]:
# Load the data
data = np.load("transformers_embedding.npz")

X_train_tf = data['X_train']
X_val_tf = data['X_val']
test_tf = data['test']
y_train_tf = data['y_train']
y_val_tf = data['y_val']

In [32]:
X_train_tf

array([[-0.06845481, -0.02141805,  0.02666341, ...,  0.07339355,
         0.12587708,  0.26303443],
       [-0.07528951, -0.00832612,  0.0243064 , ..., -0.09779873,
         0.11954173,  0.22345306],
       [-0.04998843, -0.01504492,  0.10069712, ...,  0.04953092,
         0.11734806,  0.2702796 ],
       ...,
       [-0.09108053, -0.02430411,  0.11410011, ...,  0.06817398,
         0.06884317,  0.25061488],
       [-0.04224196, -0.046772  ,  0.06011906, ...,  0.04457448,
         0.06357309,  0.26463324],
       [-0.06942889, -0.04473429,  0.03644358, ...,  0.03346692,
         0.13133377,  0.28527457]], dtype=float32)

In [33]:
y_val_tf

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim, num_labels):
        super(SimpleClassifier, self).__init__()
        self.classifier = nn.Linear(input_dim, num_labels)
    
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        logits = self.classifier(input_ids)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return (loss, logits) if loss is not None else logits

# Define input dimension 
input_dim = X_train_tf.shape[1]
num_labels = 2

# Initialize the custom model
model = SimpleClassifier(input_dim=input_dim, num_labels=num_labels)

class EmbeddingsDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
    
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.embeddings[idx], dtype=torch.float),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item


train_dataset = EmbeddingsDataset(X_train_tf, y_train_tf)
eval_dataset = EmbeddingsDataset(X_val_tf, y_val_tf)

In [24]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.25,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()

Step,Training Loss
500,0.5307
1000,0.3826
1500,0.3548
2000,0.3511
2500,0.3476
3000,0.3481
3500,0.3439
4000,0.345


TrainOutput(global_step=4410, training_loss=0.3726497208991018, metrics={'train_runtime': 60.0921, 'train_samples_per_second': 2339.411, 'train_steps_per_second': 73.387, 'total_flos': 0.0, 'train_loss': 0.3726497208991018, 'epoch': 30.0})

In [25]:
# Get predictions on the validation dataset
predictions = trainer.predict(eval_dataset)

# Extract predicted labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

f1 = f1_score(y_val_tf, predicted_labels, average='weighted')  # or 'macro', 'micro' depending on your needs

print(f"F1 Score: {f1}")

F1 Score: 0.8627553443175473


In [26]:
# F1: 0.8627553443175473

### Random Forest with Transformer Embedding

In [34]:
# Ensure data is in array form

X_train_embeddings = np.array(X_train_tf)
X_val_embeddings = np.array(X_val_tf)
y_train = np.array(y_train_tf)
y_val = np.array(y_val_tf)

In [35]:
# Values for grid search
parameters_rf = {
    'n_estimators': [100, 200],     
    'max_depth': [None, 10, 20],      
    'min_samples_split': [2, 5, 10],      
    'min_samples_leaf': [1, 2]     
}

best_f1_rf = 0
best_params_rf = None

# Progression bar
total_combinations_rf = len(parameters_rf['n_estimators']) * len(parameters_rf['max_depth']) * len(parameters_rf['min_samples_split']) * len(parameters_rf['min_samples_leaf'])
pbar_rf = tqdm(total=total_combinations_rf)

# Perform grid search
for n_estimators, max_depth, min_samples_split, min_samples_leaf in product(parameters_rf['n_estimators'], parameters_rf['max_depth'], parameters_rf['min_samples_split'], parameters_rf['min_samples_leaf']):

    pbar_rf.update(1)

    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=0)
    rf.fit(X_train_embeddings, y_train)
    
    # Make predictions
    rf_pred = rf.predict(X_val_embeddings)
    f1_rf = f1_score(y_val, rf_pred, average='weighted')
    
    # Check if it's the best score
    if f1_rf > best_f1_rf:
        best_f1_rf = f1_rf
        best_params_rf = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}
        best_rf_tf = rf  # Save the best model

pbar_rf.close()

print(f"Best F1 Score: {best_f1_rf}")
print(f"Best Hyperparameters: {best_params_rf}")

# Evaluate the best model on validation data
print("F1 score on validation set with best parameters for Random Forest:", best_rf_tf.score(X_val_embeddings, y_val))
print("Classification Report for Random Forest:")
print(classification_report(y_val, rf_pred))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_val, rf_pred))

f1_rf = f1_score(y_val, rf_pred, average='weighted')
print("F1 score on validation set for Random Forest:", f1_rf)

100%|██████████| 36/36 [31:13<00:00, 52.04s/it]

Best F1 Score: 0.8908299716215484
Best Hyperparameters: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}
F1 score on validation set with best parameters for Random Forest: 0.8892445582586428
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      1135
           1       0.76      0.84      0.80       427

    accuracy                           0.89      1562
   macro avg       0.85      0.87      0.86      1562
weighted avg       0.89      0.89      0.89      1562

Confusion Matrix for Random Forest:
[[1023  112]
 [  67  360]]
F1 score on validation set for Random Forest: 0.8871125898806071





In [36]:
# Best F1: 0.8871125898806071 ; For: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}

## Predictions on Test
##### Using the model with the best score in validation -> Random Forest with transformer embedding

In [37]:
# Ensure the merged datasets are DataFrames
if isinstance(X_train_merged, np.ndarray):
    X_train_merged = pd.DataFrame(X_train_merged)
if isinstance(X_val_merged, np.ndarray):
    X_val_merged = pd.DataFrame(X_val_merged)
if isinstance(y_train, np.ndarray):
    y_train = pd.Series(y_train)
if isinstance(y_val, np.ndarray):
    y_val = pd.Series(y_val)

In [38]:
# Decided to train on the entire dataset for the predictions on test set
X_train_combined = pd.concat([X_train_merged, X_val_merged])
y_train_combined = pd.concat([y_train, y_val])

# Use model with best score from validation
best_rf_tf.fit(X_train_combined, y_train_combined)
test_predictions = best_rf_tf.predict(X_test_merged)

# Creating the output file
predictions_df = pd.DataFrame({'id': range(1, len(test_predictions) + 1), 'predicted': test_predictions})
predictions_df.to_csv('Predictions_04.csv', index=False)

In [None]:
########## End of the notebook #############

In [None]:
######### End of the notebook ##############