In [7]:
import gc

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import early_stopping
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

## Aggregating Preprocessed Data for Training

In [8]:
def string_to_array(embedding_str):
    """
    Parse the printed form of an embedding vector back into a numpy array.

    The input is expected to look like "[0.1 -2.3e-04 ... 4.5]": bracket
    characters at either end and whitespace-separated numbers that may be
    wrapped over several lines. Every token is parsed with ``float``, so
    scientific notation is accepted.
    """
    # str.split() with no argument splits on any run of whitespace, which
    # already covers the newlines numpy inserts when printing long arrays.
    tokens = embedding_str.strip('[]').split()
    return np.array(list(map(float, tokens)))


# Scalar per-row features, listed in the order they appear in the output.
# The flag says whether the aggregated value is the group mean (True) or the
# group total (False).
_SCALAR_FEATURES = (
    ('Sentiment_joy', True),
    ('Sentiment_anger', True),
    ('Sentiment_fear', True),
    ('Sentiment_sadness', True),
    ('Sentiment_surprise', True),
    ('Sentiment_Score', True),
    ('Exclamation_Count', False),
    ('Question_Count', False),
    ('Uppercase_Ratio', True),
    ('Repeated_Char_Word_Ratio', True),
    ('Gives_Score', False),
)
# Embedding columns (stored as strings in the CSV); always averaged per group.
_EMBEDDING_FEATURES = ('GloVe_Embedding', 'BERT_Embedding')


def process_large_dataset(filepath, chunk_size=10000, columns_to_load=None, mode='train'):
    """
    Aggregate a large row-level CSV into one row per (ID, PeriodID), reading it in chunks.

    Running sums and row counts are kept per group across chunks, so a group
    whose rows are spread over several chunks is still aggregated correctly.

    Args:
        filepath (str): Path to the CSV file.
        chunk_size (int): Number of rows to process in each chunk.
        columns_to_load (list): Columns to load from the CSV file (None loads all).
        mode (str): With 'train', the first 'EventType' value seen for each group
            is kept as its label. With any other value the 'EventType' column is
            not read from the data and is filled with None in the result.

    Returns:
        pd.DataFrame: One row per (ID, PeriodID) with 'Tweet_Count' (rows in the
        group), the first 'Is_Key_Period'/'EventType' seen, per-group means or
        totals of the scalar features, and the mean GloVe/BERT embedding as arrays.

    Raises:
        ValueError: If the file contains no data rows.
    """
    # Probing a single row for the embedding sizes. Using nrows alone returns a
    # plain DataFrame, so no chunked reader (and file handle) is left open.
    probe = pd.read_csv(filepath, nrows=1, usecols=list(_EMBEDDING_FEATURES))
    if probe.empty:
        raise ValueError(f"No data rows found in {filepath!r}")
    embedding_dims = {
        name: len(string_to_array(probe[name].iloc[0])) for name in _EMBEDDING_FEATURES
    }

    sums = {}  # Running sums per group, used for means and totals
    total_counts = {}  # Number of rows seen per group
    first_values = {}  # First occurrences of the period-wise constant features

    # Reading and processing the CSV file in chunks
    print("Processing chunks...")
    with pd.read_csv(filepath, chunksize=chunk_size, usecols=columns_to_load) as chunks_iterator:
        for chunk in tqdm(chunks_iterator):

            # Converting embedding strings to arrays before processing
            for name in _EMBEDDING_FEATURES:
                chunk[name] = chunk[name].apply(string_to_array)

            for id_group, group in chunk.groupby(['ID', 'PeriodID']):
                # Initializing if this (ID, PeriodID) hasn't been seen before
                if id_group not in sums:
                    group_sums = {name: 0 for name, _ in _SCALAR_FEATURES}
                    for name in _EMBEDDING_FEATURES:
                        group_sums[name] = np.zeros(embedding_dims[name])
                    sums[id_group] = group_sums
                    first_values[id_group] = {
                        'Is_Key_Period': group['Is_Key_Period'].iloc[0],
                        'EventType': group['EventType'].iloc[0] if mode == 'train' else None,
                        'PeriodID': group['PeriodID'].iloc[0],
                        'ID': group['ID'].iloc[0]
                    }
                    total_counts[id_group] = 0

                group_sums = sums[id_group]
                total_counts[id_group] += len(group)

                for name, _ in _SCALAR_FEATURES:
                    group_sums[name] += group[name].sum()
                for name in _EMBEDDING_FEATURES:
                    group_sums[name] += np.sum(np.vstack(group[name]), axis=0)

            # Forcing garbage collection after each chunk
            gc.collect()

    # Computing final aggregated results
    print("Computing final aggregations...")
    result_data = []

    for id_group, count in total_counts.items():
        firsts = first_values[id_group]
        group_sums = sums[id_group]

        result_dict = {
            'ID': firsts['ID'],
            'PeriodID': firsts['PeriodID'],
            'Tweet_Count': count,
            'Is_Key_Period': firsts['Is_Key_Period'],
            'EventType': firsts['EventType'],
        }
        for name, is_mean in _SCALAR_FEATURES:
            result_dict[name] = group_sums[name] / count if is_mean else group_sums[name]
        for name in _EMBEDDING_FEATURES:
            result_dict[name] = group_sums[name] / count

        result_data.append(result_dict)

    # Creating final DataFrame
    return pd.DataFrame(result_data)


def expand_embeddings(aggregated_df):
    """
    Expanding embedding features into separate columns.

    Args:
        aggregated_df (pd.DataFrame): Frame with 'BERT_Embedding' and
            'GloVe_Embedding' columns whose cells are equal-length 1-D arrays.

    Returns:
        tuple: (expanded_df, bert_columns, glove_columns). expanded_df is the
        input without the two array columns plus one 'BERT_<i>' / 'GloVe_<i>'
        column per embedding dimension; the two lists hold those new names.

    Raises:
        ValueError: If aggregated_df has no rows (nothing to stack).
    """
    print("Expanding embeddings...")

    # Stacking each array column into a (rows, dim) matrix
    bert_matrix = np.stack(aggregated_df['BERT_Embedding'].values)
    glove_matrix = np.stack(aggregated_df['GloVe_Embedding'].values)

    # Creating column names
    bert_columns = [f'BERT_{i}' for i in range(bert_matrix.shape[1])]
    glove_columns = [f'GloVe_{i}' for i in range(glove_matrix.shape[1])]

    # Reusing the input index: pd.concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever the
    # input frame is filtered, shuffled or otherwise not range-indexed.
    bert_features = pd.DataFrame(bert_matrix, columns=bert_columns, index=aggregated_df.index)
    glove_features = pd.DataFrame(glove_matrix, columns=glove_columns, index=aggregated_df.index)

    # Combining DataFrames
    expanded_df = pd.concat([
        aggregated_df.drop(columns=['BERT_Embedding', 'GloVe_Embedding']),
        bert_features,
        glove_features
    ], axis=1)

    return expanded_df, bert_columns, glove_columns

In [9]:
def _pca_reduce_bert(fit_df, apply_df, bert_columns, n_components):
    """Fit PCA on fit_df's BERT columns and swap them for 'BERT_PCA_<i>' columns in both frames."""
    pca = PCA(n_components=n_components)
    fit_reduced = pca.fit_transform(fit_df[bert_columns])
    apply_reduced = pca.transform(apply_df[bert_columns])
    pca_columns = [f'BERT_PCA_{i}' for i in range(fit_reduced.shape[1])]

    def _swap(df, reduced):
        reduced_df = pd.DataFrame(reduced, columns=pca_columns, index=df.index)
        return pd.concat([df.drop(columns=bert_columns), reduced_df], axis=1)

    return _swap(fit_df, fit_reduced), _swap(apply_df, apply_reduced)


def _standardize(fit_df, apply_df, columns_not_to_scale):
    """Fit a StandardScaler on fit_df and apply it to copies of both frames."""
    # Derived from the frame being fitted, so the list always matches the
    # columns this particular pipeline actually has.
    columns_to_scale = [col for col in fit_df.columns if col not in columns_not_to_scale]

    scaler = StandardScaler()
    fit_df = fit_df.copy()
    apply_df = apply_df.copy()
    fit_df[columns_to_scale] = scaler.fit_transform(fit_df[columns_to_scale])
    apply_df[columns_to_scale] = scaler.transform(apply_df[columns_to_scale])
    return fit_df, apply_df


def prepare_train_and_test_data(expanded_df_train, expanded_df_test, bert_columns, glove_columns, n_components=0.96):
    """
    Preparing data for model training with dimensionality reduction and scaling.

    Two independent preprocessing pipelines are built:
      * train/validation: PCA and scaler are fitted on the 70% training split
        and applied to the 30% validation split;
      * full/test: PCA and scaler are fitted on the whole training set and
        applied to the test set.

    Because n_components is passed to PCA as-is, a variance-fraction value can
    keep a different number of components in each pipeline. Each pipeline
    therefore works out its own list of columns to scale instead of sharing one.

    Args:
        expanded_df_train (pd.DataFrame): Training frame including 'EventType'.
        expanded_df_test (pd.DataFrame): Test frame with the same columns.
        bert_columns (list): Names of the BERT embedding columns to reduce.
        glove_columns (list): Names of the GloVe embedding columns to drop.
        n_components: Forwarded to sklearn's PCA.

    Returns:
        tuple: (X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test)
    """
    print("Preparing data for training...")

    columns_to_drop = ['ID', 'Sentiment_fear', 'EventType'] + glove_columns
    X_test = expanded_df_test.drop(columns=columns_to_drop)

    X_train_full = expanded_df_train.drop(columns=columns_to_drop)
    y_train_full = expanded_df_train['EventType']

    # Performing train-validation split (before any fitting, so the validation
    # rows never influence the train/validation PCA or scaler)
    X_train_val, X_val, y_train_val, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.3, random_state=42, stratify=y_train_full
    )

    # Dimensionality Reduction for BERT Columns
    print("Reducing dimensionality of BERT features...")
    X_train_val, X_val = _pca_reduce_bert(X_train_val, X_val, bert_columns, n_components)
    X_train_full, X_test = _pca_reduce_bert(X_train_full, X_test, bert_columns, n_components)

    # Scaling Features
    print("Scaling features...")
    columns_not_to_scale = ['Is_Key_Period']
    X_train_val, X_val = _standardize(X_train_val, X_val, columns_not_to_scale)
    X_train_full, X_test = _standardize(X_train_full, X_test, columns_not_to_scale)

    return X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test

In [10]:
# Row-level preprocessed CSVs that process_large_dataset reads and aggregates.
train_filepath = "backup_data/train_preprocessed_data.csv"
test_filepath = "backup_data/test_preprocessed_data.csv"

In [11]:
# Columns read from both CSVs. The training call adds 'EventType' (the label)
# on top of this list.
columns_to_load = ['ID', 'PeriodID', 'Sentiment_Score', 'Sentiment_anger',
                   'Sentiment_fear', 'Sentiment_joy', 'Sentiment_sadness', 'Sentiment_surprise',
                   'Exclamation_Count', 'Question_Count', 'Uppercase_Ratio', 'Repeated_Char_Word_Ratio',
                   'Is_Key_Period', 'Gives_Score', 'BERT_Embedding', 'GloVe_Embedding']

In [12]:
# Aggregate the training rows per (ID, PeriodID); 'EventType' is loaded as well so it can be kept as the label.
aggregated_df_train = process_large_dataset(train_filepath, columns_to_load=columns_to_load + ['EventType'])

Processing chunks...


190it [06:57,  2.20s/it]

Computing final aggregations...





In [13]:
# Same aggregation for the test file; with mode='test' the EventType column is filled with None.
aggregated_df_test = process_large_dataset(test_filepath, columns_to_load=columns_to_load, mode='test')

Processing chunks...


44it [01:35,  2.17s/it]

Computing final aggregations...





In [14]:
# Back up the aggregated training frame. The embedding cells are arrays, so
# they end up in the CSV as text and need parsing again if this file is reloaded.
aggregated_df_train.to_csv("backup_data/aggregated_df_train.csv", index=False)

In [15]:
# Back up the aggregated test frame (embedding cells are written as text, as for the training backup).
aggregated_df_test.to_csv("backup_data/aggregated_df_test.csv", index=False)

In [16]:
# Expanding embeddings into one column per dimension. The column-name lists
# returned for the training frame are the ones used downstream; the lists from
# the test call are discarded.
expanded_df_train, bert_columns, glove_columns = expand_embeddings(aggregated_df_train)
expanded_df_test, _, _ = expand_embeddings(aggregated_df_test)

Expanding embeddings...
Expanding embeddings...


In [17]:
# Preparing data for training: drops unused columns, makes the train/validation
# split, reduces the BERT columns with PCA and standardizes the features.
X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test = prepare_train_and_test_data(
    expanded_df_train, expanded_df_test, bert_columns, glove_columns
)

Preparing data for training...
Reducing dimensionality of BERT features...
Scaling features...


In [18]:
# Sanity check: (rows, columns) of each feature matrix.
X_train_val.shape, X_val.shape, X_train_full.shape, X_test.shape

((1495, 52), (642, 52), (2137, 52), (516, 52))

In [19]:
# Inspect the prepared training split (PCA-reduced and scaled features).
X_train_val

Unnamed: 0,PeriodID,Tweet_Count,Is_Key_Period,Sentiment_joy,Sentiment_anger,Sentiment_sadness,Sentiment_surprise,Sentiment_Score,Exclamation_Count,Question_Count,...,BERT_PCA_29,BERT_PCA_30,BERT_PCA_31,BERT_PCA_32,BERT_PCA_33,BERT_PCA_34,BERT_PCA_35,BERT_PCA_36,BERT_PCA_37,BERT_PCA_38
990,-0.669029,0.756789,0,1.086449,1.040827,-0.581751,-0.048171,0.103142,0.066924,0.614910,...,-0.734746,0.165645,-0.269700,0.189554,-0.404247,-0.864502,-0.721018,0.657024,-0.158007,-0.213906
1154,0.142062,-0.883152,0,-1.500492,-0.064170,-1.725857,-1.955914,0.058652,-0.599646,-0.765917,...,2.425131,-0.712714,-1.562910,-0.839388,2.583584,-0.228995,-1.313845,-1.555132,-1.861253,-0.045465
135,-1.529277,-0.488156,1,0.980022,-0.968852,0.064019,-0.830190,-0.293541,-0.264435,-0.188855,...,0.172768,0.283268,0.349023,0.176919,0.265565,-0.956540,-0.041431,-1.767232,0.000158,0.385434
2117,1.051467,1.452735,1,2.548025,-0.677055,-0.644545,0.745290,-0.278491,1.719863,0.965269,...,0.570673,-0.300139,-1.459794,1.299573,1.598151,-1.363390,0.205022,-0.835523,-0.335301,-0.337912
1741,2.624492,0.887279,0,3.855980,0.296548,0.502003,0.661358,-0.049776,1.305665,0.243942,...,0.410861,-0.160930,-0.679532,0.583262,2.360326,-0.131008,0.265919,-0.586917,-0.064968,0.368721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,0.191219,-0.711517,0,-0.820480,1.001176,0.580035,-1.479330,1.801595,-0.620838,-0.528909,...,-0.350292,0.611932,0.727917,-1.375103,-0.056598,-0.072674,-1.255262,1.639499,0.299046,0.668057
42,-0.496979,-0.458766,1,1.294525,-1.399217,-0.899871,0.781274,-0.710575,-0.162330,-0.549519,...,0.354023,-1.793089,0.244080,0.288398,-2.603467,-1.082955,1.236502,1.662387,1.098130,1.126393
1270,-0.177458,0.197210,1,-0.174809,-0.032300,-0.514685,-0.022723,0.042111,-0.181595,0.398512,...,0.758396,0.604585,0.099973,0.178741,0.862161,1.093153,1.147147,-0.524899,1.071210,0.150822
1724,2.206657,1.686677,0,0.589970,1.494729,0.288221,1.542877,-0.048962,3.561599,0.728262,...,-0.904411,0.181524,-1.270410,-0.557738,0.134887,0.796254,0.431382,-0.113628,0.698378,0.004296


### Baseline models

In [20]:
def get_cross_val_scores(model, X, y):
    """
    Print the 5-fold cross-validated accuracy of ``model`` on ``(X, y)``.

    The mean over the folds is printed first, then the individual fold
    scores. Nothing is returned.
    """
    fold_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
    mean_score = fold_scores.mean()
    print(f"Mean Cross validation Accuracy: {mean_score}")
    print("Individual Cross validation scores: ", fold_scores)


# Evaluation Function
def evaluate_model(model, X, y):
    """
    Print a classification report for an already fitted model.

    Predictions are computed with ``model.predict(X)`` and compared against
    ``y``. The report is only printed; the function returns None.
    """
    y_pred = model.predict(X)

    print("Results:")
    report = classification_report(y, y_pred)
    print(report)

In [21]:
# Logistic Regression
# This cell only configures the estimator; it is fitted in a later cell.
print("Training Logistic Regression...")
logistic_model = LogisticRegression(
    max_iter=1000,
    penalty='l2',  # Default L2 regularization
    C=1.0,  # Inverse regularization strength: lower values regularize more
    random_state=42,
    class_weight='balanced'  # Weight classes inversely to their frequency
)

Training Logistic Regression...


In [22]:
# 5-fold cross-validated accuracy on the training split.
get_cross_val_scores(logistic_model, X_train_val, y_train_val)

Mean Cross validation Accuracy: 0.7585284280936455
Individual Cross validation scores:  [0.77257525 0.76588629 0.77257525 0.75919732 0.72240803]


In [23]:
# Fit on the training split only; X_val is kept for evaluation.
logistic_model.fit(X_train_val, y_train_val)

In [24]:
# In-sample report: the model was fitted on these rows.
evaluate_model(logistic_model, X_train_val, y_train_val)

Results:
              precision    recall  f1-score   support

           0       0.73      0.81      0.77       688
           1       0.82      0.75      0.78       807

    accuracy                           0.78      1495
   macro avg       0.78      0.78      0.78      1495
weighted avg       0.78      0.78      0.78      1495



In [25]:
# Report on the validation split, which was not used for fitting.
evaluate_model(logistic_model, X_val, y_val)

Results:
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       296
           1       0.84      0.76      0.80       346

    accuracy                           0.79       642
   macro avg       0.79      0.79      0.79       642
weighted avg       0.80      0.79      0.79       642



In [26]:
# Random Forest
# This cell only configures the estimator; it is fitted in a later cell.
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=500,  # Number of trees in the forest
    max_depth=10,  # Limit depth to reduce overfitting
    min_samples_split=10,  # Minimum samples required to split
    min_samples_leaf=5,  # Minimum samples in a leaf
    max_features='sqrt',  # Random subset of features at each split
    class_weight='balanced',  # Weight classes inversely to their frequency
    random_state=42
)


Training Random Forest...


In [27]:
# 5-fold cross-validated accuracy on the training split.
get_cross_val_scores(rf_model, X_train_val, y_train_val)

Mean Cross validation Accuracy: 0.7652173913043478
Individual Cross validation scores:  [0.73244147 0.76254181 0.78595318 0.80267559 0.74247492]


In [28]:
# Fit on the training split only; X_val is kept for evaluation.
rf_model.fit(X_train_val, y_train_val)

In [29]:
# In-sample report: the model was fitted on these rows.
evaluate_model(rf_model, X_train_val, y_train_val)

Results:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       688
           1       0.96      0.94      0.95       807

    accuracy                           0.95      1495
   macro avg       0.94      0.95      0.95      1495
weighted avg       0.95      0.95      0.95      1495



In [30]:
# Report on the validation split, which was not used for fitting.
evaluate_model(rf_model, X_val, y_val)

Results:
              precision    recall  f1-score   support

           0       0.77      0.80      0.78       296
           1       0.82      0.80      0.81       346

    accuracy                           0.80       642
   macro avg       0.80      0.80      0.80       642
weighted avg       0.80      0.80      0.80       642



In [31]:
# LightGBM
# This cell only configures the estimator; it is fitted in a later cell.
print("\nTraining LightGBM...")
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,  # Maximum number of boosting iterations
    learning_rate=0.02,  # Lower learning rate for better generalization
    max_depth=6,  # Maximum depth of trees
    num_leaves=31,  # Default value, controls leaf count
    min_child_samples=30,  # Minimum data per leaf (regularization)
    min_child_weight=1e-3,  # Minimum sum of instance weights (Hessian) in a leaf
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.3,  # L2 regularization
    feature_fraction=0.8,  # Fraction of features sampled for each tree
    bagging_fraction=0.8,  # Fraction of rows sampled when bagging
    bagging_freq=1,  # Perform bagging at every iteration
    random_state=42
)


Training LightGBM...


In [None]:
# 5-fold cross-validated accuracy on the training split.
get_cross_val_scores(lgb_model, X_train_val, y_train_val)

In [None]:
# Add early stopping via callbacks.
# NOTE(review): X_val drives the early-stopping decision here and is also the
# split evaluated afterwards, so that validation report is not a fully
# independent estimate for this model.
lgb_model.fit(
    X_train_val, y_train_val,
    eval_set=[(X_val, y_val)],  # Validation set
    eval_metric='logloss',  # Evaluation metric
    callbacks=[early_stopping(stopping_rounds=50)]  # Early stopping callback
)

In [34]:
# In-sample report: the model was fitted on these rows.
evaluate_model(lgb_model, X_train_val, y_train_val)

Results:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       688
           1       0.99      0.98      0.98       807

    accuracy                           0.98      1495
   macro avg       0.98      0.98      0.98      1495
weighted avg       0.98      0.98      0.98      1495



In [35]:
# Report on the validation split (the same split that was passed as eval_set when fitting).
evaluate_model(lgb_model, X_val, y_val)

Results:
              precision    recall  f1-score   support

           0       0.79      0.76      0.78       296
           1       0.80      0.83      0.81       346

    accuracy                           0.80       642
   macro avg       0.80      0.80      0.80       642
weighted avg       0.80      0.80      0.80       642



In [36]:
# XGBoost
# This cell only configures the estimator; it is fitted in a later cell.
print("\nTraining XGBoost...")

xgb_model = xgb.XGBClassifier(
    n_estimators=500,  # Number of boosting rounds
    learning_rate=0.02,  # Lower learning rate for better generalization
    max_depth=6,  # Maximum tree depth
    min_child_weight=5,  # Minimum sum of weights of all observations in a child node
    colsample_bytree=0.8,  # Fraction of features used per tree
    subsample=0.8,  # Fraction of data used per boosting round
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.3,  # L2 regularization
    gamma=1,  # Minimum loss reduction required to make a split
    eval_metric='logloss',  # Logarithmic loss as the evaluation metric
    random_state=42
)


Training XGBoost...


In [37]:
# 5-fold cross-validated accuracy on the training split.
get_cross_val_scores(xgb_model, X_train_val, y_train_val)

Mean Cross validation Accuracy: 0.785284280936455
Individual Cross validation scores:  [0.7826087  0.78929766 0.80267559 0.79264214 0.75919732]


In [38]:
# eval_set only makes XGBoost track the eval metric on the validation split
# while training. No early_stopping_rounds is configured on xgb_model and none
# is passed here, so this call does not stop training early.
xgb_model.fit(
    X_train_val, y_train_val,
    eval_set=[(X_val, y_val)],  # Validation split monitored during boosting
    verbose=False
)

In [39]:
# In-sample report: the model was fitted on these rows.
evaluate_model(xgb_model, X_train_val, y_train_val)

Results:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       688
           1       0.99      0.99      0.99       807

    accuracy                           0.99      1495
   macro avg       0.99      0.99      0.99      1495
weighted avg       0.99      0.99      0.99      1495



In [40]:
# Report on the validation split, which was not used to fit the trees.
evaluate_model(xgb_model, X_val, y_val)

Results:
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       296
           1       0.81      0.83      0.82       346

    accuracy                           0.80       642
   macro avg       0.80      0.80      0.80       642
weighted avg       0.80      0.80      0.80       642



### Submission

#### Training on the whole training set before submission

In [59]:
# Take an unfitted copy with the same hyperparameters. A plain assignment
# would alias rf_model, so refitting best_model on the full training set would
# silently replace the model that the validation cells above were run with.
best_model = clone(rf_model)

In [60]:
# Fit on the whole labelled training set (training and validation rows together).
best_model.fit(
    X_train_full, y_train_full,
)

In [61]:
# Predict the test rows and pair each prediction with its ID. X_test is built
# from expanded_df_test without reordering rows, so the positions line up.
predictions = best_model.predict(X_test)
pred_df = pd.DataFrame({'ID': expanded_df_test['ID'], 'EventType': predictions})

In [62]:
# Sort the final DataFrame numerically by the two "_"-separated parts of ID.
# The integer keys exist only inside this chain and are dropped again before
# the index is renumbered.
id_tokens = pred_df['ID'].str.split('_')
pred_df = (
    pred_df
    .assign(
        ID_First=id_tokens.str[0].astype(int),
        ID_Second=id_tokens.str[1].astype(int),
    )
    .sort_values(by=['ID_First', 'ID_Second'])
    .drop(columns=['ID_First', 'ID_Second'])
    .reset_index(drop=True)
)

In [63]:
# Display the sorted predictions.
pred_df

Unnamed: 0,ID,EventType
0,6_0,0
1,6_1,0
2,6_2,0
3,6_3,0
4,6_4,1
...,...,...
511,16_125,1
512,16_126,1
513,16_127,1
514,16_128,1


In [52]:
# Write the predictions (ID, EventType) to CSV without the index column.
# NOTE(review): the file name says "xgb", but the submission cells above set
# best_model from rf_model — confirm which model produced this file and rename
# it if needed.
submission_file = "best_xgb_predictions.csv"
pred_df.to_csv(submission_file, index=False)