In [None]:
import pandas as pd
import numpy as np

```
BnPC Whole:     3,426 ; 38.99% Para
                5,361 ; 61.01% Non Para

BnPC Test:       Total: 1394
                 Number of pairs with label 0: 823, Total% 9.37%, nonpara% 15.35%
                 Number of pairs with label 1: 571, Total% 6.50%, para% 16.67%
                 Percentage of label 0 data: 59.04%
                 Percentage of label 1 data: 40.96%

BnPC Train:      Total: 5919
                 Number of pairs with label 0: 3666, Total% 41.72%, nonpara% 68.38%
                 Number of pairs with label 1: 2253, Total% 25.64%, para% 65.76%
                 Percentage of label 0 data: 61.94%
                 Percentage of label 1 data: 38.06%

BnPC Validation: Total: 1474
                 Number of pairs with label 0: 872, Total% 9.92%, nonpara% 16.27%
                 Number of pairs with label 1: 602, Total% 6.85%, para% 17.57%
                 Percentage of label 0 data: 59.16%
                 Percentage of label 1 data: 40.84%

```

In [None]:
# Load the raw BUET train / test / validation splits, decoding each CSV as UTF-8.
buet_raw_paths = ('BUET_train.csv', 'BUET_test.csv', 'BUET_val.csv')
train_df, test_df, val_df = [
    pd.read_csv(csv_path, encoding='utf-8') for csv_path in buet_raw_paths
]


In [None]:
# On every split, rename "source" -> "sentence1" and "target" -> "sentence2".
# The frames are modified in place, so the existing variables keep pointing at them.
sentence_columns = {'source': 'sentence1', 'target': 'sentence2'}
for split_df in (train_df, test_df, val_df):
    split_df.rename(columns=sentence_columns, inplace=True)

In [None]:

# Give every row of every split a constant "label" column equal to 1.
for split_df in (train_df, test_df, val_df):
    split_df['label'] = 1

In [None]:

# Write each modified split to its "*_modified.csv" file, leaving out the index column.
modified_outputs = (
    (train_df, 'BUET_train_modified.csv'),
    (test_df, 'BUET_test_modified.csv'),
    (val_df, 'BUET_val_modified.csv'),
)
for split_df, csv_path in modified_outputs:
    split_df.to_csv(csv_path, index=False)

In [None]:
# Re-read the modified splits from disk (UTF-8), rebinding the three frame variables.
buet_modified_paths = (
    'BUET_train_modified.csv',
    'BUET_test_modified.csv',
    'BUET_val_modified.csv',
)
train_df, test_df, val_df = [
    pd.read_csv(csv_path, encoding='utf-8') for csv_path in buet_modified_paths
]

In [None]:


# Build the training mixtures. Each output file combines a fresh random sample
# of rows from BUET_train_modified.csv with a random sample of label-0 rows
# from BnPC_train.csv, and is written to BUET_train_bnpc_<i>.csv.
N_TRAIN_FILES = 3     # NOTE(review): the output saved with this cell lists five files - confirm the intended count
N_BUET_TRAIN = 2253   # BUET rows per file
N_BNPC_TRAIN = 3666   # BnPC label-0 rows per file

buet_train_modified = pd.read_csv('BUET_train_modified.csv')
bnpc_train = pd.read_csv('BnPC_train.csv')

# The label-0 pool is identical for every file, so filter it once up front.
bnpc_filtered = bnpc_train[bnpc_train['label'] == 0]

for i in range(1, N_TRAIN_FILES + 1):
    # The file number doubles as the random seed, so each file is reproducible.
    buet_sample = buet_train_modified.sample(n=N_BUET_TRAIN, random_state=i)
    bnpc_sample = bnpc_filtered.sample(n=N_BNPC_TRAIN, random_state=i)

    # pd.concat replaces DataFrame.append, which raised a FutureWarning here
    # and no longer exists in pandas 2.x.
    final_df = pd.concat([buet_sample, bnpc_sample], ignore_index=True)

    # Drop the rows just used so BUET samples never overlap between files.
    buet_train_modified = buet_train_modified.drop(buet_sample.index)

    final_df.to_csv(f'BUET_train_bnpc_{i}.csv', index=False)

    # Report progress only once the file has actually been written.
    print(f"Processed BUET_train_bnpc_{i}.csv")

  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)


Processed BUET_train_bnpc_1.csv
Processed BUET_train_bnpc_2.csv


  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)


Processed BUET_train_bnpc_3.csv
Processed BUET_train_bnpc_4.csv


  final_df = final_df.append(combined_df, ignore_index=True)


Processed BUET_train_bnpc_5.csv


In [None]:

# Build the test mixtures. Each output file combines a fresh random sample of
# rows from BUET_test_modified.csv with a random sample of label-0 rows from
# BnPC_test.csv, and is written to BUET_test_bnpc_<i>.csv.
N_TEST_FILES = 5     # number of output files
N_BUET_TEST = 571    # BUET rows per file
N_BNPC_TEST = 823    # BnPC label-0 rows per file

buet_test_modified = pd.read_csv('BUET_test_modified.csv')
bnpc_test = pd.read_csv('BnPC_test.csv')

# The label-0 pool is identical for every file, so filter it once up front.
bnpc_filtered = bnpc_test[bnpc_test['label'] == 0]

for i in range(1, N_TEST_FILES + 1):
    # The file number doubles as the random seed, so each file is reproducible.
    buet_sample = buet_test_modified.sample(n=N_BUET_TEST, random_state=i)
    bnpc_sample = bnpc_filtered.sample(n=N_BNPC_TEST, random_state=i)

    # Build this file's frame from scratch. The previous version kept appending
    # to one frame created before the loop, so file i also contained the rows
    # of files 1..i-1. pd.concat also replaces DataFrame.append, which no
    # longer exists in pandas 2.x.
    final_df = pd.concat([buet_sample, bnpc_sample], ignore_index=True)

    # Drop the rows just used so BUET samples never overlap between files.
    buet_test_modified = buet_test_modified.drop(buet_sample.index)

    final_df.to_csv(f'BUET_test_bnpc_{i}.csv', index=False)

    # Report progress only once the file has actually been written.
    print(f"Processed BUET_test_bnpc_{i}.csv")

Processed BUET_test_bnpc_1.csv
Processed BUET_test_bnpc_2.csv
Processed BUET_test_bnpc_3.csv
Processed BUET_test_bnpc_4.csv
Processed BUET_test_bnpc_5.csv


  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)


In [None]:

# Build the validation mixtures. Each output file combines a fresh random
# sample of rows from BUET_val_modified.csv with a random sample of label-0
# rows from BnPC_val.csv, and is written to BUET_val_bnpc_<i>.csv.
N_VAL_FILES = 5     # number of output files
N_BUET_VAL = 602    # BUET rows per file
N_BNPC_VAL = 872    # BnPC label-0 rows per file

buet_val_modified = pd.read_csv('BUET_val_modified.csv')
bnpc_val = pd.read_csv('BnPC_val.csv')

# The label-0 pool is identical for every file, so filter it once up front.
bnpc_filtered = bnpc_val[bnpc_val['label'] == 0]

for i in range(1, N_VAL_FILES + 1):
    # The file number doubles as the random seed, so each file is reproducible.
    buet_sample = buet_val_modified.sample(n=N_BUET_VAL, random_state=i)
    bnpc_sample = bnpc_filtered.sample(n=N_BNPC_VAL, random_state=i)

    # Build this file's frame from scratch. The previous version kept appending
    # to one frame created before the loop, so file i also contained the rows
    # of files 1..i-1. pd.concat also replaces DataFrame.append, which no
    # longer exists in pandas 2.x.
    final_df = pd.concat([buet_sample, bnpc_sample], ignore_index=True)

    # Drop the rows just used so BUET samples never overlap between files.
    buet_val_modified = buet_val_modified.drop(buet_sample.index)

    final_df.to_csv(f'BUET_val_bnpc_{i}.csv', index=False)

    # Report progress only once the file has actually been written.
    print(f"Processed BUET_val_bnpc_{i}.csv")

  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)
  final_df = final_df.append(combined_df, ignore_index=True)


Processed BUET_val_bnpc_1.csv
Processed BUET_val_bnpc_2.csv
Processed BUET_val_bnpc_3.csv
Processed BUET_val_bnpc_4.csv
Processed BUET_val_bnpc_5.csv
