## Stratified Splitting of Paraphrase Dataset

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

### Importing Dataset
The data consists of four types of news headlines (Bangladesh, entertainment, sports, and world), each stored in its own CSV file.

In [8]:
dataset_filenames = ["bangladesh.csv", "entertainment.csv", "sports.csv", "world.csv"]
data_path = "../../data"

# Build one frame per CSV file: keep only the sentence-pair and label columns,
# tag every row with a topic taken from the file name (the text before the
# first dot), and drop rows that contain missing values.
datasets = [
    pd.read_csv(os.path.join(data_path, csv_name))[["sentence1", "sentence2", "label"]]
    .assign(topic=csv_name.split('.')[0])
    .dropna()
    for csv_name in dataset_filenames
]

### Splitting The Datasets
Here we split the dataset so that every split keeps a similar ratio of paraphrase to non-paraphrase pairs. The ratio of headlines from the different types is also kept similar across the splits.

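Target splitting ratio:
- Train: 70%
- Test: 15%
- Val: 15%

**Note:** the code below currently creates only a train and a validation split, holding out 20% of each headline type for validation (roughly 80% train / 20% val). No separate test split is produced.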

In [10]:
# Fixed seed so the split (and the CSV files written from it) is reproducible
# across kernel restarts; without it every run shuffles the rows differently.
SPLIT_SEED = 42
# Fraction of each topic's rows held out for validation.
VAL_FRACTION = 0.2

train, val = [], []

for dataset in datasets:
    # Arrange the columns in the order used for the output files.
    ordered = dataset[["sentence1", "sentence2", "topic", "label"]]

    # Split the whole frame at once, stratified on the label so both parts keep
    # the same label ratio. Splitting per topic keeps the topic ratio similar
    # in both parts as well.
    train_part, val_part = train_test_split(
        ordered,
        stratify=ordered["label"],
        test_size=VAL_FRACTION,
        random_state=SPLIT_SEED,
    )

    # astype returns a new frame, so no column is assigned into a slice of
    # another frame (which is what triggers pandas' SettingWithCopyWarning).
    train.append(train_part.astype({"label": int}))
    val.append(val_part.astype({"label": int}))

### Combining Different Types of Headlines

In [13]:
# Stack the per-topic pieces into one frame per split; ignore_index=True
# replaces the original row labels with a fresh 0..n-1 index.
train_df, val_df = (
    pd.concat(topic_parts, ignore_index=True) for topic_parts in (train, val)
)
print(train_df.shape, val_df.shape)

(5913, 4) (1480, 4)


### Save Dataframe to CSV Files
The files `train.csv` and `val.csv` are written using relative paths, i.e. into the notebook's current working directory.

In [15]:
# Write each split to its own CSV file, without the row index column.
for split_frame, csv_name in ((train_df, 'train.csv'), (val_df, 'val.csv')):
    split_frame.to_csv(csv_name, index=False)

In [36]:
# Count the validation rows that have label 0 and belong to the
# 'entertainment' topic.
# NOTE(review): despite its name, para_train_df is built from val_df (not the
# training split) and keeps rows with label == 0 - confirm this is intended.
label_is_zero = val_df['label'] == 0
para_train_df = val_df[label_is_zero]

is_entertainment = para_train_df['topic'] == 'entertainment'
topicwise = para_train_df[is_entertainment]

topicwise.shape

(1480, 4)