In [None]:
import pandas as pd
import re
from google.colab import drive
# Mount Google Drive at /content/drive; the last cell of the notebook copies
# the generated CSV splits into a folder underneath this mount point.
drive.mount('/content/drive')

# Data Loading

In [None]:
# Load the two source datasets (dataset 1 first, dataset 2 second).
df_1, df_2 = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sql_create_context.csv',
        '/content/synthetic_text_to_sql.csv',
    )
]

# Processing

Append a terminating `;` to the `context` and `answer` columns of dataset 1

In [None]:
# Terminate every SQL string in dataset 1 with ";".
# Values that already end with ";" are left untouched, so re-running this cell
# (or feeding it pre-terminated data) no longer produces ";;".
for column in ('context', 'answer'):
    # na=False: missing values count as "not terminated"; adding ";" to a
    # missing value still yields a missing value, exactly as plain `+ ";"` did.
    already_terminated = df_1[column].str.endswith(';', na=False)
    df_1[column] = df_1[column].where(already_terminated, df_1[column] + ';')

Remove `INSERT INTO` statements from the `sql_context` column of dataset 2

In [None]:
# Compiled once at definition time; the function is applied to every row.
_INSERT_STATEMENT_RE = re.compile(
    r"""
    \bINSERT\s+INTO\b               # statement keywords (any case, any spacing)
    (?:
        (?:
            '(?:[^'\\]|\\[\s\S])*'  # single-quoted literal, backslash escapes allowed
          | "(?:[^"\\]|\\[\s\S])*"  # double-quoted literal or identifier
          | [^;'"]                  # any other character except the terminator
        )*
        (?:;|\Z)                    # terminator, or end of input for a trailing statement
      |
        [\s\S]*?;                   # fallback for unbalanced quotes: up to the first semicolon
    )
    """,
    re.IGNORECASE | re.VERBOSE,
)


def remove_insert_statements(sql):
    """Return ``sql`` with every ``INSERT INTO`` statement removed.

    A statement runs from the ``INSERT INTO`` keywords up to and including its
    terminating ``;``, or to the end of the string when the last statement has
    no terminator. Quoted literals are consumed as a unit, so a ``;`` inside a
    quoted value no longer cuts the match short and leaves a fragment behind.
    A doubled quote inside a literal parses as two adjacent literals, which
    covers the same text. When the quotes of a statement are unbalanced the
    match falls back to "everything up to the first ``;``".

    Text outside the removed statements, including the whitespace that
    separated them, is returned unchanged.

    Args:
        sql: SQL text, possibly holding several ``;``-separated statements.

    Returns:
        The input string without its ``INSERT INTO`` statements.

    Raises:
        TypeError: if ``sql`` is not a string (e.g. a missing value).
    """
    return _INSERT_STATEMENT_RE.sub('', sql)

# Run the INSERT-statement removal over every value of the sql_context column.
df_2['sql_context'] = df_2['sql_context'].map(remove_insert_statements)

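Now we merge both datasets: give them the same column names, concatenate them, shuffle the rows, and drop the rows whose context is empty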

In [None]:
# Give both frames the same column labels so they can be stacked.
# The assignment is positional: pandas raises ValueError if a frame does not
# have exactly three columns, but it cannot detect a different column order.
df_1.columns = ['prompt','context','answer']
df_2.columns = ['prompt','context','answer']

df = pd.concat([df_1, df_2], ignore_index=True)

# Shuffle with a fixed seed (the same value the train/test split uses) so that
# re-running the notebook gives the same row order and therefore the same splits.
df = df.sample(frac=1, random_state=42)

# Drop rows whose context is empty or whitespace-only. Comparing against only
# "" and " " missed contexts such as "  ", which is what remains when a context
# consisted of three or more INSERT statements. Missing values compare unequal
# to "" and are kept, as before.
has_context = df['context'].str.strip().ne("")

# Reset the index after filtering so the final frame has a contiguous index.
df = df.loc[has_context].reset_index(drop=True)

# Create train, validation and test datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows; the remaining 75% becomes the training set.
train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42)

# Split the held-out quarter 60/40: 15% of all rows for validation, 10% for testing.
validation_df, test_df = train_test_split(temp_df, test_size=0.4, random_state=42)

# Report how many rows ended up in each split.
for message, split in (
    ("Training set size:", train_df),
    ("Validation set size:", validation_df),
    ("Test set size:", test_df),
):
    print(message, len(split))

Training set size: 133920
Validation set size: 26784
Test set size: 17856


# Storing

In [None]:
# Write the validation split to the working directory, without the index column.
validation_df.to_csv(path_or_buf='validation.csv', index=False)

In [None]:
# Write the test split to the working directory, without the index column.
test_df.to_csv(path_or_buf='test.csv', index=False)

In [None]:
# Write the training split to the working directory, without the index column.
train_df.to_csv(path_or_buf='train.csv', index=False)

In [None]:
!cp train.csv "/content/drive/My Drive/text2sql_data/"
!cp test.csv "/content/drive/My Drive/text2sql_data/"
!cp validation.csv "/content/drive/My Drive/text2sql_data/"