In [None]:
# Compile the Mix dataset and split it into a larger training set and a smaller testing set
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import load_data, clean_unnecessary_spaces


# Processing Google Data
def _load_google_pairs(tsv_path):
    """Load one Google TSV split as input_text / target_text / prefix rows.

    Args:
        tsv_path: Path to a tab-separated file with ``sentence1`` and
            ``sentence2`` columns.

    Returns:
        DataFrame with string columns ``input_text`` and ``target_text`` plus a
        constant ``prefix`` column set to "paraphrase".
    """
    pairs = pd.read_csv(tsv_path, sep='\t').rename(
        columns={"sentence1": "input_text", "sentence2": "target_text"}
    )
    # Drop rows with a missing sentence BEFORE casting to str: astype(str)
    # turns NaN into the literal text "nan", which a later dropna() cannot see.
    pairs = pairs[["input_text", "target_text"]].dropna().astype(str)
    pairs["prefix"] = "paraphrase"
    return pairs


# NOTE(review): no label column is used to filter the Google rows here, whereas
# the other sources are loaded with a label column name — confirm these files
# contain only the pairs that should be kept.
training_df = _load_google_pairs("original-data/mix/google_train.tsv")
testing_df = _load_google_pairs("original-data/mix/google_test.tsv")


# Processing Microsoft data
# Each Microsoft split is loaded on its own, then appended to the matching
# split accumulated so far (training to training, testing to testing).
msr_training_df = load_data(
    "original-data/mix/msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality"
)
training_df = pd.concat([training_df, msr_training_df])

msr_testing_df = load_data(
    "original-data/mix/msr_paraphrase_test.txt", "#1 String", "#2 String", "Quality"
)
testing_df = pd.concat([testing_df, msr_testing_df])


# Processing Quora data
# Quora ships as a single file, so it is split here into a train and a test part.
temp_df = load_data("original-data/mix/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate")

# A fixed seed makes the shuffle reproducible: without it every run of this cell
# writes a different train/test partition, so results cannot be compared across
# runs. No test_size is given, so scikit-learn's default proportion applies.
q_train, q_test = train_test_split(temp_df, random_state=42)

# Concatenating Quora to the Microsoft and Google data
training_df = pd.concat([training_df, q_train])
testing_df = pd.concat([testing_df, q_test])


# Cleaning and Organizing testing and training data set
# For each split: fix the column order, discard rows with a missing value in any
# of the three columns, then run clean_unnecessary_spaces over both text columns.
training_df = (
    training_df.loc[:, ["prefix", "input_text", "target_text"]]
    .dropna()
    .assign(
        input_text=lambda frame: frame["input_text"].apply(clean_unnecessary_spaces),
        target_text=lambda frame: frame["target_text"].apply(clean_unnecessary_spaces),
    )
)

testing_df = (
    testing_df.loc[:, ["prefix", "input_text", "target_text"]]
    .dropna()
    .assign(
        input_text=lambda frame: frame["input_text"].apply(clean_unnecessary_spaces),
        target_text=lambda frame: frame["target_text"].apply(clean_unnecessary_spaces),
    )
)


# Ensures directory exists for save location
# makedirs also creates the missing parent ("compiled-data"), where os.mkdir
# would raise FileNotFoundError; exist_ok=True makes re-running the cell safe.
path = 'compiled-data/mix'
os.makedirs(path, exist_ok=True)

# Writing the mix data set to training and testing tsv files
# (the DataFrame index is written as the first, unnamed column)
training_df.to_csv(os.path.join(path, "mix-training_data.tsv"), sep="\t")
testing_df.to_csv(os.path.join(path, "mix-testing_data.tsv"), sep="\t")

In [None]:
# The original Twitter data set is stored as two txt files.
# Read both and write them back out as a single file for later compilation.

contents = []
for source_path in (
    'original-data/twitter/URL_data_2017_prob_1.txt',
    'original-data/twitter/URL_data_2017_prob_2.txt',
):
    with open(source_path) as fp:
        contents.append(fp.read())
part1, part2 = contents

# A newline is placed between the parts so the last line of the first file
# cannot run into the first line of the second.
total = '\n'.join(contents)

with open('original-data/twitter/URL_data_2017_prob.txt', 'w') as fp:
    fp.write(total)

In [None]:
# Processing Twitter data

import os
import pandas as pd

# Filters Twitter data based on the PWI quality of the paraphrasing pair
# Minimum quality score (inclusive) a pair must have to be kept.
MIN_QUALITY = 0.825

# NOTE(review): read_csv treats the first line of the file as a header row,
# which is then overwritten by the column names below. If the txt file has no
# header line, its first pair is silently lost — confirm against the data.
training_df = pd.read_csv("original-data/twitter/URL_data_2017_prob.txt", on_bad_lines='skip', sep='\t').astype(str)
training_df.columns = ['quality', 'input_text', 'target_text']

# Vectorised replacement for a row-by-row iterrows() loop: the quality strings
# are parsed to floats once and compared in bulk. Rows whose quality is NaN
# compare False and are dropped; the original index labels of kept rows are
# preserved, so the saved file is unchanged.
keep_mask = training_df['quality'].astype(float) >= MIN_QUALITY
training_df = training_df[keep_mask]


# Ensures directory exists for save location
# makedirs also creates the missing parent ("compiled-data"), where os.mkdir
# would raise FileNotFoundError; exist_ok=True makes re-running the cell safe.
path = 'compiled-data/twit0.825'
os.makedirs(path, exist_ok=True)

# Writing Twitter set with PWI quality greater than or equal to 0.825 to a tsv file
# (the DataFrame index is written as the first, unnamed column)
training_df.to_csv(os.path.join(path, 'twit0.825-data.tsv'), sep="\t")