In [None]:
#Compiling and Splitting Mixed Datasets into Training and Testing Sets

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import load_data, clean_unnecessary_spaces


#Processing Google Data
def _load_google_split(tsv_path):
    """Read one Google paraphrase TSV and shape it into the shared layout.

    The file is read with ``dtype=str`` so every present value is kept as
    text while missing cells stay as NaN. (Casting with ``.astype(str)``
    after reading would turn missing cells into the literal string "nan",
    which the ``dropna()`` later in this cell could never remove.)

    Parameters
    ----------
    tsv_path : str
        Path to a tab-separated file with ``sentence1`` / ``sentence2`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``input_text``, ``target_text`` and a constant ``prefix``
        column set to "paraphrase".
    """
    pairs = pd.read_csv(tsv_path, sep='\t', dtype=str)
    pairs = pairs.rename(
        columns={"sentence1": "input_text", "sentence2": "target_text"}
    )
    # Keep only the sentence pair; any other columns in the TSV are discarded.
    pairs = pairs[["input_text", "target_text"]]
    # assign() returns a new frame, so no write happens on a slice of `pairs`.
    return pairs.assign(prefix="paraphrase")


training_df = _load_google_split("original-data/mix/google_train.tsv")
evaluation_df = _load_google_split("original-data/mix/google_test.tsv")


#Processing Microsoft Data
# Load each MSR paraphrase split via the project helper, then append it to the
# frames built so far.
# NOTE(review): load_data is defined in utils and not visible here; presumably it
# returns prefix/input_text/target_text columns, since the later column
# selection in this cell relies on them — confirm.
msr_training = load_data(
    "original-data/mix/msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality"
)
training_df = pd.concat([training_df, msr_training])

msr_evaluation = load_data(
    "original-data/mix/msr_paraphrase_test.txt", "#1 String", "#2 String", "Quality"
)
evaluation_df = pd.concat([evaluation_df, msr_evaluation])


#Processing Quora Data
# Quora comes as a single file, so the train/test split is made here rather
# than read from separate files.
temp_df = load_data("original-data/mix/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate")

# Fixed seed: without random_state the shuffle changes on every run, so the
# files written at the end of this cell would not be reproducible.
# The split proportion is left at scikit-learn's default test_size.
q_train, q_test = train_test_split(temp_df, random_state=42)

training_df = pd.concat([training_df, q_train])
evaluation_df = pd.concat([evaluation_df, q_test])


#Cleaning and Organizing Datasets
# Fix the column order, then discard any row with a missing value.
column_order = ["prefix", "input_text", "target_text"]
training_df = training_df[column_order].dropna()
evaluation_df = evaluation_df[column_order].dropna()

# Run the project's whitespace cleaner over both text columns of both splits.
for frame in (training_df, evaluation_df):
    for text_column in ("input_text", "target_text"):
        frame[text_column] = frame[text_column].apply(clean_unnecessary_spaces)

path = 'compiled-data/mix'

# makedirs creates the missing parent ('compiled-data') as well, and
# exist_ok=True makes a re-run of this cell a no-op instead of an error.
# The previous exists()/mkdir() pair failed on a fresh checkout where the
# parent directory was absent.
os.makedirs(path, exist_ok=True)

#Writing Datasets to Files
# The DataFrame index is written as the first column (to_csv default).
training_df.to_csv(f"{path}/mix-training_data.tsv", sep="\t")
evaluation_df.to_csv(f"{path}/mix-testing_data.tsv", sep="\t")

In [27]:
# Stitch the two halves of the Twitter URL data back into a single text file.
part_paths = (
    'original-data/twitter/URL_data_2017_prob_1.txt',
    'original-data/twitter/URL_data_2017_prob_2.txt',
)

parts = []
for part_path in part_paths:
    with open(part_path) as fp:
        parts.append(fp.read())
part1, part2 = parts

# A newline is inserted between the halves.
total = '\n'.join(parts)

with open('original-data/twitter/URL_data_2017_prob.txt', 'w') as fp:
    fp.write(total)

In [15]:
#Moving twitter dataset from txt to tsv

import os
import pandas as pd

# Minimum value of the 'quality' column a pair must reach to be kept.
QUALITY_THRESHOLD = 0.825

# NOTE(review): read_csv uses its default header handling here, so the first
# line of the file is consumed as a header and then replaced by the column
# names assigned below — confirm the file really starts with a header row.
training_df = pd.read_csv("original-data/twitter/URL_data_2017_prob.txt", on_bad_lines='skip', sep='\t').astype(str)

training_df.columns = ['quality', 'input_text', 'target_text']

# Vectorized filter: keep rows whose quality is at or above the threshold.
# Values that parse to NaN compare False and are dropped, exactly as the
# previous row-by-row loop did.
keep_mask = training_df['quality'].astype(float) >= QUALITY_THRESHOLD
training_df = training_df[keep_mask]

# Create the directory that is actually written to. The previous code created
# 'compiled-data/tw' but saved into 'compiled-data/twit0.825/', so the write
# failed whenever that second directory did not already exist.
path = f'compiled-data/twit{QUALITY_THRESHOLD}'
os.makedirs(path, exist_ok=True)

training_df.to_csv(f'{path}/twit{QUALITY_THRESHOLD}-data.tsv', sep="\t")