In [None]:
#Compiling and Splitting Mixed Datasets into Training and Testing Sets

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import load_data, clean_unnecessary_spaces


#Processing Google Data
training_df = pd.read_csv("original-data/mix-data/google_train.tsv", sep='\t').astype(str)
evaluation_df = pd.read_csv("original-data/mix-data/google_test.tsv", sep='\t').astype(str)

training_df = training_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)
evaluation_df = evaluation_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)

training_df = training_df[["input_text", "target_text"]]
evaluation_df = evaluation_df[["input_text", "target_text"]]

training_df["prefix"] = "paraphrase"
evaluation_df["prefix"] = "paraphrase"


#Processing Microsoft Data
training_df = pd.concat(
    [
        training_df,
        load_data("original-data/mix-data/msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality"),
    ]
)
evaluation_df = pd.concat(
    [
        evaluation_df,
        load_data("original-data/mix-data/msr_paraphrase_test.txt", "#1 String", "#2 String", "Quality"),
    ]
)


#Processing Quora Data
temp_df = load_data("original-data/mix-data/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate")

q_train, q_test = train_test_split(temp_df)

q_train.to_csv("original-data/mix-data/quora_train.tsv", sep="\t")
q_test.to_csv("original-data/mix-data/quora_test.tsv", sep="\t")

training_df = pd.concat([training_df, q_train])
evaluation_df = pd.concat([evaluation_df, q_test])


#Cleaning and Organizing Datasets
training_df = training_df[["prefix", "input_text", "target_text"]]
evaluation_df = evaluation_df[["prefix", "input_text", "target_text"]]

training_df = training_df.dropna()
evaluation_df = evaluation_df.dropna()

training_df["input_text"] = training_df["input_text"].apply(clean_unnecessary_spaces)
training_df["target_text"] = training_df["target_text"].apply(clean_unnecessary_spaces)

evaluation_df["input_text"] = evaluation_df["input_text"].apply(clean_unnecessary_spaces)
evaluation_df["target_text"] = evaluation_df["target_text"].apply(clean_unnecessary_spaces)


#Uploading Datasets to Files
training_df.to_csv("compiled-data/mix/mix-training_data.tsv", sep="\t")
evaluation_df.to_csv("compiled-data/mix/mix-testing_data.tsv", sep="\t")

In [3]:
#Moving twitter dataset from txt to tsv

import os
import pandas as pd

training_df = pd.read_csv("original-data/twitter-data/URL_data_2017_prob.txt", on_bad_lines='skip', sep='\t').astype(str)

training_df.columns = ['quality', 'input_text', 'target_text']

to_drop = []
for index, row in training_df.iterrows():
    if (index % 10000 == 0):
        print("Row : " + str(index))
        
    qual = float(row.quality)
    if qual >= 0.825:
        continue
    else:
        to_drop.append(index)

training_df = training_df.drop(to_drop)

training_df.to_csv('compiled-data/twit0.825/twit0.825-data.tsv', sep="\t")

Row : 0
Row : 10000
Row : 20000
Row : 30000
Row : 40000
Row : 50000
Row : 60000
Row : 70000
Row : 80000
Row : 90000
Row : 100000
Row : 110000
Row : 120000
Row : 130000
Row : 140000
Row : 150000
Row : 160000
Row : 170000
Row : 180000
Row : 190000
Row : 200000
Row : 210000
Row : 220000
Row : 230000
Row : 240000
Row : 250000
Row : 260000
Row : 270000
Row : 280000
Row : 290000
Row : 300000
Row : 310000
Row : 320000
Row : 330000
Row : 340000
Row : 350000
Row : 360000
Row : 370000
Row : 380000
Row : 390000
Row : 400000
Row : 410000
Row : 420000
Row : 430000
Row : 440000
Row : 450000
Row : 460000
Row : 470000
Row : 480000
Row : 490000
Row : 500000
Row : 510000
Row : 520000
Row : 530000
Row : 540000
Row : 550000
Row : 560000
Row : 570000
Row : 580000
Row : 590000
Row : 600000
Row : 610000
Row : 620000
Row : 630000
Row : 640000
Row : 650000
Row : 660000
Row : 670000
Row : 680000
Row : 690000
Row : 700000
Row : 710000
Row : 720000
Row : 730000
Row : 740000
Row : 750000
Row : 760000
Row : 770000
