In [1]:
import numpy as np
import pandas as pd
import os

# Directory holding the raw TrustPilot tagged files; it is prepended to each
# file name with plain string concatenation, so the trailing slash is required.
dataset_path = "../dataset/TrustPilot/"

# Output locations of the JSON Lines files written for each data split.
valid_filename = "../dataset/TrustPilot_processed/valid.jsonl"
train_filename = "../dataset/TrustPilot_processed/train.jsonl"
test_filename = "../dataset/TrustPilot_processed/test.jsonl"

In [2]:
def load_web_eng(filename = ""):
    """Read a tagged corpus file with one ``word<TAB>tag`` pair per line.

    A blank line ends the current sentence. Tokens whose tag is ``-NONE-``
    are skipped, and every kept word is lower-cased. A final sentence that is
    not followed by a blank line is still returned.

    Args:
        filename: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A ``(doc, tags)`` tuple of parallel lists: ``doc[i]`` is the list of
        lower-cased words of sentence ``i`` and ``tags[i]`` the list of the
        corresponding tags.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    doc = []
    tags = []
    sent_w = []
    sent_t = []
    # True once a non-blank line has been read since the last sentence boundary.
    pending = False

    # The context manager guarantees the handle is closed, even if a malformed
    # line makes the unpacking below raise.
    with open(filename, "r", encoding='utf8') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line == '':
                # Sentence boundary: store what was collected and start afresh.
                doc.append(sent_w)
                tags.append(sent_t)
                sent_w = []
                sent_t = []
                pending = False
            else:
                pending = True
                w, t = line.split('\t')
                if t != "-NONE-":
                    sent_w.append(w.lower())
                    sent_t.append(t)

    # Without this flush the last sentence would be lost whenever the file
    # does not end with a blank line.
    if pending:
        doc.append(sent_w)
        tags.append(sent_t)
    return doc, tags

def load_trustpilots():
    """Load every TrustPilot tagged file together with per-sentence labels.

    The age and gender labels are read from the codes embedded in each file
    name (``en.<AGE>-<REGION>-<GENDER>.data...``) rather than from the file's
    position in the list, so reordering or extending ``filenames`` cannot
    silently mislabel sentences.

    Returns:
        A tuple ``(all_sents, all_tags, genders, ages)`` where ``all_sents``
        and ``all_tags`` are the concatenated outputs of ``load_web_eng`` for
        all files, and ``genders`` / ``ages`` are NumPy arrays with one entry
        per sentence (gender: 1 = F, 0 = M; age: 1 = O45, 0 = U35).

    Raises:
        KeyError: If a file name carries an age or gender code other than
            ``O45``/``U35`` or ``F``/``M``.
    """
    filenames = [
        "en.O45-UKC1_WORST-F.data.TT.tagged.gold",
        "en.O45-UKC1_WORST-M.data.TT.tagged.gold",
        "en.O45-UKH2_SOSO-F.data.TT.tagged.gold",
        "en.O45-UKH2_SOSO-M.data.TT.tagged.gold",
        "en.O45-UKN0_BEST-F.data.TT.tagged.gold",
        "en.O45-UKN0_BEST-M.data.TT.tagged.gold",
        "en.U35-UKC1_WORST-F.data.TT.tagged.gold",
        "en.U35-UKC1_WORST-M.data.TT.tagged.gold",
        "en.U35-UKH2_SOSO-F.data.TT.tagged.gold",
        "en.U35-UKH2_SOSO-M.data.TT.tagged.gold",
        "en.U35-UKN0_BEST-F.data.TT.tagged.gold",
        "en.U35-UKN0_BEST-M.data.TT.tagged.gold",
    ]
    age_labels = {"O45": 1, "U35": 0}   # over 45 / under 35
    gender_labels = {"F": 1, "M": 0}

    all_sents = []
    all_tags = []
    all_genders = []
    all_ages = []
    for filename in filenames:
        sents, tags = load_web_eng(os.path.join(dataset_path, filename))

        # "en.O45-UKC1_WORST-F.data..." -> "O45-UKC1_WORST-F" -> ("O45", "UKC1_WORST", "F")
        age_code, _region, gender_code = filename.split(".")[1].split("-")

        all_sents.extend(sents)
        all_tags.extend(tags)
        # Every sentence of a file shares that file's labels.
        all_genders.extend([gender_labels[gender_code]] * len(sents))
        all_ages.extend([age_labels[age_code]] * len(sents))
    return all_sents, all_tags, np.array(all_genders), np.array(all_ages)


In [3]:
# Load all sentences, their tag sequences, and the per-sentence gender and age
# label arrays returned by load_trustpilots().
TP_sents, TP_tags, TP_gender, TP_age = load_trustpilots()

In [4]:
# One record per sentence; the NumPy label scalars are cast to plain ints
# before being stored in the record.
total_array = [
    {
        'text': words,
        'tag_label': tag_seq,
        'age_label': int(age),
        'gender_label': int(gender),
    }
    for words, tag_seq, age, gender in zip(TP_sents, TP_tags, TP_age, TP_gender)
]

In [5]:
from sklearn.model_selection import train_test_split

# First hold out 10% of all records as the test set, then hold out 10% of the
# remainder as the validation set (roughly 81% / 9% / 10% overall). The same
# fixed random_state is passed to both calls so the splits are reproducible.
train_array, test_array = train_test_split(total_array, test_size=0.1, random_state=25042020)
train_array, valid_array = train_test_split(train_array, test_size=0.1, random_state=25042020)

In [6]:
# Report the number of records in the train, validation and test splits.
print(*(len(split) for split in (train_array, valid_array, test_array)))

486 54 60


In [7]:
import jsonlines

def write2jsonl(jl_object, filePath):
    """Write an iterable of records to a JSON Lines file.

    The parent directory of ``filePath`` is created first if it does not
    exist, so writing into a fresh output folder does not fail.

    Args:
        jl_object: Iterable of objects handed to the jsonlines writer's
            ``write_all``.
        filePath: Destination path; the file is opened in ``'w'`` mode.
    """
    out_dir = os.path.dirname(filePath)
    # An empty dirname means "current directory"; os.makedirs("") would raise.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with jsonlines.open(filePath, mode='w') as writer:
        writer.write_all(jl_object)

In [8]:
# Write each split to its JSON Lines file: train, then test, then validation.
for records, target in (
    (train_array, train_filename),
    (test_array, test_filename),
    (valid_array, valid_filename),
):
    write2jsonl(records, target)