# Imports and Datasets


In [1]:
import torch
import json
import requests
from sklearn.model_selection import train_test_split

In [3]:
import os

# Fetch the five raw datasets. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
request_NER_TRAIN = requests.get("http://AdityaAhuja01.pythonanywhere.com/data/NLP_Data/NER_TRAIN_JUDGEMENT.json", timeout=60)
request_NER_TEST = requests.get("http://AdityaAhuja01.pythonanywhere.com/data/NLP_Data/NER_TEST_JUDGEMENT.json", timeout=60)
request_LR_VAL = requests.get("http://AdityaAhuja01.pythonanywhere.com/data/NLP_Data/Laptop_Review_Val.json", timeout=60)
request_LR_TEST = requests.get("http://AdityaAhuja01.pythonanywhere.com/data/NLP_Data/Laptop_Review_Test.json", timeout=60)
request_LR_TRAIN = requests.get("http://AdityaAhuja01.pythonanywhere.com/data/NLP_Data/Laptop_Review_Train.json", timeout=60)

# Fail loudly on an HTTP error instead of saving the error page as a dataset.
for response in (
    request_NER_TRAIN,
    request_NER_TEST,
    request_LR_VAL,
    request_LR_TEST,
    request_LR_TRAIN,
):
    response.raise_for_status()

# exist_ok=True makes directory creation safe to repeat.
os.makedirs("./data", exist_ok=True)

# Mode "w" (not "x") so that re-running this cell refreshes the files instead
# of raising FileExistsError on the second run.
with open("./data/NER_train.json", "w") as file:
    file.write(request_NER_TRAIN.text)

with open("./data/NER_test.json", "w") as file:
    file.write(request_NER_TEST.text)

with open("./data/LR_Val.json", "w") as file:
    file.write(request_LR_VAL.text)

with open("./data/LR_test.json", "w") as file:
    file.write(request_LR_TEST.text)

with open("./data/LR_Train.json", "w") as file:
    file.write(request_LR_TRAIN.text)

In [None]:
# Load the raw datasets saved under ./data. Each file is opened in a `with`
# block so its handle is closed as soon as the JSON has been parsed.
with open("./data/NER_train.json") as NER_train_file:
    NER_train_json = json.load(NER_train_file)

with open("./data/NER_test.json") as NER_test_file:
    NER_test_json = json.load(NER_test_file)

with open("./data/LR_Train.json") as LR_train_file:
    LR_train_json = json.load(LR_train_file)

# The validation and test splits live in their own files; reading
# LR_Train.json here would make all three laptop-review splits identical.
with open("./data/LR_Val.json") as LR_val_file:
    LR_val_json = json.load(LR_val_file)

with open("./data/LR_test.json") as LR_test_file:
    LR_test_json = json.load(LR_test_file)


In [None]:
class LR_Preprocessor:
    """Convert aspect-annotated review entries into per-token B/I/O labels.

    Each dataset is an iterable of dicts. The code reads the keys
    ``"raw_words"`` (stored as the output text), ``"words"`` (only its length
    is used, to size the label list) and ``"aspects"`` (dicts whose ``"from"``
    and ``"to"`` values are used as a half-open index range into the labels).
    """

    def __init__(self, trainset, testset, valset):
        """Store the three raw splits; nothing is processed until initialize()."""
        self.trainset = trainset
        self.testset = testset
        self.valset = valset

    def init_tags(self, dataset):
        """Return one ``{"id", "text", "labels"}`` dict per entry of *dataset*.

        ``id`` is the entry's position in *dataset*. ``labels`` starts as all
        ``"O"``; for every aspect the index ``from`` becomes ``"B"`` and the
        remaining indices up to (not including) ``to`` become ``"I"``.
        """
        labeled_outputs = []
        # enumerate() supplies the running id without shadowing the builtin `id`.
        for entry_id, entry in enumerate(dataset):
            text = entry["raw_words"]
            labels = ["O"] * len(entry["words"])
            for aspect in entry["aspects"]:
                first = aspect["from"]
                for index in range(first, aspect["to"]):
                    labels[index] = "B" if index == first else "I"
            labeled_outputs.append(
                {"id": entry_id, "text": text, "labels": labels}
            )
        return labeled_outputs

    def initialize(self):
        """Tag the train, validation and test splits and store the results."""
        self.labeled_trainset = self.init_tags(self.trainset)
        self.labeled_valset = self.init_tags(self.valset)
        self.labeled_testset = self.init_tags(self.testset)

    # Original (misspelled) name, kept so existing callers keep working.
    initizalize = initialize

    def get_tagged_data(self):
        """Return ``(train, val, test)`` tagged splits; call initialize() first."""
        return self.labeled_trainset, self.labeled_valset, self.labeled_testset

    # Same accessor name as NER_Preprocessor, so both classes can be used alike.
    get_tagged = get_tagged_data
        
            

In [None]:
class NER_Preprocessor:
    """Convert span-annotated NER entries into per-word ``B_``/``I_``/``O`` tags.

    Each entry is a dict from which the code reads ``entry["id"]``,
    ``entry["data"]["text"]`` and ``entry["annotations"][0]["result"]``; only
    the first annotation object is used. Every result carries a ``"value"``
    dict with ``"start"``, ``"end"``, ``"text"`` and ``"labels"``.
    """

    def __init__(self, trainset, testset, valset=None):
        """Store the splits; when *valset* is None, carve one out of *trainset*."""
        self.trainset = trainset
        self.testset = testset
        self.valset = valset
        if valset is None:
            self.split_val()

    def split_val(self, split_train_false=False):
        """Re-split the training data into train (85%) and validation (15%).

        When *split_train_false* is true the current validation set is first
        merged back into the training set, so the split is redone over all of
        the data. The split uses a fixed random_state of 42.
        """
        if split_train_false:
            self.trainset = self.trainset + self.valset
        self.trainset, self.valset = train_test_split(
            self.trainset, test_size=0.15, random_state=42)

    def init_tags(self, dataset):
        """Return one ``{"id", "text", "labels"}`` dict per entry of *dataset*.

        The text is scanned character by character. Outside annotated spans
        every space emits one ``"O"`` (the tag of the word that just ended).
        An annotated span emits ``B_<label>`` followed by one ``I_<label>``
        per further space-separated word of the span's own text.
        """
        labeled_output = []
        for entry in dataset:
            text = entry["data"]["text"]
            text_len = len(text)
            labels = []
            char_itr = 0
            for result_obj in entry["annotations"][0]["result"]:
                value = result_obj["value"]
                start = value["start"]
                # Skip spans starting before the scan position, i.e. spans
                # that overlap an already-tagged one or are out of order.
                if char_itr > start:
                    continue
                label = value["labels"][0]
                num_words = len(value["text"].split(" "))

                # Tag the untagged words between the scan position and the span.
                while char_itr < start:
                    if text[char_itr] == " ":
                        labels.append("O")
                    char_itr += 1

                labels.append("B_" + label)
                labels.extend(["I_" + label] * (num_words - 1))

                # Resume after the span: run to the end of the word the span
                # finishes in, then step over the space that follows it.
                char_itr = value["end"]
                while char_itr < text_len and text[char_itr] != " ":
                    char_itr += 1
                char_itr += 1

            # Tag whatever follows the last span.
            while char_itr < text_len:
                if text[char_itr] == " ":
                    labels.append("O")
                char_itr += 1

            # The scan stopped exactly at the end of the text, so the last
            # word has no trailing space to trigger its tag. When a span
            # consumed the last word, char_itr is text_len + 1 instead.
            if char_itr == text_len:
                labels.append("O")

            labeled_output.append(
                {"id": entry["id"], "text": text, "labels": labels}
            )

        return labeled_output

    def initialize(self):
        """Tag the train, validation and test splits and store the results."""
        self.labeled_trainset = self.init_tags(self.trainset)
        self.labeled_valset = self.init_tags(self.valset)
        self.labeled_testset = self.init_tags(self.testset)

    def get_tagged(self):
        """Return ``(train, val, test)`` tagged splits; call initialize() first."""
        return (self.labeled_trainset, self.labeled_valset, self.labeled_testset)

In [None]:
# Tag the judgement corpus. No validation set is passed in, so the
# preprocessor splits one off the training data in its constructor.
NER_preprocessor = NER_Preprocessor(
    trainset=NER_train_json,
    testset=NER_test_json,
)
NER_preprocessor.initialize()
(NER_train, NER_val, NER_test) = NER_preprocessor.get_tagged()

In [None]:
# Persist the tagged NER splits. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
os.makedirs("./processed", exist_ok=True)

with open("./processed/NER_train_tagged.json", "w") as file:
    json.dump(NER_train, file)

with open("./processed/NER_val_tagged.json", "w") as file:
    json.dump(NER_val, file)

with open("./processed/NER_test_tagged.json", "w") as file:
    json.dump(NER_test, file)

In [None]:
# Bind the instance to its own lowercase name. Assigning it to
# `LR_Preprocessor` would overwrite the class, so running this cell a second
# time would fail with "object is not callable".
LR_preprocessor = LR_Preprocessor(LR_train_json, LR_test_json, LR_val_json)
LR_preprocessor.initizalize()
LR_train, LR_val, LR_test = LR_preprocessor.get_tagged_data()

In [None]:
# Persist the tagged laptop-review splits. The output directory is ensured
# here as well, so this cell does not depend on another cell having created it.
os.makedirs("./processed", exist_ok=True)

with open("./processed/LR_train_tagged.json", "w") as file:
    json.dump(LR_train, file)

with open("./processed/LR_val_tagged.json", "w") as file:
    json.dump(LR_val, file)

with open("./processed/LR_test_tagged.json", "w") as file:
    json.dump(LR_test, file)