# Data Statistics Summary

In [2]:
import json
import os
import csv
import math
from multiprocessing import Pool, Manager
from tqdm import tqdm
from collections import defaultdict
import numpy as np

In [3]:
class Data_Stats_Processor:
    """Gather per-writer statistics from the generated trial CSV files.

    On construction the known / unknown writer id sets and the
    writer-id <-> integer-index mappings are loaded from disk (see
    ``load_writer_info``).  ``single_worker_get_writer_stats`` then scans a
    list of CSV files and aggregates sample counts, document frequencies,
    text lengths and unique (writer, text) pairs.
    """

    # Input / output locations, relative to the current working directory.
    KNOWN_WRITER_ID_FILE = "./output/writer_100_review_40/known_writer_id_list.json"
    UNKNOWN_WRITER_ID_FILE = "./output/writer_100_review_40/unknown_writer_id_list.json"
    ID_TO_NUM_MAPPING_FILE = "./output/writer_100_review_40/writer_id_to_num_mapping_dict.json"
    NUM_TO_ID_MAPPING_FILE = "./output/writer_100_review_40/writer_num_to_id_mapping_dict.json"
    SHIPPING_REVIEWER_STATS_FILE = "./shipping_review_input/shipping_reviewer_stats.txt"

    def __init__(self) -> None:
        (
            self.known_writer_id_set,
            self.unknown_writer_id_set,
            self.writer_id_to_index_mapping_dict,
            index_to_id_mapping_dict,
        ) = self.load_writer_info()
        # The triple-"p" spelling is the historical attribute name; it is kept
        # so existing code keeps working.  The correctly spelled alias refers
        # to the very same dict object.
        self.writer_index_to_id_mappping_dict = index_to_id_mapping_dict
        self.writer_index_to_id_mapping_dict = index_to_id_mapping_dict

    def output_writer_num_to_id_mapping_dict(self):
        """Invert the stored id -> index JSON mapping and write it back out.

        Reads ``ID_TO_NUM_MAPPING_FILE`` and writes the index -> id mapping to
        ``NUM_TO_ID_MAPPING_FILE``.  Indices are converted with ``int`` before
        dumping; JSON object keys are strings, so they are stored as strings
        in the output file.
        """
        with open(self.ID_TO_NUM_MAPPING_FILE, mode="r") as fin:
            id_to_num_dict = json.load(fin)

        num_to_id_dict = {
            int(index): writer_id for writer_id, index in id_to_num_dict.items()
        }

        # ######## output #########
        with open(self.NUM_TO_ID_MAPPING_FILE, mode="w") as fout:
            json.dump(num_to_id_dict, fout)

    def load_writer_info(self):
        """Load the writer id sets and the id <-> index mappings from disk.

        Returns:
            A 4-tuple ``(known_writer_id_set, unknown_writer_id_set,
            writer_id_to_index_mapping_dict, writer_index_to_id_mapping_dict)``.
            The unknown set and both mappings are extended with the writer ids
            listed in the shipping reviewer stats file.
        """
        # (1) known writers
        with open(self.KNOWN_WRITER_ID_FILE, mode="r") as fin:
            known_writer_id_set = set(json.load(fin))

        # (2) unknown writers
        with open(self.UNKNOWN_WRITER_ID_FILE, mode="r") as fin:
            unknown_writer_id_set = set(json.load(fin))

        # !! Include shipping writers here.
        # Original author's note: all the unknown writers are in the shipping
        # review dataset.
        # TODO: when the final dataset was generated a set (not a list) was
        #       used, so the original order is lost.
        # TODO: per the original note, writer indices above 310 (the mapping
        #       starts from 1), i.e. from 311 on, are unknown writers.
        shipping_unknown_writer_id_list = []
        with open(self.SHIPPING_REVIEWER_STATS_FILE, mode="r") as fin:
            for line in fin:
                parts = line.split()
                if not parts:
                    # Blank / whitespace-only line (e.g. a trailing newline at
                    # the end of the file): nothing to read, and parts[0]
                    # would raise IndexError.
                    continue
                # The first whitespace-separated token is the writer id.
                shipping_unknown_writer_id_list.append(parts[0])
        unknown_writer_id_set.update(shipping_unknown_writer_id_list)

        # (3) id -> index mapping, extended with the shipping writers
        with open(self.ID_TO_NUM_MAPPING_FILE, mode="r") as fin:
            writer_id_to_index_mapping_dict = json.load(fin)
        for shipping_writer_id in shipping_unknown_writer_id_list:
            # Only ids that are not mapped yet get a fresh index.  Re-assigning
            # an id that is already present would overwrite its index without
            # growing the dict, so the next new id would receive the very same
            # ``len + 1`` value and the inverse mapping would lose an entry.
            # NOTE(review): ``len + 1`` presumes the existing indices are the
            # contiguous range 1..len -- confirm against the mapping file.
            if shipping_writer_id not in writer_id_to_index_mapping_dict:
                writer_id_to_index_mapping_dict[shipping_writer_id] = (
                    len(writer_id_to_index_mapping_dict) + 1
                )

        # (4) index -> id mapping (inverse of (3))
        writer_index_to_id_mapping_dict = {
            index: writer_id
            for writer_id, index in writer_id_to_index_mapping_dict.items()
        }

        return known_writer_id_set, unknown_writer_id_set, writer_id_to_index_mapping_dict, writer_index_to_id_mapping_dict

    def single_worker_get_writer_stats(self, file_path_list):
        """Scan the given CSV files sequentially and aggregate writer statistics.

        Each file is read as UTF-16, comma-delimited CSV with ``|`` as the
        quote character.  Of each row only the ``text`` and ``real_writer_id``
        columns are used.

        Args:
            file_path_list: iterable of CSV file paths to process.

        Returns:
            A 7-tuple:
              * text_length_list -- whitespace-token count of every row's text
              * known_writer_sample_num_dict -- rows per known writer
              * unknown_writer_sample_num_dict -- rows per unknown writer
              * known_writer_df_dict -- number of distinct files per known writer
              * unknown_writer_df_dict -- number of distinct files per unknown writer
              * known_writer_text_pair_set -- unique (writer, text) pairs, known
              * unknown_writer_text_pair_set -- unique (writer, text) pairs, unknown

        Raises:
            KeyError: if a row's ``real_writer_id`` has no entry in the
                index -> id mapping.
            AssertionError: if the resolved writer is in neither id set.
        """
        known_writer_sample_num_dict = defaultdict(int)
        unknown_writer_sample_num_dict = defaultdict(int)

        # writer -> set of files in which the writer appears
        known_writer_doc_list_dict = defaultdict(set)
        unknown_writer_doc_list_dict = defaultdict(set)

        # text length stats
        text_length_list = []

        # (writer, text) pairs stand in for the unique combinations of writer,
        # product category, sentiment and review type.
        known_writer_text_pair_set = set()
        unknown_writer_text_pair_set = set()

        for file_path in tqdm(file_path_list, desc="all files"):
            # newline="" is what the csv module asks for, so that line breaks
            # embedded in quoted fields are handled by the csv reader itself.
            with open(file_path, mode="r", encoding="utf16", newline="") as fin:
                csv_reader = csv.DictReader(fin, delimiter=",", quotechar="|")
                # columns: instanceid,text,reported_writer_id,real_writer_id,sentiment,product,novelty_indicator,novel_instance,text_id
                for row in csv_reader:
                    text = row["text"]

                    # text length stats (split() already ignores leading and
                    # trailing whitespace)
                    text_length_list.append(len(text.split()))

                    real_writer_id = int(row["real_writer_id"])
                    # Read through the historical attribute name so instances
                    # populated by older code keep working.
                    real_writer_str = self.writer_index_to_id_mappping_dict[real_writer_id]

                    is_known_writer = real_writer_str in self.known_writer_id_set
                    is_unknown_writer = real_writer_str in self.unknown_writer_id_set
                    assert is_known_writer or is_unknown_writer, (
                        f"writer {real_writer_str!r} (index {real_writer_id}) in "
                        f"{file_path} is neither a known nor an unknown writer"
                    )

                    # The two branches are independent: a writer contained in
                    # both sets is counted in both groups.
                    if is_known_writer:
                        known_writer_sample_num_dict[real_writer_str] += 1
                        known_writer_doc_list_dict[real_writer_str].add(file_path)
                        known_writer_text_pair_set.add((real_writer_str, text))

                    if is_unknown_writer:
                        unknown_writer_sample_num_dict[real_writer_str] += 1
                        unknown_writer_doc_list_dict[real_writer_str].add(file_path)
                        unknown_writer_text_pair_set.add((real_writer_str, text))

        # document frequency: number of distinct files per writer
        known_writer_df_dict = {
            writer: len(docs) for writer, docs in known_writer_doc_list_dict.items()
        }
        unknown_writer_df_dict = {
            writer: len(docs) for writer, docs in unknown_writer_doc_list_dict.items()
        }

        return text_length_list, known_writer_sample_num_dict, unknown_writer_sample_num_dict, known_writer_df_dict, unknown_writer_df_dict, known_writer_text_pair_set, unknown_writer_text_pair_set

In [4]:
# Recursively gather the path of every ``.csv`` file below the trial folder.
folder = "./output/writer_100_review_40/NLT_complete_trials_Nov_9_2021/OND/NLT"
all_file_path_list = []
for dir_path, _sub_dirs, names_in_dir in os.walk(folder):
    all_file_path_list.extend(
        os.path.join(dir_path, name)
        for name in names_in_dir
        if name.endswith(".csv")
    )
print(f"There are {len(all_file_path_list)} csv files.")

There are 864 csv files.


In [5]:
# Constructing the processor loads the writer id sets and index mappings.
processor = Data_Stats_Processor()

# ###### single ######
# Scan every collected CSV file in one process and unpack the seven results.
(
    text_length_list,
    known_writer_sample_num_dict,
    unknown_writer_sample_num_dict,
    known_writer_df_dict,
    unknown_writer_df_dict,
    known_writer_text_pair_set,
    unknown_writer_text_pair_set,
) = processor.single_worker_get_writer_stats(all_file_path_list)

all files: 100%|██████████| 864/864 [00:33<00:00, 25.95it/s]


In [6]:
# Text statistics: mean / min / max of the per-review whitespace-token counts.
text_length_arr = np.array(text_length_list)
print(f"text len mean: {np.mean(text_length_arr)}")
print(f"text len min: {text_length_arr.min()}")
print(f"text len max: {text_length_arr.max()}")

text len mean: 218.1146556712963
text len min: 1
text len max: 4683


In [7]:
# How many unique combinations of writer, product category, sentiment and
# review type were actually used in training, non-novel test and novel test?
# Counting distinct (writer_id, text) pairs is sufficient for that.
known_pair_count = len(known_writer_text_pair_set)
unknown_pair_count = len(unknown_writer_text_pair_set)
print(f"Test data unique entries known writer: {known_pair_count}")
print(f"Test data unique entries unknown writer: {unknown_pair_count}")

Test data unique entries known writer: 3995
Test data unique entries unknown writer: 14576


In [6]:
# Normalize: divide each writer's sample count by the writer's document
# frequency (the number of distinct CSV files the writer appears in).
normalized_known_writer_sample_num_dict = {}
for writer_id, sample_num in known_writer_sample_num_dict.items():
    normalized_known_writer_sample_num_dict[writer_id] = sample_num * 1.0 / known_writer_df_dict[writer_id]

normalized_unknown_writer_sample_num_dict = {}
for writer_id, sample_num in unknown_writer_sample_num_dict.items():
    normalized_unknown_writer_sample_num_dict[writer_id] = sample_num * 1.0 / unknown_writer_df_dict[writer_id]

In [36]:
# Raw per-writer sample counts as arrays.
test_data_known_writer_sample_num_arr = np.array([*known_writer_sample_num_dict.values()])
test_data_unknown_writer_sample_num_arr = np.array([*unknown_writer_sample_num_dict.values()])

# Per-writer sample counts normalized by document frequency, as arrays.
normalized_test_data_known_writer_sample_num_arr = np.array([*normalized_known_writer_sample_num_dict.values()])
normalized_test_data_unknown_writer_sample_num_arr = np.array([*normalized_unknown_writer_sample_num_dict.values()])

print(f"Total unknown writer: {len(processor.unknown_writer_id_set)}")

# Print mean / min / max for the known and the unknown writers, first for the
# raw counts and then for the normalized ones.
for section_banner, known_arr, unknown_arr in (
    (
        "################## NON-normalized ###########",
        test_data_known_writer_sample_num_arr,
        test_data_unknown_writer_sample_num_arr,
    ),
    (
        "################## normalized ################",
        normalized_test_data_known_writer_sample_num_arr,
        normalized_test_data_unknown_writer_sample_num_arr,
    ),
):
    print(section_banner)
    for group_title, sample_num_arr in (
        ("Test dataset KNOWN writer:", known_arr),
        ("Test dataset UNKNOWN writer:", unknown_arr),
    ):
        print("---------------------")
        print(group_title)
        print(f"mean: {np.mean(sample_num_arr)}")
        print(f"min: {np.amin(sample_num_arr)}")
        print(f"max: {np.amax(sample_num_arr)}")


Total unknown writer: 934
################## NON-normalized ###########
---------------------
Test dataset KNOWN writer:
mean: 5972.16
min: 998
max: 27764
---------------------
Test dataset UNKNOWN writer:
mean: 470.64668094218416
min: 83
max: 3630
################## normalized ################
---------------------
Test dataset KNOWN writer:
mean: 7.01575254696484
min: 1.476858345021038
max: 32.13425925925926
---------------------
Test dataset UNKNOWN writer:
mean: 1.6390561618072932
min: 1.0
max: 7.546777546777546
