# Create Dataset 
Looks at the violent, and nonviolent files, and creates a single
dict with the violent and nonviolent subjects, and creates a list of posts for each user, with an id for each post 

In [1]:
# imports
import random
import re
import string
import json
import csv
import os
from tqdm import tqdm
from nltk.tokenize import TweetTokenizer

In [2]:
# initialize tokenizer
tk = TweetTokenizer()
# initialize tokens to remove
remove_tokens = ['"', "-", '“', '”']
# types of datasets
datasets = ["non_violent", "violent"]
# other 
subject = 0
subject_post = 0
# for 
meta_data = dict()

In [3]:
# initialize output directory
output_dir = "data_2"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

In [4]:
# preprocsessing for each line at the beginning
def clean_initial_line(line):
    line = line.lstrip()
    line = line.rstrip()
    line = line.strip()
    line = line.lower()
    return line

In [5]:
# creates a dictionary of a post id to a clean post for a particular subject
def prepare_post(line, subject_post):
    re.sub('^[0-9]*$', "NUM", line)
    words = tk.tokenize(line)
    for item in remove_tokens:
        if item in words : words.remove(item)
    return {subject_post: ' '.join(words)}

In [6]:
# function to load data from the text file

def load_data(fp):
    data = dict()
    for line in tqdm(f):
        line = clean_initial_line(line)
        # lines with a number indicate new subject
        if line.isnumeric():
            subject = line
            if subject not in data:
                data[subject] = list()
                subject_post=0
            else:
                subject_post=list(data[subject][-1].keys())[0]+1
        elif len(line.split(" ")) < 2:
            continue
        else:
            data[subject].append(prepare_post(line, subject_post))
            subject_post += 1
    return data

In [7]:
# creates a dictionary with violent subjects, and non violent subjects
# with a list of posts for each subject
all_data = dict()
for dataset in datasets:
    with open("txt_data/{}.txt".format(dataset), 'r') as f:
        all_data[dataset] = load_data(dataset)

843it [00:00, 5800.73it/s]
806it [00:00, 9831.30it/s]


In [None]:
all_data

In [8]:
# save file
with open(os.path.join(output_dir, 'cleaned_data'), 'w') as fp:
    json.dump(all_data, fp, indent=4)

In [9]:
# number of non_violent subjects
len(all_data["non_violent"])

23

In [10]:
# number of violent subjects
len(all_data["violent"])

23

In [11]:
# this function creates a list of all posts for violent/nonviolent datasets
def aggregate_data(data, label):
    if label =="violent": label = 1 
    else: label=0
    aggregated_data=[]
    subjects =  list(data.keys()) 
    random.shuffle(subjects)
    for subject in subjects:
        for post in data[subject]:
            post_items = list(post.items())[0]
            aggregated_data.append((subject, post_items[0], post_items[1], label))
    return aggregated_data

In [12]:
# create list of all violent/nonviolent posts
data = {}
for dataset in datasets:
    data[dataset] = aggregate_data(all_data[dataset], dataset)

In [None]:
data

In [13]:
# Split int train/val, test sets
# makes sure a subject is not in both train, and test datasets
# to get different result, re run the cell to create list of posts
split = .8
final_data = dict()
splits = {}

for dataset in datasets:
    val = list()
    train = list()
    # get subject id of the last subject in training
    split_idx = int(split*len(data[dataset]))
    split_ex = data[dataset][split_idx][0]
    
    is_train = True
    for example in data[dataset]:
        
        if example[0]==split_ex:
            is_train = False
            train.append(example)
            #print("{},{}".format(example[0], "train")
        elif is_train is True:
            train.append(example)
            #print("{},{}".format(example[0], "train")
        else:
            val.append(example)
    
    print(dataset)
    print(len(train))
    print(len(val))
    
    # collect data on splits
    splits[dataset] = {"train":len(train),
                       "val":len(val)}
    # save to larger dictionary
    final_data[dataset] = {"train":train, "val":val}

non_violent
592
111
violent
490
85


In [14]:
# combine violent and non violent for train and val datasets
dataset_train = final_data["violent"]["train"]+final_data["non_violent"]["train"]
dataset_val = final_data["violent"]["val"]+final_data["non_violent"]["val"]

In [15]:
len(dataset_train)

1082

In [16]:
len(dataset_val)

196

In [17]:
# get size of train test sets
splits["all"] = {"train":len(dataset_train),
                 "val":len(dataset_val)}

In [18]:
# save datasets
with open(os.path.join(output_dir, "train.csv"), 'w') as fp:
    csv_writer = csv.writer(fp)
    for row in dataset_train:
        csv_writer.writerow(row)
with open(os.path.join(output_dir, "val.csv"), 'w') as fp:
    csv_writer = csv.writer(fp)
    for row in dataset_val:
        csv_writer.writerow(row)

In [19]:
def collect_meta_data(data):
    key = data[0][0]
    num_posts = 1
    sub_word_count = 0
    total_words = 0
    num_subjects =1
    subject_data = dict()
    for ex in data:
        num_tokens = len(re.findall(r'\w+', ex[2]))
        if ex[0]!=key:
            subject_data[key] = {"num_posts":num_posts, "length/post":sub_word_count/num_posts}
            key = ex[0]
            sub_word_count=num_tokens
            num_posts = 1
            num_subjects +=1
        else:
            num_posts+=1
            sub_word_count+=num_tokens
        total_words += num_tokens

    meta_data = {"subject_data" : subject_data}
    meta_data["num_posts"] = len(data)
    meta_data["num_subjects"] = num_subjects
    meta_data["average_posts_per_sub"] = len(data)/num_subjects
    meta_data["words_per_subject"] = (total_words/len(data))
    return meta_data

In [20]:
meta_data["violent"] = collect_meta_data(data["violent"])
meta_data["non_violent"] = collect_meta_data(data["non_violent"])
meta_data["splits"] = splits
print(json.dumps(meta_data, indent=4))

{
    "violent": {
        "subject_data": {
            "2258": {
                "num_posts": 12,
                "length/post": 34.166666666666664
            },
            "1254": {
                "num_posts": 8,
                "length/post": 10.5
            },
            "2425": {
                "num_posts": 38,
                "length/post": 55.921052631578945
            },
            "9333": {
                "num_posts": 10,
                "length/post": 35.6
            },
            "9989": {
                "num_posts": 1,
                "length/post": 25.0
            },
            "7981": {
                "num_posts": 50,
                "length/post": 40.28
            },
            "5372": {
                "num_posts": 29,
                "length/post": 39.793103448275865
            },
            "3942": {
                "num_posts": 99,
                "length/post": 48.282828282828284
            },
            "4234": {
                "num_posts": 2

In [21]:
with open(os.path.join(output_dir, "metadata.json"), "w") as f:
    json.dump(meta_data, f, indent=4)