In [3]:
import json
import random
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy
from pycipher import ColTrans as ctc
from pycipher import SimpleSubstitution as ssc
from pycipher import Caesar as cc

In [42]:
# Helper function to convert a string to its uppercase, space-free alphabetic equivalent.
def as_alpha_upper(text):
    """Return only the ASCII letters of ``text``, uppercased.

    Every character that is not an ASCII letter (whitespace, digits,
    punctuation and all non-ASCII characters) is dropped.

    The ASCII restriction is applied *before* uppercasing. Uppercasing first
    lets some non-ASCII letters expand into ASCII ones (for example
    ``'ß'.upper()`` is ``'SS'``), which would then slip past a non-ASCII
    filter applied afterwards.

    Args:
        text: The string to normalise.

    Returns:
        A string made up solely of the characters ``A``-``Z`` (possibly empty).
    """
    # ord(ch) < 128 restricts str.isalpha() to the ASCII range, i.e. a-z / A-Z.
    ascii_letters = (ch for ch in text if ch.isalpha() and ord(ch) < 128)
    return "".join(ascii_letters).upper()

In [45]:
# Hyperparameters
# Number of plaintexts / keys sampled for each dataset (also used in the exported file name).
NUM_PTS = 1000
# NOTE(review): presumably the number of enciphered datasets produced below — confirm;
# it is not referenced in this part of the notebook.
NUM_METHODS = 6

In [46]:
# Creating the dataset of encryptable values and their keys from a json file.
questions = []
answers = []

# We use SQuAD, a dataset of questions and answers: https://rajpurkar.github.io/SQuAD-explorer/
# The encoding is given explicitly so that reading the JSON text does not depend on
# the platform's default locale encoding.
with open('SQuAD_1.1_dataset.json', 'r', encoding='utf-8') as json_file:
    json_dataset = json.load(json_file)

# Structure walked below: 'data' -> 'paragraphs' -> 'qas' -> 'answers'.
for entry in json_dataset['data']:
    for paragraph in entry['paragraphs']:
        for qa in paragraph['qas']:
            # Skip items whose answer list is empty; otherwise keep the question and
            # the text of its first answer, appended in lockstep to the two lists.
            if qa['answers']:
                questions.append(qa['question'])
                answers.append(qa['answers'][0]['text'])
                
# Seed both random number generators used below so that re-running this cell
# (or the whole notebook) reproduces the same sampled texts and keys.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# We use the questions as the plaintext to be encrypted.
# np.random.choice samples with replacement by default, so a question may be drawn more than once.
texts = np.random.choice(questions, size=NUM_PTS)

# We use the answers as the keys for Columnar Transposition.
# These are drawn independently of `texts`; a key is not the answer to its plaintext question.
ct_keys = np.random.choice(answers, size=NUM_PTS)

# We use randomly shuffled alphabets as the keys for Simple Substitution.
alphabet = "abcdefghijklmnopqrstuvwxyz"
ss_keys = [''.join(random.sample(alphabet, len(alphabet))) for _ in range(NUM_PTS)]

# We can use the numbers 0~25 (one shift per letter of the alphabet) as the keys for Caesar.
c_keys = np.arange(len(alphabet))

In [47]:
# Encrypting.

# Datasets for constant plaintext, varied key.
set_text = texts[0]
vary_key_ctc = [ctc(key).encipher(set_text) for key in ct_keys]
vary_key_ssc = [ssc(key).encipher(set_text) for key in ss_keys]

# There are only len(c_keys) Caesar keys, so this dataset holds at most that many
# ciphertexts. It is cut to NUM_PTS entries when NUM_PTS is smaller (indexing a
# NUM_PTS-long list with every key would raise IndexError), and 0-filled up to
# NUM_PTS otherwise so that all six datasets have equal length for stacking.
vary_key_cc = [cc(key).encipher(set_text) for key in c_keys[:NUM_PTS]]
vary_key_cc += [0] * (NUM_PTS - len(vary_key_cc))

# Datasets for varied plaintext, constant key.
ct = ctc(ct_keys[0])
ss = ssc(ss_keys[0])
c = cc(c_keys[13])  # the Caesar key at index 13 is the fixed key for this dataset
vary_text_ctc = [ct.encipher(text) for text in texts]
vary_text_ssc = [ss.encipher(text) for text in texts]
vary_text_cc = [c.encipher(text) for text in texts]

# Create a 2D array containing all encrypted datasets, stacked so that each
# enciphering method is its own row/line in the csv.
encrypted_datasets = np.vstack((
    vary_key_ctc,
    vary_text_ctc,
    vary_key_ssc,
    vary_text_ssc,
    vary_key_cc,
    vary_text_cc,
))

In [49]:
# Write the stacked ciphertext array to a .csv file for external reuse.
# The file name carries NUM_PTS; every value is written as a plain string.
np.savetxt(
    fname="encrypted_datasets_" + str(NUM_PTS) + ".csv",
    X=encrypted_datasets,
    fmt='%s',
    delimiter=",",
)