# Data Generation

Consider the following questions:

- How robust is classification with respect to key rotation?
    - Compare a model trained on ciphers with a fixed key to a model trained on ciphers with a rotating key
    - Compare the model on test data encrypted with the same or a different key
  
- How robust is classification with respect to types of ciphers?
    - Leave one of the ciphers out of the train set
   
- Is there a difference between manually sent messages and sensor data?
    - Sensor data has higher entropy and is more "similar" to ciphertexts
    - Treat manual and sensor separately first
    - Later test multi-class problem
    
- How robust is classification with respect to short messages?
    - Investigate whether message length makes a difference

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

import sys
sys.path.append('../')
from src.data_utils import *
from src.cipher_utils import *

# Init

In [3]:
# Control character 0x06 (ASCII ACK), kept as the notebook-level delimiter constant.
delimiter = chr(0x06)

# Alphabet from the project helper; it is passed to every msg_to_cipher call below.
alphabet = get_ascii_alphabet()

# Cipher machines from the project helper; passed to msg_to_cipher and test_decrypt below.
cipher_map = initialize_cipher_machines()

# Seed the RNG after the cipher machines have been created.
# NOTE(review): `seed` (like `shuffle` further down) arrives through one of the star
# imports above — confirm which RNG it seeds and that it covers key generation.
seed(42)

In [4]:
# Load the full message dataset from a pickle file and print a column summary.
# NOTE(review): the output saved with this cell is a FileNotFoundError for this path,
# so the file has to be present under ../data/ before the notebook can be re-run.
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
dpath = '../data/full_acars.pkl'
df = pd.read_pickle(dpath)
df.info()

FileNotFoundError: [Errno 2] No such file or directory: '../data/full_acars.pkl'

In [9]:
# Keep only the rows whose 'db_channel' value is 'sat'; all later sections work on df_sat.
# get_group raises a KeyError if no row has that channel.
df_sat = df.groupby('db_channel').get_group('sat')
print('Total number of messages sent via satellite: %d'%len(df_sat))

Total number of messages sent via satellite: 761320


# Sensor data
## Load & split data

In [17]:
# Plaintext sensor messages: the 'Txt' value of every satellite row labelled 'H1'.
sensor_messages = df_sat.loc[df_sat['Label'] == 'H1', 'Txt'].tolist()
print('Loaded %d messages with label H1 (plain_sensor)' % len(sensor_messages))

# Shuffle in place, then cut the list into the first 75 % (train) and the rest (test).
shuffle(sensor_messages)
num_train = int(0.75 * len(sensor_messages))
sensor_train, sensor_test = sensor_messages[:num_train], sensor_messages[num_train:]

Loaded 178914 messages with label H1 (plain_sensor)


## Training data with fixed key

##### Get ciphertexts from plaintexts

In [18]:
# Encrypt the sensor training messages; the recorded output reports one pass per cipher.
# NOTE(review): key_rot equal to the number of messages presumably means the key is never
# rotated within this set (one fixed key per cipher) — confirm against msg_to_cipher.
ciphers_sensor_train = msg_to_cipher(sensor_train, cipher_map, alphabet, key_rot=len(sensor_train))

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


##### Sanity check that messages can be decrypted correctly

In [19]:
# Spot-check decryption for the 'vigenere' cipher only; the other ciphers are not checked here.
# NOTE(review): the result is only displayed, not asserted, and 100 is presumably the number
# of messages checked — confirm against test_decrypt.
test_decrypt(ciphers_sensor_train, cipher_map, 'vigenere', sensor_train, 100)

True


##### Generate dataframe with training data 

In [20]:
# Every plaintext training message carries the label 'plain'; generate_dataframe then
# combines these plaintexts with the ciphertexts produced above.
plain_sensor_train = {'txt': sensor_train, 'label': ['plain'] * len(sensor_train)}
sensor_train_fixed = generate_dataframe(plain_sensor_train, ciphers_sensor_train)

In [24]:
sensor_train_fixed.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
498936,"R&7Ls""-pH5c[Oz_2vE&n)(SU'T5S$(c:=...",cipher,vigenere,&bj\tEaghT pq2MAKYLe/Q.\r|
23797,- #MD/AA YQXE2YA.AT1.N66057213623E2408B80,plain,plain,
287440,I2.G78Q AA60A2YC-. DYB0H6D1E...,cipher,columnar,"dOT*i\ro%,l>rR=$""!�[72mPk1pQ@L;VqB\..."
202696,=03]T?QF0^TZSQiQ>QTc>`X=S[S@H@UACB@FSB@@HAD@QV...,cipher,caesar,16
70377,"- #MD/A0 YQXD2YA.AFN/FMHRCH433,.40061B,,075638...",plain,plain,


In [22]:
sensor_train_fixed.groupby('Label').size()

Label
cipher    670925
plain     134185
dtype: int64

In [None]:
sensor_train_fixed.to_pickle('../data/train_sensor_fixed_key.pkl')

## Test data with same fixed key
##### Apply same key as train

In [25]:
# Collect, per value of the 'Cipher' column, the distinct keys used in the training frame.
# The 'plain' group is included as well (it shows up in the printed names).
train_key_map = {}
for cipher_name, cipher_rows in sensor_train_fixed.groupby('Cipher'):
    print(cipher_name)
    distinct_keys = cipher_rows['Key'].unique()
    train_key_map[cipher_name] = list(distinct_keys)

caesar
columnar
fakeaes
plain
substitution
vigenere


In [26]:
ciphers_sensor_test = msg_to_cipher(sensor_test, cipher_map, alphabet, key_map=train_key_map)

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


In [27]:
# Label every plaintext test message 'plain' and combine with the same-key ciphertexts.
plain_sensor_test = {'txt': sensor_test, 'label': ['plain'] * len(sensor_test)}
sensor_test_fixed = generate_dataframe(plain_sensor_test, ciphers_sensor_test)

In [28]:
sensor_test_fixed.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
23274,"- #T20PWYWD360,RAPIX,292031,360M57.KENET,26704...",plain,plain,
35792,- #MD/AA SMACAYA.AT1.55142A6288D38501B16D,plain,plain,
38172,"- #MDFTX/ID21100A,RCH975,JAM6212X1344/MR12,91/...",plain,plain,
235146,1Q)7ROZef%`]zXpS^hDWLF.'l_ gC[uj~aHtKv:�...,cipher,fakeaes,103
177476,"R&7L80c""&g4BpO|d6f%2S7:yM!!d4D...",cipher,vigenere,&bj\tEaghT pq2MAKYLe/Q.\r|


In [None]:
sensor_test_fixed.to_pickle('../data/test_sensor_fixed_key.pkl')

## Test data with rotating keys

In [29]:
# Re-encrypt the sensor test messages, this time with key_rot=1000.
# NOTE(review): this rebinds ciphers_sensor_test, replacing the same-key ciphertexts created
# earlier, so every later cell that reads it depends on execution order. key_rot=1000
# presumably rotates the key every 1000 messages — confirm against msg_to_cipher.
ciphers_sensor_test = msg_to_cipher(sensor_test, cipher_map, alphabet, key_rot=1000)

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


In [30]:
sensor_test_different = generate_dataframe(plain_sensor_test, ciphers_sensor_test)

In [31]:
sensor_test_different.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
101989,"90100R78,0M,M8V97WMU60S20S3M1,00 7,,.U,T330830...",cipher,columnar,]O*Bp+YV9qM_~?v4[.0\rl32ArxRytE5Nj
1363,.LISOZTP 231453\nFCD\nAN CS-TOA/FI TP287\n- #M...,plain,plain,
231975,"bt N}'Mmz]Qg&=5Fsr#�j\a~%UwR;\nO""y$KH<0",cipher,fakeaes,47
145859,8'&9QdF+t_eO$PJL-3TUY+^L?\n�%q:\PIL:9F&{...,cipher,vigenere,loG| 3Vxz^h\%$*{/b\rZQu:p
158672,"N\+#-xx'FFnT<""8Jtd4%+N3*F]/$u\rT-[{V^u�G",cipher,vigenere,"""=a9gt?lY+c|I_fGOb`z%yo5Uix(;*T>N..."


In [None]:
sensor_test_different.to_pickle('../data/test_sensor_different_key.pkl')

## Training data with rotating key 

In [32]:
# Re-encrypt the sensor training messages with key_rot=1000.
# NOTE(review): this rebinds ciphers_sensor_train, replacing the fixed-key ciphertexts, so
# later cells that read it (including the mixed-data section) see the rotating-key version
# only if the notebook is run top to bottom.
ciphers_sensor_train = msg_to_cipher(sensor_train, cipher_map, alphabet, key_rot=1000)

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


In [33]:
# Build the rotating-key training frame.
# NOTE(review): .reset_index() keeps the old row labels as an extra 'index' column, which is
# then pickled with the data; the other frames in this notebook are saved without it —
# confirm downstream code does not treat 'index' as a feature.
sensor_train_rot = generate_dataframe(plain_sensor_train, ciphers_sensor_train).reset_index()

In [34]:
sensor_train_rot.sample(5)

Unnamed: 0,index,Txt,Label,Cipher,Key
562730,562730,h`Aandxpyx=nnASUdSnUUU!yyUyUUAUUU...,cipher,substitution,"}s<ZM(*mN'g7XcB4K85)]Y|f\nH$/`:�,q{h..."
390947,390947,N7F2733...,cipher,columnar,"gx;aV|`=9J$12\t8z-/hQn#e07pFPiIbd'""}k..."
367588,367588,"W3JM.,0/DNIDS.0L0E0M00P44O8650 04L13,51,...",cipher,columnar,"f2y&i+orT|X8S4,9JU/~`q@L""\n"
377447,377447,S6B46AAD0134-E6N1A...,cipher,columnar,"Nnew_y7igK|F6WB1�o-\njTGPZ}SIflYsU/""H..."
315970,315970,9-1M MYAAD4KIB98HAP...,cipher,columnar,"N""v6Y>8-,~Z!UWo;QjT(5^X24p0]E=qB IGu}xt..."


In [None]:
sensor_train_rot.to_pickle('../data/train_sensor_key_rot.pkl')

# Manual data
## Load & split data

In [35]:
# Plaintext manual messages: the 'Txt' value of every satellite row labelled 'RA' or 'C1'.
manual_messages = df_sat.loc[df_sat['Label'].isin(['RA', 'C1']), 'Txt'].tolist()
print('Loaded %d messages with label RA or C1 (plain_manual)' % len(manual_messages))

Loaded 93132 messages with label RA or C1 (plain_manual)


In [36]:
# Shuffle in place, then cut the list into the first 75 % (train) and the rest (test).
shuffle(manual_messages)
num_train = int(0.75 * len(manual_messages))
manual_train, manual_test = manual_messages[:num_train], manual_messages[num_train:]

## Training data with fixed keys

In [37]:
ciphers_manual_train = msg_to_cipher(manual_train, cipher_map, alphabet, key_rot=len(manual_train))

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


In [38]:
# Label every plaintext training message 'plain' and combine with the fixed-key ciphertexts.
plain_manual_train = {'txt': manual_train, 'label': ['plain'] * len(manual_train)}
manual_train_fixed_key = generate_dataframe(plain_manual_train, ciphers_manual_train)

In [45]:
manual_train_fixed_key.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
172286,LKF NIM HTSD\n 1 TUR 0 /P /B 0 BI\nHU4 L...,cipher,columnar,|mu\n'f\ti7jQ�
21864,.SINGMXS 292135\nAGM\nAN .F-OONE/GL AOE2\n- W...,plain,plain,
409493,"1GC#';SA[3E6q^f""? 0hV\\n>UYKD|O~m+y...",cipher,fakeaes,83
221024,=T0Sd6<rV9FseW$zvne(6.&1xq[]JhR%FQYC93|...,cipher,vigenere,"Yy]rwS'@1#:0""\VZgqAc4nj3)<vP2u$K..."
245736,*NC-Uf>LT6id:\rxh). ;F%SA3yu�d%rh.L;ES>!...,cipher,vigenere,"Yy]rwS'@1#:0""\VZgqAc4nj3)<vP2u$K..."


In [None]:
manual_train_fixed_key.to_pickle('../data/train_manual_fixed_key.pkl')

## Test data with same fixed key

In [40]:
# Collect, per value of the 'Cipher' column, the distinct keys used in the manual training
# frame. This rebinds train_key_map, which previously held the sensor keys.
train_key_map = {}
for cipher_name, cipher_rows in manual_train_fixed_key.groupby('Cipher'):
    print(cipher_name)
    distinct_keys = cipher_rows['Key'].unique()
    train_key_map[cipher_name] = list(distinct_keys)

caesar
columnar
fakeaes
plain
substitution
vigenere


In [41]:
ciphers_manual_test = msg_to_cipher(manual_test, cipher_map, alphabet, key_map=train_key_map)

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


In [42]:
# Label every plaintext test message 'plain' and combine with the same-key ciphertexts.
plain_manual_test = {'txt': manual_test, 'label': ['plain'] * len(manual_test)}
manual_test_fixed_key = generate_dataframe(plain_manual_test, ciphers_manual_test)

In [44]:
manual_test_fixed_key.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
122112,"W""}gMjx ->Ot4]9d)e@HC<Ur\tmhNa|_y:l/...",cipher,fakeaes,83
33197,HLFIPF88=u.J+8EFK8D�J+8EFK8DI<GFIK�GXik)...,cipher,caesar,118
24087,"HL;G:LCL8%(I<:<@M<;*.0+(,(�I<:<@M<;'$...",cipher,caesar,118
61594,H\n4CT8Z4SOE SQE\nFFJRTET4T-U 0SIUAUME1RE J...,cipher,columnar,|mu\n'f\ti7jQ�
84440,"*NN/k`38%6iyRKjeR\t >67{3<Rz�`E""&dIldX!...",cipher,vigenere,"Yy]rwS'@1#:0""\VZgqAc4nj3)<vP2u$K..."


In [None]:
manual_test_fixed_key.to_pickle('../data/test_manual_fixed_key.pkl')

## Test data with rotating keys

In [46]:
ciphers_manual_test = msg_to_cipher(manual_test, cipher_map, alphabet, key_rot=1000)

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


In [47]:
manual_test_different_key = generate_dataframe(plain_manual_test, ciphers_manual_test)

In [48]:
manual_test_different_key.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
36693,Vi|{ii�iHY]XaYZ2iou2ivHv_ZXiv2UHHvH\n\...,cipher,caesar,40
50268,M2EKD 0 L S 09P6/B0\nI 03/TH209S01\n5U\n(0 0 0...,cipher,columnar,I'&WNy${MpS`|R5/ rkZ[%QaD>#.i*<70=X?9\...
85416,^B\]/\tEl+.k~lU71'9&FMGFY<~Amz25a+W?q\q...,cipher,vigenere,mC\rX\tc4@za8MebZw?pSv.(':ulT5J~gWG{h0...
124630,"a"".^:59\nx-hNWd<!r6EF'1 XAl/S_0g...",cipher,fakeaes,49
21094,QUORYOAAF~1TNI01160716\nWEATHER REPORT\n METAR...,plain,plain,


In [None]:
manual_test_different_key.to_pickle('../data/test_manual_different_key.pkl')

## Training data with rotating keys 

In [49]:
ciphers_manual_train = msg_to_cipher(manual_train, cipher_map, alphabet, key_rot=1000)

--caesar--
Successfully encrypted all messages with cipher caesar
--columnar--
Successfully encrypted all messages with cipher columnar
--vigenere--
Successfully encrypted all messages with cipher vigenere
--substitution--
Successfully encrypted all messages with cipher substitution
--fakeaes--
Successfully encrypted all messages with cipher fakeaes


In [50]:
manual_train_key_rot = generate_dataframe(plain_manual_train, ciphers_manual_train)

In [51]:
manual_train_key_rot.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
205635,DI7SESDHE1\n87 A\n4IQV O K-/...,cipher,columnar,"8oV`%\rjA0_Jhlk|b""Lc^\gr[6Sf*(wFBp..."
85919,\nuuz3e\t}deekehij>yu\t|yTy\t>Ty\...,cipher,caesar,52
67242,470\n--SCHEDULED ARRIVAL--\n 2130/2140...,plain,plain,
3785,QUMLBSDCR~1LFOK WEATHER\n07-DEC-2016 10:59UTC\...,plain,plain,
237250,oo~|R@(q <?.K-|XCTUZn4h:3/\rkTVW3>3F...,cipher,vigenere,2-;X.{ZzUT$W\rhuoxLORqp*\1r[95\t]=


In [None]:
manual_train_key_rot.to_pickle('../data/train_manual_key_rot.pkl')

# Mix: Manual and sensor data
## Training data

In [52]:
# Mixed plaintexts: sensor messages first, then manual ones, with a matching label per message.
plain_train = {
    'txt': sensor_train + manual_train,
    'label': ['sensor'] * len(sensor_train) + ['manual'] * len(manual_train),
}

In [53]:
# Combine the mixed plaintexts with ciphertexts into one training frame.
# NOTE(review): only ciphers_sensor_train is passed here, so the ciphertext rows presumably
# stem from sensor messages alone although the plaintexts also contain manual messages;
# ciphers_manual_train is not used — confirm this is intended.
plain_train_key_rot = generate_dataframe(plain_train, ciphers_sensor_train)

In [54]:
plain_train_key_rot.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
323712,"!A8#5*D=?7DM5""58G""D<!6:6$,$($5%,%&6%%'&)76&...",cipher,caesar,115.0
135963,.SINGMXS 112323\nAGM\nAN .VT-JEX/GL KSC1\n- W...,manual,plain,
769313,Y^iOLWXmH;}y&+9Ak]`\rx2_B!rG�U3a',cipher,fakeaes,13.0
821875,V3=uNj\tUh;w@7abkL[ Pxn$/E\nyAHR\|Oq6p...,cipher,fakeaes,27.0
768391,",@STLc U~}y5F>%tDQsvl9\nb<7E38R]X#zN",cipher,fakeaes,119.0


In [55]:
plain_train_key_rot.groupby('Label').size()

Label
cipher    670000
manual     69849
sensor    134185
dtype: int64

In [None]:
plain_train_key_rot.to_pickle('../data/train_mixed.pkl')

## Test data

In [56]:
# Mixed plaintexts: sensor messages first, then manual ones, with a matching label per message.
plain_test = {
    'txt': sensor_test + manual_test,
    'label': ['sensor'] * len(sensor_test) + ['manual'] * len(manual_test),
}

In [57]:
# Combine the mixed plaintexts with ciphertexts into one test frame.
# NOTE(review): only ciphers_sensor_test is passed here, so the ciphertext rows presumably
# stem from sensor messages alone although the plaintexts also contain manual messages;
# ciphers_manual_test is not used — confirm this is intended.
plain_test_key_rot = generate_dataframe(plain_test, ciphers_sensor_test)

In [58]:
plain_test_key_rot.sample(5)

Unnamed: 0,Txt,Label,Cipher,Key
93295,gZ]~i{{Z\t}\n~{h}\rkhg}{lj{}}k{ro~{{�pn...,cipher,caesar,58.0
249600,[0Wq8k$H=]g_\v-iY(Z*2KsLt)NzAp\r19\n',cipher,fakeaes,118.0
287478,"VZo[\r,4]A%Y7fW;?!Fq>QSBdg@^L'�=$9Dx\ lP...",cipher,fakeaes,91.0
64394,01ISTBITK~1DIS01141319\nAVERAGE ETA\nA.D.ETA L...,manual,plain,
104294,";97! %! ;# <""# E %!$! $ #! ...",cipher,caesar,109.0


In [None]:
plain_test_key_rot.to_pickle('../data/test_mixed.pkl')

## Dummy data

Create dummy data that does not contain the original, sensitive message content, so that it can be provided for an example run.

In [83]:
# Draw 1000 random rows from the manual rotating-key training frame. sample() already
# returns a new frame; reset_index() renumbers the rows and keeps the original row labels
# in an 'index' column.
dummy = manual_train_key_rot.sample(1000).reset_index()

In [84]:
# Plaintext messages contained in the sample.
plain_messages = list(dummy.groupby('Label').get_group('plain')['Txt'])

# Pool of "words": every space-separated token of every plaintext message.
# A dedicated name is used for the separator so that the notebook-level `delimiter`
# (set to chr(0x06) in the Init cell) is not silently overwritten.
word_delimiter = ' '
words = [token for message in plain_messages for token in message.split(word_delimiter)]

In [85]:
from numpy.random import choice

def sample_new_msg(m, words):
    """Build a random stand-in for message ``m`` out of ``words``.

    The replacement consists of as many space-separated tokens as ``m`` has,
    each drawn at random (with replacement) from ``words``, so that it is of
    comparable size to the message it replaces.  Drawing one word per
    *character* of ``m`` (``size=len(m)``) would make the result many times
    longer than the original.

    Parameters
    ----------
    m : str
        Original message; only its number of space-separated tokens is used.
    words : sequence of str
        Pool of tokens to draw from.  Must be non-empty unless ``m`` is empty.

    Returns
    -------
    str
        The sampled tokens joined by single spaces; ``''`` for an empty ``m``.

    Raises
    ------
    ValueError
        Raised by ``numpy.random.choice`` if ``words`` is empty while ``m`` is not.
    """
    if not m:
        # ''.split(' ') yields [''], which would otherwise count as one token.
        return ''
    num_words = len(m.split(' '))
    return ' '.join(choice(words, size=num_words))

In [88]:
# Overwrite the text of every 'plain' row with a randomly generated message.
# NOTE(review): only rows labelled 'plain' are replaced. The remaining rows keep their
# original 'Txt' and 'Key' values, so if those are real ciphertexts with their keys the
# underlying messages may still be recoverable from the dummy file — confirm before sharing.
pidx = dummy.index[dummy['Label'] == 'plain']
dummy.loc[pidx, 'Txt'] = [sample_new_msg(m, words) for m in dummy.loc[pidx, 'Txt']]

In [90]:
dummy.sample(5)

Unnamed: 0,index,Txt,Label,Cipher,Key
790,324488,-RmR:\t\t`j``o~3^'y:m/``o~/:.'B3y3/W3...,cipher,substitution,"$HQu1?y""M4�,XL[zS\b&Odh(*v/9P@g+aB..."
12,234113,[ISUAf!J&RlnU+kY!#S-P!UPV)e(mpxJ...,cipher,vigenere,tTyZMg+4(] 12OS[nf
105,357832,*q|Mig=e TJ86PSba~jkHuB)>C;ZD'Vrfv#...,cipher,fakeaes,95
47,8758,WX CONTESTE 1 9999 23/16:00\nMETAR 25603 ANTE...,plain,plain,
677,94764,^�qqsuPa`aheg:qw}:q~Pv]x\t|v_}qPcccy:]PPvvv...,cipher,caesar,48


In [92]:
dummy.to_pickle('../data/dummy.pkl')