In [1]:
import pandas as pd

In [2]:
nl2bash = pd.read_json('nl2bash-data.json', orient='index')

In [3]:
nl2bash.cmd = nl2bash.cmd.apply(lambda x: x[2:] if x.startswith('$ ') else x)

In [4]:
set(nl2bash.cmd)

{'find . -path "./sr*sc"',
 "find . -maxdepth 1 -type f -name '\\.*' | sed -e 's,^\\./\\.,,' | sort | xargs -iname mv .name name",
 "find ~ -name '*.txt' -print0 | xargs -0 cat",
 'ls $PWD/cat.wav',
 "ifconfig | grep -i hwaddr | cut -d ' ' -f9",
 'find . -perm 0777 -type d -exec ls -l {} \\;',
 'find -mmin +15 -mmin -25',
 'find / -group managers -print',
 'find $INTRANETDESTINATION/weekly -mtime +32 -exec rm {} \\;',
 'find . \\( -name somedir -prune \\) , \\( -name bin -prune \\) -o \\( -name "*.txt" -print \\)',
 'find /mnt/zip -name "*prefs copy" -print0 | xargs    -0 -p /bin/rm',
 'read -p "Are you sure? " -n 1 -r',
 'find -iname "MyCProgram.c" -exec md5sum {} \\;',
 'find ${path} -P -type f',
 'find . -type f -mtime +31 -print0 | xargs -0 -r rm -f',
 'ifconfig eth0 | grep -Eo ..\\(\\:..\\){5}',
 'find / -type f -exec echo {} \\;',
 'history | awk \'{print $2}\' | awk \'BEGIN {FS="|"}{print $1}\' | sort | uniq -c | sort -n | tail | sort -nr',
 'cp -R SRCFOLDER DESTFOLDER/',
 'find

In [5]:
nl2bash.cmd.nunique(), len(nl2bash)

(8525, 10347)

In [6]:
import bashlex

In [7]:
def try_parse(x):
    try:
        return bashlex.parse(x)
    except:
        return None

def recurrent_command_extractor_generator(x):
#     print(f'parsing node {x}')
    if x.kind == 'command':
#         print(f'\t parsed as command containing {x.parts[0].word}')
        yield x.parts[0].word
    elif x.kind == 'commandsubstitution':
#         print(f'\t parsed as commandsubstitution containing {x.command}')
        yield from recurrent_command_extractor_generator(x.command)
    if hasattr(x, 'parts'):
#         print(f'\t gone further into parts')
        for part in x.parts:
            yield from recurrent_command_extractor_generator(part)
#     print('considered leaf')
    
def extract_commands(x):
    parsed = try_parse(x)
    if parsed is None:
        return []
    
    return [c for c in recurrent_command_extractor_generator(parsed[0])]

In [8]:
nl2bash['cmdset'] = nl2bash.cmd.apply(lambda x: set(extract_commands(x)))

In [9]:
with open('all_commands.txt') as f:
    possible_commands = [s.strip() for s in f.readlines()]

In [10]:
selector = nl2bash.cmdset.apply(lambda x: len(x & set(possible_commands)) < len(set([i.replace('$', '') for i in x]) & set(possible_commands)))

In [11]:
nl2bash.loc[selector, nl2bash.columns=='cmd'] = nl2bash[selector].cmd.apply(lambda x: x[1:])

In [12]:
nl2bash['cmdset'] = nl2bash.cmd.apply(lambda x: set(extract_commands(x)))

In [13]:
', '.join(nl2bash.cmdset.explode().value_counts().head(70).index)

'find, xargs, grep, sort, awk, sed, echo, cut, cat, head, wc, tr, sudo, ls, tail, uniq, mkdir, rsync, split, read, dirname, which, readlink, date, pwd, diff, ssh, tee, cd, df, chown, ln, comm, set, mount, ifconfig, basename, od, md5sum, rev, history, shopt, hostname, mktemp, column, tar, nl, yes, ping, dig, mv, whoami, seq, join, paste, cp, egrep, tac, less, cpio, cal, fold, du, who, gzip, rm, tree, more, chmod, uname'

In [14]:
len(set(nl2bash.cmdset.explode().values))

444

In [15]:
import numpy as np
import yaml

In [16]:
with open('cathegories.yml') as f:    
    cats = yaml.safe_load(f)
        
back_dict = {}
for cath in cats:
    for util in cath['utils']:
        back_dict[util] = cath['name']
        
print(len(set(back_dict.keys())), len(back_dict.keys()))
print('cathegorized, not listed:', 
      set(back_dict.keys()) - (set(nl2bash.cmdset.explode().values) & set(possible_commands)))
print('listed, not cathegorized: ', 
      (set(nl2bash.cmdset.explode().values) & set(possible_commands)) - set(back_dict.keys()))

119 119
cathegorized, not listed: {'free', 'md5', 'cal', 'tree', 'uptime', 'rename'}
listed, not cathegorized:  set()


In [17]:
# manually add actually normal commands, that were not listed somehow
possible_commands += list(set(back_dict.keys()) - (set(nl2bash.cmdset.explode().values) & set(possible_commands)))
possible_commands = set(possible_commands)

In [18]:
nl2bash['cmdcath'] = nl2bash.cmdset.apply(lambda x: set([back_dict[i] for i in x if i in back_dict.keys()]))

In [19]:
nl2bash.cmdcath.apply(len).value_counts()

1    7337
2    2701
3     293
0       9
4       7
Name: cmdcath, dtype: int64

In [20]:
nl2bash[nl2bash.cmdcath.apply(len) == 0]

Unnamed: 0,invocation,cmd,cmdset,cmdcath
482,"Compresses all files listed in array $*, execu...",compress $* &,{compress},{}
1395,Execute /usr/bin/find with $* arguments,/usr/bin/find $*,{/usr/bin/find},{}
1396,Execute /usr/bin/find with $* arguments where ...,/usr/bin/find ./ $*,{/usr/bin/find},{}
1548,Find a.out and object files in the current dir...,find . ( -name a.out -o -name *.o ) -print,{},{}
4397,"Lookup information for user ""vivek""",finger vivek,{finger},{}
6153,Remove all files containing 'sample' (case ins...,"/usr/bin/find /home/user/Series/ -iname ""*samp...",{/usr/bin/find},{}
8296,delete all the files in the current folder whi...,find . ( -name '*.bak' -o -name *.backup ) -ty...,{},{}
9292,find all the files in the current folder which...,find . ( -name a.out -o -name *.o ) -print,{},{}
9343,find all the files in the file system which ha...,find / \( -perm -4000 -fprintf /root/suid.t...,{},{}


In [21]:
# filter out those empty commands, they are in small number and weird anyway
nl2bash = nl2bash[nl2bash.cmdcath.apply(len) > 0]

In [22]:
nl2bash.cmdset = nl2bash.cmdset.apply(lambda x: x & set(possible_commands))

In [23]:
nl2bash.cmdset.apply(len).value_counts()

1    6780
2    2576
3     720
4     203
5      48
6      11
Name: cmdset, dtype: int64

In [24]:
nl2bash.cmdcath.apply(len).value_counts()

1    7337
2    2701
3     293
4       7
Name: cmdcath, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split

In [43]:
unique_commands = nl2bash.drop_duplicates(subset='cmd', keep='first')[['cmd', 'cmdset']]

In [79]:
commands_popularity = unique_commands.cmdset.explode().value_counts()
commands_to_exclude = commands_popularity[commands_popularity == 1]
commands_popularity = commands_popularity[commands_popularity > 1]

In [85]:
unique_commands = unique_commands[~unique_commands.cmdset.apply(lambda x: len(x & set(commands_to_exclude.index))>0)]

In [88]:
unique_commands['least_popular_command'] = unique_commands.cmdset.apply(lambda x: commands_popularity[commands_popularity.index.isin(x)].idxmin())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_commands['least_popular_command'] = unique_commands.cmdset.apply(lambda x: commands_popularity[commands_popularity.index.isin(x)].idxmin())


In [92]:
pd.DataFrame({'all_cmd': unique_commands.cmdset.explode().value_counts(), 
              'least_pop_cmd': unique_commands.least_popular_command.value_counts()})

Unnamed: 0,all_cmd,least_pop_cmd
apropos,6,6
awk,382,92
basename,54,49
bash,10,10
bg,4,4
...,...,...
whoami,38,34
xargs,993,695
yes,47,31
zcat,32,32


In [99]:
commands_train, commands_test = train_test_split(unique_commands['cmd'], test_size=1200, random_state=42,
                                                 stratify=unique_commands['least_popular_command'])

In [102]:
train_set = nl2bash[nl2bash.cmd.isin(commands_train.values)]
test_set = nl2bash[nl2bash.cmd.isin(commands_test.values)]

In [106]:
train_set.to_json('train_filtered_nl2bash.json', orient='records', lines=True)
train_set

Unnamed: 0,invocation,cmd,cmdset,cmdcath
1,"Copy loadable kernel module ""mymodule.ko"" to t...",sudo cp mymodule.ko /lib/modules/$(uname -r)/k...,"{sudo, uname}","{SYSINFO, SECOP}"
2,"Display all lines containing ""IP_MROUTE"" in th...",cat /boot/config-`uname -r` | grep IP_MROUTE,"{cat, grep, uname}","{SYSINFO, TXTPROC}"
3,Display current running kernel's compile-time ...,cat /boot/config-`uname -r`,"{cat, uname}","{SYSINFO, TXTPROC}"
4,"Find all loadable modules for current kernel, ...",find /lib/modules/`uname -r` -regex .*perf.*,"{find, uname}","{SYSINFO, FDOPS}"
5,"Look for any instance of ""HIGHMEM"" in the curr...","grep ""HIGHMEM"" /boot/config-`uname -r`","{grep, uname}","{SYSINFO, TXTPROC}"
...,...,...,...,...
10343,using exec in find command to dispaly the sear...,find . ... -exec cat {} \; -exec echo \;,{find},{FDOPS}
10344,verbosely create intermediate directoriy tmp a...,mkdir -pv /tmp/boostinst,{mkdir},{FDOPS}
10345,view the manual page of find,man find,{man},{HELPDOC}
10346,"wait 2 seconds and then print ""hello""","echo ""hello `sleep 2 &`""","{echo, sleep}","{SHUTIL, PROCCTRL}"


In [107]:
test_set.to_json('test_filtered_nl2bash.json', orient='records', lines=True)
test_set

Unnamed: 0,invocation,cmd,cmdset,cmdcath
6,"Search for command ""tail"" in the maps of the p...",cat /proc/2671/maps | grep `which tail`,"{which, cat, grep}","{TXTPROC, HELPDOC}"
8,Display all lines containing UTRACE in the cur...,grep UTRACE /boot/config-$(uname -r),"{grep, uname}","{SYSINFO, TXTPROC}"
11,Abort the shell or script on the first failed ...,set -e,{set},{SHUTIL}
28,"Add directory ""$HOME/Pictures"" to the director...","pushd ""$HOME/Pictures""",{pushd},{FDOPS}
56,"Answer ""y"" to all prompts of ""rm -rf foo""",yes | rm -ri foo,"{rm, yes}","{SHUTIL, FDOPS}"
...,...,...,...,...
10298,"split file ""${fspec} into pieces named as ""xyz...",split --number=l/6 ${fspec} xyzzy.,{split},{SHUTIL}
10305,split file input.txt into pieces per 1 line na...,split --lines=1 --suffix-length=5 input.txt ou...,{split},{SHUTIL}
10319,"ssh into ""ssh.myhost.net"" as user ""myusername""...","ssh myusername@ssh.myhost.net ""mkdir -p $2""",{ssh},{NET}
10341,use regex with find command,find . -regextype posix-egrep -regex '\./[a-f0...,{find},{FDOPS}
