## Read and Parse the logs

In [1]:
with open('../data/auth.log') as f:
    data = f.readlines()

In [2]:
data[:10]

['Mar 27 13:06:56 ip-10-77-20-248 sshd[1291]: Server listening on 0.0.0.0 port 22.\n',
 'Mar 27 13:06:56 ip-10-77-20-248 sshd[1291]: Server listening on :: port 22.\n',
 'Mar 27 13:06:56 ip-10-77-20-248 systemd-logind[1118]: Watching system buttons on /dev/input/event0 (Power Button)\n',
 'Mar 27 13:06:56 ip-10-77-20-248 systemd-logind[1118]: Watching system buttons on /dev/input/event1 (Sleep Button)\n',
 'Mar 27 13:06:56 ip-10-77-20-248 systemd-logind[1118]: New seat seat0.\n',
 'Mar 27 13:08:09 ip-10-77-20-248 sshd[1361]: Accepted publickey for ubuntu from 85.245.107.41 port 54259 ssh2: RSA SHA256:Kl8kPGZrTiz7g4FO1hyqHdsSBBb5Fge6NWOobN03XJg\n',
 'Mar 27 13:08:09 ip-10-77-20-248 sshd[1361]: pam_unix(sshd:session): session opened for user ubuntu by (uid=0)\n',
 'Mar 27 13:08:09 ip-10-77-20-248 systemd: pam_unix(systemd-user:session): session opened for user ubuntu by (uid=0)\n',
 'Mar 27 13:08:09 ip-10-77-20-248 systemd-logind[1118]: New session 1 of user ubuntu.\n',
 'Mar 27 13:09:37

## Testing Pygrok

In [3]:
import pygrok

In [4]:
from pygrok import Grok
text = 'gary is male, 25 years old and weighs 68.5 kilograms'
pattern = '%{WORD:name} is %{WORD:gender}, %{NUMBER:age} years old and weighs %{NUMBER:weight} kilograms'
grok = Grok(pattern)
print(grok.match(text))

{'gender': 'male', 'name': 'gary', 'weight': '68.5', 'age': '25'}


### Following the instructions from here: https://www.elastic.co/blog/grokking-the-linux-authorization-logs

In [5]:
text = 'Feb 21 00:13:35 localhost sshd[7483]: Accepted password for vagrant from 192.168.33.1 port 58803 ssh2'
pattern = '%{SYSLOGTIMESTAMP:system.auth.timestamp} %{SYSLOGHOST:system.auth.hostname} sshd(?:\\[%{POSINT:system.auth.pid}\\])?: %{DATA:system.auth.ssh.event} %{DATA:system.auth.ssh.method} for (invalid user )?%{DATA:system.auth.user} from %{IPORHOST:system.auth.ip} port %{NUMBER:system.auth.port} ssh2(: %{GREEDYDATA:system.auth.ssh.signature})?'
grok = Grok(pattern)
print(grok.match(text))

None


In [6]:
text = 'Feb 21 21:56:12 localhost sshd[3430]: Invalid user test from 10.0.2.2'
pattern = '%{SYSLOGTIMESTAMP:system.auth.timestamp} %{SYSLOGHOST:system.auth.hostname} sshd(?:\\[%{POSINT:system.auth.pid}\\])?: %{DATA:system.auth.ssh.event} user %{DATA:system.auth.user} from %{IPORHOST:system.auth.ip}'
grok = Grok(pattern)
print(grok.match(text))

None


### The above method doesn't seem to work

### Based on Logalyzer - https://github.com/hatRiot/logalyzer

In [7]:
import re

# parse user from various lines
def ParseUsr(line):
    usr = None
    if "Accepted password" in line:
        usr = re.search(r'(\bfor\s)(\w+)', line)
    elif "sudo:" in line:
        usr = re.search(r'(sudo:\s+)(\w+)', line)
    elif "authentication failure" in line:
        usr = re.search(r'(user=)(\w+)', line)
    elif "for invalid user" in line:
        usr = re.search(r'(\buser\s)(\w+)', line)
    if usr is not None:
        return usr.group(2)

# parse an IP from a line
def ParseIP(line):
    if ' from ' in line:
        ip = re.search(r'(\bfrom\s)(\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b)', line)
        if ip is not None:
            return ip.group(2)
    elif ' rhost=' in line:
        ip = re.search(r'(\brhost=)(\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b)', line)
        if ip is not None:
            return ip.group(2)
    else:
        print('ERROR in extracting the IP')

# parse a date from the line
def ParseDate(line):
    date = re.search(r'^[A-Za-z]{3}\s*[0-9]{1,2}\s[0-9]{1,2}:[0-9]{2}:[0-9]{2}', line)
    if date is not None:
        return date.group(0)

# parse a command from a line
def ParseCmd(line):
    # parse command to end of line 
    cmd = re.search(r'(\bCOMMAND=)(.+?$)', line)
    if cmd is not None:
        return cmd.group(2)
    
# parse hostname
def ParseHostname(line):
    # parse hostname
    hostname = re.search(r'(^\w+\ \d+\ \d{2}:\d{2}:\d{2})\ ([\w\-]+)\ ', line)
    if hostname is not None:
        return hostname.group(2)


In [8]:
def get_fail_line_info(line):
    if 'Too many authentication failures' in line:
        return
    elif 'failed adding user' in line:
        return
    else:
        date = ParseDate(line)
        usr = ParseUsr(line)
        ip = ParseIP(line)   
        hostname=ParseHostname(line)
    return ['2017 ' + date, hostname, usr, ip]

In [9]:
lines = [d for d in data if 'fail' in d]
out_matrix = []
for line in lines:
    info = get_fail_line_info(line)
    if info:
        info.append(line)
        out_matrix.append(info)

In [10]:
import pandas as pd
%matplotlib inline

In [11]:
df = pd.DataFrame(out_matrix,columns=['date','hostname','user','ip','logline'])
df['date'] = pd.to_datetime(df['date'],format='%Y %b %d %H:%M:%S')
df.index = df['date']
df = df[['hostname','user','ip','logline']]

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 484 entries, 2017-03-29 10:42:43 to 2017-04-20 13:51:02
Data columns (total 4 columns):
hostname    313 non-null object
user        343 non-null object
ip          484 non-null object
logline     484 non-null object
dtypes: object(4)
memory usage: 18.9+ KB


In [13]:
df.head()

Unnamed: 0_level_0,hostname,user,ip,logline
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-03-29 10:42:43,ip-10-77-20-248,elastic_user_7,127.0.0.1,Mar 29 10:42:43 ip-10-77-20-248 sshd[1193]: pa...
2017-03-29 13:07:13,ip-10-77-20-248,elastic_user_0,85.245.107.41,Mar 29 13:07:13 ip-10-77-20-248 sshd[2257]: pa...
2017-03-29 13:41:13,ip-10-77-20-248,elastic_user_2,85.245.107.41,Mar 29 13:41:13 ip-10-77-20-248 sshd[2328]: pa...
2017-03-29 14:15:38,ip-10-77-20-248,,181.25.206.27,Mar 29 14:15:38 ip-10-77-20-248 sshd[2414]: pa...
2017-03-29 14:15:52,ip-10-77-20-248,,181.25.206.27,Mar 29 14:15:52 ip-10-77-20-248 sshd[2414]: PA...


In [14]:
df['user'].unique()

array(['elastic_user_7', 'elastic_user_0', 'elastic_user_2', None,
       'elastic_user_6', 'elastic_user_4', 'root', 'elastic_user_3',
       'elastic_user_8', 'elastic_user_9', 'elastic_user_5',
       'elastic_user_1', 'bin'], dtype=object)

In [15]:
df.to_csv('../data/failed_auth.log.csv')