In [1]:
# Bootstrap PySpark: put the Spark installation's Python sources on sys.path,
# then run findspark before importing pyspark.
import os
import sys
# Root of the Spark installation; raises KeyError if SPARK_HOME is not set.
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
# NOTE(review): the py4j archive name is pinned to 0.10.9; presumably it has to
# match the py4j version bundled with the installed Spark -- confirm on upgrade.
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

# NOTE(review): findspark.init() presumably performs similar sys.path setup on
# its own, which would make the manual appends above redundant -- TODO confirm.
import findspark
findspark.init()
import pyspark

In [2]:
# Local Spark settings: thread count for the local master and driver memory (GB).
number_cores = 2
memory_gb = 4

conf = (
    pyspark.SparkConf()
    .setMaster('local[{}]'.format(number_cores))
    .set('spark.driver.memory', '{}g'.format(memory_gb))
)

# getOrCreate() hands back the SparkContext already running in this kernel, if
# any, so re-running the cell does not fail with "Cannot run multiple
# SparkContexts at once". On a fresh kernel it builds the context from `conf`,
# exactly as the direct constructor call did.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [36]:
# Load the auth logs as an RDD of raw text lines. The path is relative, so it
# is resolved against the kernel's current working directory.
auth_log_dir = "Downloads/auth_log/"
logs = sc.textFile(auth_log_dir)

### How many failed attempts to access the server as root are there?

#### Total failed passwords for root:

In [37]:
def _repeat_multiplier(line):
    """Return x from a "message repeated x times" log line.

    The fourth colon-separated field of the line is split on single spaces and
    the token at index 3 is converted to an int.
    """
    repeated_field = line.split(":")[3]
    return int(repeated_field.split(" ")[3])


# All lines reporting a failed password for root, then the two disjoint
# subsets: collapsed "message repeated" lines and ordinary single lines.
failedroot = logs.filter(lambda entry: "Failed password for root" in entry)
messagerepeated = failedroot.filter(lambda entry: "message repeated" in entry)
norepeat = failedroot.filter(lambda entry: "message repeated" not in entry)

# Sum of x over every "message repeated x times" line.
repeatcount = messagerepeated.map(_repeat_multiplier).sum()

# Ordinary failure lines count once each; collapsed lines contribute their x.
total = norepeat.count() + repeatcount
print("Total failed password attempts for root: " + str(total))

Total failed password attempts for root: 41045


#### Total failed authentications for root:

In [38]:
def _is_root_auth_failure(line):
    """True when the line mentions an authentication failure and user=root.

    Both checks are plain substring tests, so the first one also matches lines
    where the phrase is followed by more characters (e.g. "authentication
    failures"). NOTE(review): confirm such lines are meant to be counted.
    """
    return "authentication failure" in line and "user=root" in line


failedauth = logs.filter(_is_root_auth_failure)
root_auth_failure_count = failedauth.count()
print("total authentication failures for root: " + str(root_auth_failure_count))

total authentication failures for root: 26376


As I understand it, each authentication failure represents one overall login attempt, which can consist of 1-3 password attempts (in the case of this log) made within the same connection. Each authentication failure appears to share an sshd number and IP address with its associated set of password attempts. My take is that both password failures and authentication failures can be used as proxies for access attempts.

#### List of countries from which these attempts were carried out:

In [30]:
import geoip2.database
import geoip2.errors


def IPtoCountry(IP, db_path='Downloads/GeoLite2-Country_20201222/GeoLite2-Country.mmdb'):
    """Look up the country name for an IP address in a GeoLite2 country database.

    Args:
        IP: the address to look up, as a string.
        db_path: path to the GeoLite2-Country ``.mmdb`` file. Defaults to the
            location this notebook has always used, so existing one-argument
            calls keep working.

    Returns:
        The country name stored for the address, or the string "Invalid IP"
        when the value cannot be parsed as an address or the database has no
        record for it.

    Raises:
        Any error that is not about the address itself (for example a missing
        or unreadable database file). These used to be hidden by a bare
        ``except`` and reported as "Invalid IP" for every input.
    """
    # NOTE(review): the database is reopened on every call. When this is
    # mapped over a large RDD, opening it once per partition (mapPartitions)
    # would likely be much cheaper -- left unchanged to keep call sites intact.
    with geoip2.database.Reader(db_path) as reader:
        try:
            return reader.country(IP).country.name
        except (ValueError, TypeError, geoip2.errors.AddressNotFoundError):
            # Unparseable value (e.g. a hostname) or an address with no record.
            return "Invalid IP"
    

In [32]:
#IP addresses from authentication failures
failedrootIP = failedauth.map(lambda line: line.split("=")[6].split(" ")[0])
#map country names to IP addresses
failedrootcountry = failedrootIP.map(lambda line: (IPtoCountry(line), line))

Countries by number of authentication failures:

In [35]:
from collections import Counter

# Count (country, address) pairs per country on the driver, then list the
# countries from most to least frequent.
countryfreq = failedrootcountry.countByKey()
root_country_counter = Counter(countryfreq)
root_country_counter.most_common()

[('China', 16428),
 ('United States', 1880),
 ('France', 1114),
 ('Germany', 683),
 ('India', 620),
 ('Singapore', 542),
 ('Hong Kong', 463),
 ('Brazil', 459),
 ('South Korea', 454),
 ('Netherlands', 392),
 ('Russia', 355),
 ('United Kingdom', 249),
 ('Canada', 230),
 ('Indonesia', 217),
 ('Vietnam', 201),
 ('Argentina', 182),
 ('Japan', 151),
 ('Colombia', 111),
 ('Mexico', 103),
 ('Poland', 99),
 ('Italy', 92),
 ('Thailand', 90),
 ('Taiwan', 75),
 ('Chile', 63),
 ('Hungary', 55),
 ('Bulgaria', 55),
 ('Philippines', 52),
 ('Iran', 49),
 ('Spain', 49),
 ('Malaysia', 48),
 ('Turkey', 46),
 ('Venezuela', 44),
 ('South Africa', 41),
 ('Sweden', 40),
 ('Croatia', 40),
 ('Uzbekistan', 38),
 ('Portugal', 35),
 ('Australia', 34),
 ('Romania', 33),
 ('Pakistan', 29),
 ('Lithuania', 27),
 ('Ukraine', 25),
 ('United Arab Emirates', 22),
 ('Egypt', 21),
 ('Bangladesh', 18),
 ('Tunisia', 17),
 ('Uruguay', 16),
 ('Serbia', 15),
 ('Paraguay', 15),
 ('Nigeria', 15),
 ('Panama', 14),
 ('Czechia', 14),

### How many failed attempts to access the server as non-root users are there? 

#### Total failed passwords for non-root:

In [40]:
def _is_nonroot_password_failure(line):
    """True for "Failed password" lines that do not contain "for root"."""
    if "Failed password" not in line:
        return False
    return "for root" not in line


failednonroot = logs.filter(_is_nonroot_password_failure)
# Author's note: the log has no "message repeated" lines for non-root users,
# so a plain line count is used here.
total_ = failednonroot.count()
print("Total failed password attempts for non-root users: " + str(total_))

Total failed password attempts for non-root users: 12444


#### Total failed authentications for non-root:

In [41]:
def _is_nonroot_auth_failure(line):
    """True when the line mentions an authentication failure without user=root."""
    return "authentication failure" in line and "user=root" not in line


failedauth_ = logs.filter(_is_nonroot_auth_failure)
nonroot_auth_failure_count = failedauth_.count()
print("total authentication failures for non-root: " + str(nonroot_auth_failure_count))


total authentication failures for non-root: 12465


#### List of attempted usernames:

In [42]:
import re

# Matches "Failed password for [invalid user ]<name> from <addr> port <n>" and
# captures <name>. The "invalid user " prefix is optional.
_FAILED_PASSWORD_USER = re.compile(
    r"Failed password for (?:invalid user )?(.*?) from \S+ port \d+"
)


def _attempted_username(line):
    """Return the username targeted by a "Failed password" log line.

    The previous positional parse (seventh space-separated token after the
    third colon) only lands on the username when the line says "invalid user";
    for a line of the form "Failed password for NAME from ADDR ..." it returns
    ADDR instead. Matching the message text handles both forms.

    Lines that do not match the expected message shape are grouped under the
    visible key "<unparsed>" rather than raising or being dropped.
    """
    match = _FAILED_PASSWORD_USER.search(line)
    return match.group(1) if match else "<unparsed>"


# (username, original line) pairs for every non-root failed-password line.
usernames = failednonroot.map(lambda line: (_attempted_username(line), line))

In [43]:
# Count the (username, line) pairs per username on the driver, then list the
# usernames from most to least frequent.
usernamefreq = usernames.countByKey()
username_counter = Counter(usernamefreq)
username_counter.most_common()

[('admin', 345),
 ('test', 274),
 ('pi', 182),
 ('oracle', 122),
 ('user', 103),
 ('ubuntu', 79),
 ('guest', 68),
 ('nagios', 67),
 ('git', 65),
 ('ftpuser', 61),
 ('student', 60),
 ('support', 48),
 ('mysql', 45),
 ('hadoop', 42),
 ('ftp', 42),
 ('operator', 41),
 ('deploy', 41),
 ('testuser', 40),
 ('tomcat', 39),
 ('vnc', 37),
 ('info', 37),
 ('ubnt', 35),
 ('tester', 32),
 ('service', 31),
 ('testftp', 30),
 ('user1', 27),
 ('test2', 27),
 ('minecraft', 27),
 ('apache', 25),
 ('toor', 25),
 ('temp', 24),
 ('webmaster', 23),
 ('demo', 23),
 ('zabbix', 22),
 ('teamspeak', 22),
 ('web', 22),
 ('linux', 21),
 ('ts3', 20),
 ('system', 19),
 ('squid', 19),
 ('server', 19),
 ('art', 19),
 ('rpm', 19),
 ('deployer', 18),
 ('Administrator', 18),
 ('gpadmin', 18),
 ('ftpadmin', 18),
 ('test1', 18),
 ('cpanel', 17),
 ('alex', 17),
 ('administrator', 16),
 ('vagrant', 16),
 ('debian', 16),
 ('testing', 16),
 ('support1', 16),
 ('public', 15),
 ('webalizer', 15),
 ('download', 15),
 ('jboss', 1

#### List of countries from which these attempts were carried out:

In [44]:
#IP addresses from authentication failures
failednonrootIP = failedauth_.map(lambda line: line.split("=")[6].split(" ")[0])
#map country names to IP addresses
failednonrootcountry = failednonrootIP.map(lambda line: (IPtoCountry(line), line))

In [45]:
# Count (country, address) pairs per country on the driver, then list the
# countries from most to least frequent.
country_freq = failednonrootcountry.countByKey()
nonroot_country_counter = Counter(country_freq)
nonroot_country_counter.most_common()

[('China', 4244),
 ('United States', 1580),
 ('France', 881),
 ('India', 516),
 ('Singapore', 489),
 ('Brazil', 486),
 ('South Korea', 416),
 ('Germany', 404),
 ('Russia', 371),
 ('Hong Kong', 249),
 ('Netherlands', 243),
 ('United Kingdom', 227),
 ('Canada', 220),
 ('Vietnam', 198),
 ('Indonesia', 136),
 ('Italy', 106),
 ('Colombia', 101),
 ('Argentina', 99),
 ('Japan', 95),
 ('Mexico', 88),
 ('Poland', 78),
 ('Thailand', 74),
 ('Taiwan', 65),
 ('Malaysia', 52),
 ('Lithuania', 52),
 ('Panama', 47),
 ('Iran', 46),
 ('Ukraine', 45),
 ('Spain', 45),
 ('Sweden', 36),
 ('Pakistan', 34),
 ('Romania', 30),
 ('Philippines', 30),
 ('Portugal', 27),
 ('Hungary', 26),
 ('Bolivia', 25),
 ('Croatia', 25),
 ('Venezuela', 23),
 ('Belgium', 23),
 ('Australia', 23),
 ('Paraguay', 22),
 ('Ecuador', 22),
 ('Czechia', 22),
 ('Turkey', 19),
 ('South Africa', 19),
 ('United Arab Emirates', 18),
 ('Chile', 18),
 ('Tunisia', 17),
 ('Uzbekistan', 15),
 ('Bangladesh', 14),
 ('Egypt', 13),
 ('Nigeria', 13),
 ('

### Which date has the highest number of attack attempts?

Here I use authentication failures as a proxy for attacks.

In [47]:
# Every log line that mentions an authentication failure, whatever the user.
combinedattacks = logs.filter(lambda entry: "authentication failure" in entry)

In [48]:
# Key each line by its first six characters (e.g. "Dec  8" in the results
# below), giving (date prefix, line) pairs.
dates = combinedattacks.keyBy(lambda entry: entry[:6])

Dates by number of attacks:

In [49]:
# Count the (date, line) pairs per date on the driver, then list the dates
# from most to least frequent.
datesfreq = dates.countByKey()
date_counter = Counter(datesfreq)
date_counter.most_common()

[('Dec  8', 1969),
 ('Dec  6', 1852),
 ('Dec  7', 1771),
 ('Dec  3', 1544),
 ('Dec  4', 1487),
 ('Nov 27', 1464),
 ('Dec  2', 1429),
 ('Dec  5', 1427),
 ('Nov 24', 1390),
 ('Dec 13', 1367),
 ('Nov 28', 1288),
 ('Dec 11', 1282),
 ('Nov 22', 1236),
 ('Dec  9', 1221),
 ('Nov 17', 1219),
 ('Nov 23', 1183),
 ('Nov 29', 1168),
 ('Dec 12', 1165),
 ('Dec 15', 1160),
 ('Nov 25', 1113),
 ('Nov 26', 1071),
 ('Dec 10', 1062),
 ('Nov 16', 1061),
 ('Dec  1', 1029),
 ('Dec 16', 964),
 ('Nov 20', 954),
 ('Nov 21', 927),
 ('Dec 14', 886),
 ('Nov 19', 852),
 ('Nov 15', 793),
 ('Nov 30', 764),
 ('Nov 18', 743)]

December 8th was the day FireEye disclosed that it had been attacked, so it is plausible that the high numbers on 12/6-12/8 are related to the SolarWinds attack.