<a href="https://colab.research.google.com/github/Masupa/Daily-Learning/blob/main/AnalyzingLogFile/AnalyzingLogFiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
from google.colab import files
# Upload the log file through Colab; `uploaded` is later indexed by file name
# and its value wrapped in io.BytesIO, i.e. it is used as name -> raw bytes.
uploaded = files.upload()

Saving new_log to new_log (1)


In [None]:
import io
import re
from datetime import datetime
from urllib.parse import urlsplit

import pytz

**Helper Functions - Regular Expressions**

In [None]:
def parse_str(string):
  """
  Returns the string with its first and last characters removed.

  Used as a `read_csv` converter to drop the delimiters wrapped around a log
  field (quotes or square brackets).

  Example:
    >>> parse_str('[My_string]')
    'My_string'

  Returns '-' when `string` is None or cannot be sliced (e.g. a number).
  """

  if string is None:
    return '-'

  try:
    return string[1:-1]
  except (TypeError, ValueError):
    # Slicing a non-sequence raises TypeError, not ValueError.
    return '-'

In [None]:
def parse_int(string):
  """
  Converts `string` to an integer.

  Returns 0 when `string` is None or does not hold a valid integer literal.
  """

  if string is not None:
    try:
      return int(string)
    except ValueError:
      # Not an integer literal (e.g. the '-' placeholder): fall through to 0.
      pass
  return 0

In [None]:
def parse_datetime(string):
  """
  Parses a bracketed timestamp formatted as
  `[day/month/year:hour:minute:second zone]` into a timezone-aware datetime.

  Example:
    `>>> parse_datetime('[13/Nov/2015:11:45:42 +0000]')`
    return value: datetime.datetime(2015, 11, 13, 11, 45, 42, tzinfo=<UTC>)

  Due to problems parsing the timezone (`%z`) with `datetime.strptime`, the
  `+HHMM` / `-HHMM` zone is converted to minutes by hand and attached with the
  `pytz` library.

  Returns '-' when `string` is None, is not a string, or does not match the
  expected layout.
  """

  if string is None:
    return '-'

  try:
    # Drop the leading '[' and the trailing ' +HHMM]'.
    dt = datetime.strptime(string[1:-7], '%d/%b/%Y:%H:%M:%S')

    zone = string[-6:-1]  # e.g. '+0100'
    sign, digits = zone[:1], zone[1:]
    if sign not in ('+', '-') or len(digits) != 4 or not digits.isdigit():
      return '-'

    offset_minutes = int(digits[:2]) * 60 + int(digits[2:])
    # The sign applies to the whole offset: '-0530' is -330 minutes, not
    # -5 * 60 + 30 = -270.
    if sign == '-':
      offset_minutes = -offset_minutes

    return dt.replace(tzinfo=pytz.FixedOffset(offset_minutes))

  except (TypeError, ValueError):
    # TypeError: input is not sliceable / not a string.
    # ValueError: the date part does not match the expected format.
    return '-'

**Load logfile dataset**

In [None]:
# Colab can save a re-uploaded file under a suffixed name (e.g. "new_log (1)"),
# in which case the "new_log" key may be absent from `uploaded`; fall back to
# the first uploaded file instead of failing with a KeyError.
log_bytes = uploaded['new_log'] if 'new_log' in uploaded else next(iter(uploaded.values()))

log_data = pd.read_csv(
    io.BytesIO(log_bytes),
    # `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='skip'`
    # (available since pandas 1.3) is its replacement.
    on_bad_lines='skip',
    # Split on whitespace that is neither inside double quotes nor inside
    # square brackets, so quoted requests and bracketed timestamps stay whole.
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    na_values='-',
    header=None,
    usecols=[0, 3, 4, 5, 6, 7, 8],
    names=['Client_IP_Address', 'Time_of_activity', 'Requested_Page',
           'Status_Code', 'Size_of_Page', 'Referer_Page', 'Client_Device'],
    # Per-column clean-up: strip delimiters, parse numbers and timestamps.
    converters={'Time_of_activity': parse_datetime,
                'Requested_Page': parse_str,
                'Status_Code': parse_int,
                'Size_of_Page': parse_int,
                'Referer_Page': parse_str,
                'Client_Device': parse_str},
)

In [None]:
# Preview the first rows of the parsed log to check the columns and converters
log_data.head()

Unnamed: 0,Client_IP_Address,Time_of_activity,Requested_Page,Status_Code,Size_of_Page,Referer_Page,Client_Device
0,109.169.248.247,2015-12-12 18:25:11+01:00,GET /administrator/ HTTP/1.1,200,4263,,Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20...
1,109.169.248.247,2015-12-12 18:25:11+01:00,POST /administrator/index.php HTTP/1.1,200,4494,http://almhuette-raith.at/administrator/,Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20...
2,46.72.177.4,2015-12-12 18:31:08+01:00,GET /administrator/ HTTP/1.1,200,4263,,Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20...
3,46.72.177.4,2015-12-12 18:31:08+01:00,POST /administrator/index.php HTTP/1.1,200,4494,http://almhuette-raith.at/administrator/,Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20...
4,83.167.113.100,2015-12-12 18:31:25+01:00,GET /administrator/ HTTP/1.1,200,4263,,Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20...


**Solutions**

**1. Find the 10 people who visited the site most frequently, show the information that identifies them, and state why you consider them frequent visitors.**

In [None]:
# Number of logged requests per (IP address, user agent) pair
user_groupby = (
    log_data
    .groupby(by=['Client_IP_Address', 'Client_Device'])[['Time_of_activity']]
    .count()
)

# Keep the ten pairs with the most requests and give the count a readable name
top_10_users_sorted = (
    user_groupby
    .sort_values(by='Time_of_activity', ascending=False)
    .head(10)
    .rename(columns={'Time_of_activity': 'Number_of_time_website_accessed'})
)

top_10_users_sorted.reset_index()

Unnamed: 0,Client_IP_Address,Client_Device,Number_of_time_website_accessed
0,205.167.170.15,Go-http-client/1.1,29516
1,79.142.95.122,Mozilla/5.0 (Windows NT 5.1; rv:29.0) Gecko/20...,3205
2,205.167.170.15,python-requests/1.2.3 CPython/2.7.5 Linux/3.14...,1733
3,37.1.206.196,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,1428
4,91.200.12.22,Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.9....,551
5,213.150.254.81,Mozilla/5.0 (compatible; MSIE 10.0; Windows NT...,434
6,84.112.161.41,Mozilla/5.0 (Linux; Android 5.0.2; HTC_One Bui...,414
7,205.167.170.15,Mozilla/5.0 (X11; Linux x86_64; rv:44.0) Gecko...,413
8,84.58.165.21,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; ...,381
9,52.22.118.215,Python-urllib/1.17,367


**2. Show at least five sessions and the page views per each session.**

In [None]:
# Groupby variables of interest
sessions_per_page = log_data.groupby(by=['Client_IP_Address', 'Client_Device', 'Time_of_activity', 'Requested_Page']).count()[['Status_Code']].reset_index()

# Drop "Status_Code" column
sessions_per_page.drop("Status_Code", axis=1, inplace=True)

In [None]:
# Show up to the first 20 rows of the grouped requests
sessions_per_page.head(20)

Unnamed: 0,Client_IP_Address,Client_Device,Time_of_activity,Requested_Page
0,1.0.176.215,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,2016-04-05 11:40:32+02:00,GET /apache-log/access.log HTTP/1.1
1,1.0.176.215,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,2016-04-05 11:40:55+02:00,GET /apache-log/access.log HTTP/1.1
2,1.0.176.215,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,2016-04-05 11:41:53+02:00,GET /apache-log/access.log HTTP/1.1
3,1.0.176.215,Opera/9.80 (Windows NT 6.2; Win64; x64) Presto...,2016-04-05 11:41:54+02:00,GET /wp-login.php HTTP/1.1
4,1.0.176.215,Opera/9.80 (Windows NT 6.2; Win64; x64) Presto...,2016-04-05 11:42:30+02:00,GET /apache-log/access.log HTTP/1.1
5,1.0.176.215,Opera/9.80 (Windows NT 6.2; Win64; x64) Presto...,2016-04-05 11:42:31+02:00,GET /index.php?option=com_easyblog&view=dashbo...
6,1.0.176.241,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2016-03-22 07:51:02+01:00,GET /apache-log/access.log HTTP/1.1
7,1.0.176.241,Opera/9.80 (Windows NT 6.2; Win64; x64) Presto...,2016-03-22 07:51:03+01:00,GET /wp-login.php HTTP/1.1
8,1.0.176.241,Opera/9.80 (Windows NT 6.2; Win64; x64) Presto...,2016-03-22 07:51:24+01:00,GET /apache-log/access.log HTTP/1.1
9,1.0.176.241,Opera/9.80 (Windows NT 6.2; Win64; x64) Presto...,2016-03-22 07:51:24+01:00,GET /index.php?option=com_easyblog&view=dashbo...


**3. Show five frequent pages which the users visit before visiting this particular web site.**

In [None]:
# Drop rows with NaN values
log_data.dropna(how='any', inplace=True)

# Adding "www." to "Referer_Page" urls without it
full_url = log_data[log_data['Referer_Page'].str.contains('www.') == False]['Referer_Page'].str.replace('http://', 'http://www.')

In [None]:
def make_full_url(url):
  """
  Returns `url` with "www." inserted after a leading "http://" when missing.

  url: Url representing the Referer_Page

  Example:
    url: "http://google.com"
    return: "http://www.google.com"

  Urls that already start with "http://www." and values that do not start
  with "http://" (other schemes, bare hosts, non-strings) are returned
  unchanged.
  """

  prefix = 'http://'

  if not isinstance(url, str):
    return url

  # Already complete, or not an "http://" url at all: hand it back untouched
  # (the previous `pass` branch returned None here and erased these referers).
  if not url.startswith(prefix) or url.startswith(prefix + 'www.'):
    return url

  # Rewrite only the leading scheme; an "http://" embedded later in the url
  # (e.g. inside a query string) must be left alone.
  return prefix + 'www.' + url[len(prefix):]

In [None]:
# Overwrite "Referer_Page" with the result of "make_full_url" for every row
log_data['Referer_Page'] = log_data['Referer_Page'].apply(make_full_url)

In [None]:
# Referer Pages by counts
referer_pages = pd.DataFrame(log_data['Referer_Page'].value_counts())

# Setting an index
referer_pages.reset_index(inplace=True)

# Changing column names
referer_pages.columns = ['Referer_Pages', 'Count']

In [None]:
# Ten most frequent referer pages
referer_pages.head(10)

Unnamed: 0,Referer_Pages,Count
0,http://www.almhuette-raith.at/administrator/,53668
1,http://www.almhuette-raith.at/,2170
2,http://www.almhuette-raith.at/administrator/in...,1630
3,http://www.almhuette-raith.at/wp-login.php,393
4,http://www.almhuette-raith.at/index.php?option...,328
5,https://search.yahoo.com/search=almhuette-rait...,243
6,https://www.bing.com/search?q=raith%20h%C3%BCt...,73
7,almhuette-raith.at,51
8,https://www.google.at/,51
9,http://www.top1-seo-service.com/try.php?u=http...,41


*Referer Websites*

In [None]:
def get_website(url):
  """
  Returns the home page (scheme and host, with a trailing slash) of a url.

  url: Url representing a referer page
  Example:
    url: 'http://www.google.com/images'
    return: 'http://www.google.com/'

  The scheme of `url` is kept; a url without a scheme is treated as "http://".
  Returns None when `url` is None, not a string, or has no recognisable host.
  """

  if not isinstance(url, str):
    return None

  # NOTE: `str.strip('http://')` must not be used here - it strips any of the
  # characters h, t, p, ':' and '/' from BOTH ends instead of removing a prefix.
  cleaned = url.strip()
  try:
    parts = urlsplit(cleaned)
    if not parts.netloc:
      # Without "//" urlsplit finds no host (e.g. 'example.com/page'), so
      # retry with the default scheme prepended.
      parts = urlsplit('http://' + cleaned)
  except ValueError:
    # urlsplit rejects some malformed hosts (e.g. unbalanced IPv6 brackets).
    return None

  if not parts.netloc:
    return None
  return (parts.scheme or 'http') + '://' + parts.netloc + '/'

print(get_website("http://www.almhuette-raith.at/wp-login.php"))

http://www.almhuette-raith.at/


In [None]:
# Home page of each referer page, stored in a new "Referer_Home_Page" column
log_data['Referer_Home_Page'] = log_data['Referer_Page'].apply(get_website)

In [None]:
# Most Frequent Referer Homes Pages
referer_websites = pd.DataFrame(log_data['Referer_Home_Page'].value_counts())

# Setting an index
referer_websites.reset_index(inplace=True)

# Changing column names
referer_websites.columns = ['Referer_Websites', 'Count']

In [None]:
# Twenty most frequent referer websites
referer_websites.head(20)

Unnamed: 0,Referer_Websites,Count
0,http://www.almhuette-raith.at/,56450
1,http://,2676
2,http://s:/,548
3,http://www.top1-seo-service.com/,41
4,http://www.booking.almenland.at/,40
5,http://www.v2.subscene.com/,21
6,http://www.whois.domaintools.com/,8
7,http://www.gazoblok.net.ua/,8
8,http://www.hyipmanager.in/,5
9,http://www.r.search.yahoo.com/,5


In [None]:
# Inspect the last 50 rows, including the derived "Referer_Home_Page" column
log_data.tail(50)

Unnamed: 0,Client_IP_Address,Time_of_activity,Requested_Page,Status_Code,Size_of_Page,Referer_Page,Client_Device,Referer_Home_Page
191641,213.150.1.137,2016-04-14 15:25:09+02:00,GET /images/phocagallery/almhuette/thumbs/phoc...,200,4264,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191642,213.150.1.137,2016-04-14 15:25:09+02:00,GET /images/phocagallery/almhuette/thumbs/phoc...,200,4064,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191643,213.150.1.137,2016-04-14 15:25:09+02:00,GET /images/phocagallery/almhuette/thumbs/phoc...,200,4268,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191644,213.150.1.137,2016-04-14 15:25:09+02:00,GET /components/com_phocagallery/assets/js/sha...,200,5236,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191645,213.150.1.137,2016-04-14 15:25:09+02:00,GET /components/com_phocagallery/assets/js/sha...,200,3495,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191646,213.150.1.137,2016-04-14 15:25:09+02:00,GET /components/com_phocagallery/assets/js/sha...,200,2337,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191647,213.150.1.137,2016-04-14 15:25:10+02:00,GET /components/com_phocagallery/assets/js/sha...,200,8324,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191648,213.150.1.137,2016-04-14 15:25:10+02:00,GET /components/com_phocagallery/assets/images...,200,174,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191649,213.150.1.137,2016-04-14 15:25:10+02:00,GET /components/com_phocagallery/assets/js/sha...,200,255,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
191650,213.150.1.137,2016-04-14 15:25:10+02:00,GET /components/com_phocagallery/assets/js/sha...,200,248,,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,
