# Generating web server logs

In [49]:
#Importing Libraries
import csv
import random
import datetime

In [50]:
# HTTP status codes a simulated response may carry, and the event pages a viewer may request
status_codes = [
    200, 201,            # success
    400, 401, 403, 404,  # client errors
    500,                 # server error
]
urls = [
    f"/{sport}/"
    for sport in (
        "cycling-road", "archery", "athletics", "boxing",
        "swimming", "judo", "karate", "tennis",
    )
]

In [51]:
# Generating random IP addresses
def generate_ip_address():
    """Return a random dotted-quad string; each of the four octets is drawn from 1-255."""
    octets = [str(random.randint(1, 255)) for _ in range(4)]
    return ".".join(octets)


In [52]:
# Generating random time
def generate_timestamp():
    """Return a random time of day from the last 24 hours, formatted ``HH:MM:SS``.

    The value is the current local time minus a random offset of 1-86400
    seconds. Only the time of day is kept; the date is discarded.
    """
    current_time = datetime.datetime.now()
    random_time = current_time - datetime.timedelta(seconds=random.randint(1, 86400))
    # No " %z" suffix: datetime.now() is naive, so %z would expand to an empty
    # string and leave a stray trailing space on every timestamp.
    return random_time.strftime("%H:%M:%S")

In [53]:
# Generating web server log entries
def generate_log_entry():
    """Build one synthetic log row: [ip, time, quoted request string, status code]."""
    # Draw order (IP, timestamp, URL, status) is kept so a seeded `random`
    # produces the same rows as before.
    client_ip = generate_ip_address()
    request_time = generate_timestamp()
    path = random.choice(urls)
    code = random.choice(status_codes)

    site_prefix = 'Https://www.Olymipcs.com/watch'
    verb = 'GET'
    # The request text is wrapped in literal double quotes, exactly as before.
    request = '"' + ' '.join((site_prefix, path, verb)) + '"'
    return [client_ip, request_time, request, code]

In [54]:
# How many synthetic log rows to write to the CSV file
num_entries = 3_000

In [55]:
# Generate log entries (one row per call; the loop index itself is not needed)
log_entries = [generate_log_entry() for _ in range(num_entries)]

In [56]:
# Saving web server logs as CSV: header row first, then one row per log entry
with open('web_server_logs.csv', 'w', newline='') as csv_file:
    log_writer = csv.writer(csv_file)
    log_writer.writerows(
        [['IP_Address', 'Timestamp', 'Request', 'Status'], *log_entries]
    )

# Data Ingestion

In [57]:
#Importing libraries
import pandas as pd

In [58]:
# Loading the generated log file into a DataFrame
df = pd.read_csv(filepath_or_buffer='web_server_logs.csv')

# Exploratory data analysis

In [59]:
# Previewing the first rows of the DataFrame
df.head()

Unnamed: 0,IP_Address,Timestamp,Request,Status
0,253.202.33.177,03:06:48,"""Https://www.Olymipcs.com/watch /judo/ GET""",404
1,98.197.184.19,23:53:04,"""Https://www.Olymipcs.com/watch /swimming/ GET""",400
2,252.97.91.52,13:14:08,"""Https://www.Olymipcs.com/watch /judo/ GET""",403
3,198.11.16.170,20:34:18,"""Https://www.Olymipcs.com/watch /cycling-road/...",401
4,235.231.210.8,02:37:45,"""Https://www.Olymipcs.com/watch /archery/ GET""",403


In [60]:
# Getting column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   IP_Address  3000 non-null   object
 1   Timestamp   3000 non-null   object
 2   Request     3000 non-null   object
 3   Status      3000 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 93.9+ KB


In [61]:
# Getting the DataFrame shape as (rows, columns)
df.shape

(3000, 4)

In [62]:
# Counting distinct values in each column
df.nunique()

IP_Address    3000
Timestamp     2949
Request          8
Status           7
dtype: int64

# Data cleaning

In [63]:
# Checking for duplicates (number of fully duplicated rows)
df.duplicated().sum()

0

In [64]:
# Checking for null values (count per column)
df.isna().sum()

IP_Address    0
Timestamp     0
Request       0
Status        0
dtype: int64

In [65]:
# Reducing each request string to just the event name.
# Raw values look like '"Https://www.Olymipcs.com/watch /judo/ GET"'. The pattern
# captures the path segment that is wrapped in slashes and surrounded by whitespace.
# Values that do not match (for example, ones already reduced to the event name)
# are kept as they are, so re-running this cell does not wipe the column.
df["Request"] = (
    df["Request"]
    .str.extract(r"\s/([^/\s]+)/\s", expand=False)
    .fillna(df["Request"])
)
df.head()

Unnamed: 0,IP_Address,Timestamp,Request,Status
0,253.202.33.177,03:06:48,judo,404
1,98.197.184.19,23:53:04,swimming,400
2,252.97.91.52,13:14:08,judo,403
3,198.11.16.170,20:34:18,cycling-road,401
4,235.231.210.8,02:37:45,archery,403


# Data transformation

In [66]:
# Pulling each viewer's country of origin from the IP address
import geoip2.database
import geoip2.errors


def get_country(ip):
    """Return the country name the GeoIP database holds for ``ip``, or None.

    Uses the module-level ``reader``, which must be open when this is called.
    None is returned when the address is not in the database or the string is
    not a valid IP address. Any other error (for example a wrong database
    type) is allowed to propagate instead of being silently hidden.
    """
    try:
        return reader.country(ip).country.name
    except (geoip2.errors.AddressNotFoundError, ValueError):
        return None


# The context manager closes the database even if the lookup pass raises.
with geoip2.database.Reader('GeoLite2-Country.mmdb') as reader:
    df['Country'] = df['IP_Address'].apply(get_country)

In [67]:
# Checking for the new Country column
df.head()

Unnamed: 0,IP_Address,Timestamp,Request,Status,Country
0,253.202.33.177,03:06:48,judo,404,
1,98.197.184.19,23:53:04,swimming,400,United States
2,252.97.91.52,13:14:08,judo,403,
3,198.11.16.170,20:34:18,cycling-road,401,United States
4,235.231.210.8,02:37:45,archery,403,


In [68]:
# Checking for null values once more, now that Country has been added
df.isna().sum()

IP_Address      0
Timestamp       0
Request         0
Status          0
Country       411
dtype: int64

In [71]:
# Adding a column with the hour of viewer activity
df['Timestamp'] = df['Timestamp'].pipe(pd.to_datetime)
df['Hour'] = df['Timestamp'].dt.strftime(date_format='%H')  # hour formatted as text via %H
df.head()

Unnamed: 0,IP_Address,Timestamp,Request,Status,Country,Hour
0,253.202.33.177,2023-08-30 03:06:48,judo,404,,3
1,98.197.184.19,2023-08-30 23:53:04,swimming,400,United States,23
2,252.97.91.52,2023-08-30 13:14:08,judo,403,,13
3,198.11.16.170,2023-08-30 20:34:18,cycling-road,401,United States,20
4,235.231.210.8,2023-08-30 02:37:45,archery,403,,2


In [72]:
# Removing rows whose country could not be resolved.
# Rebinding `df` instead of using inplace=True avoids mutating the frame that
# earlier cells displayed, and the cell can still be re-run safely.
df = df.dropna(subset=['Country'])
df.isna().sum()

IP_Address    0
Timestamp     0
Request       0
Status        0
Country       0
Hour          0
dtype: int64

# Visualizations

In [73]:
import hvplot.pandas
import holoviews as hv

In [74]:
# Top countries by number of requests
num_countries = 20
countries = df['Country'].value_counts().head(num_countries).hvplot.bar(
    title=f"Top {num_countries} countries",  # derived so the title cannot drift from num_countries
    rot=45,
    width=500,
    height=400,
)
countries

In [75]:
# Interactive, sortable table of the cleaned log data
table = df.hvplot.table(title='Sorting table')
table

In [76]:
# Most requested sporting events
num_events = 10
sport_events = df['Request'].value_counts().head(num_events).hvplot.scatter(
    title=f"Top {num_events} sporting events",  # derived so the title cannot drift from num_events
    rot=45,
    width=500,
    height=400,
)
sport_events

In [79]:
# Most popular viewing hours
view_times = 5
popular_hours = df['Hour'].value_counts().head(view_times).hvplot.bar(
    title=f'Top {view_times} hours of viewer activity',  # derived so the title cannot drift from view_times
    rot=45,
    height=500,
    width=400,
)
popular_hours