In [1]:
import time
from datetime import datetime, timedelta

import boto3
import pandas as pd
import psycopg2
import pytz
from sqlalchemy import create_engine, text
from user_agents import parse

In [13]:
# Object-store client for the endpoint that holds the hourly CSV partitions.
# NOTE(review): the endpoint and credentials are hardcoded; move them to
# environment variables or a secrets store before sharing this notebook.
s3 = boto3.resource("s3",
                    endpoint_url="http://172.26.0.3:9000",
                    aws_access_key_id="minioadmin",
                    aws_secret_access_key="miniopassword")
local_tz = pytz.timezone('Asia/Karachi')
now = datetime.now(local_tz)

# Target the previous hour. Subtracting a timedelta (instead of `now.hour - 1`)
# rolls the day/month/year back correctly when the notebook runs between
# 00:00 and 00:59; `normalize` keeps the pytz offset consistent after the shift.
target = local_tz.normalize(now - timedelta(hours=1))
year, month, day, hour = target.year, target.month, target.day, target.hour
prefix = f"year={year:04d}/month={month:02d}/day={day:02d}/hour={hour:02d}"

bucket_name = 'test-bucket'
bucket = s3.Bucket(bucket_name)

# Poll once a minute until at least one object exists under "<prefix>/FULL".
# Presumably this is a marker that the hour's partition is complete -- confirm
# with the producer. There is no timeout: the cell blocks until it appears.
while not any(bucket.objects.filter(Prefix=f'{prefix}/FULL')):
    print('Waiting for FULL file to appear...')
    time.sleep(60)

# Collect every CSV key in the partition.
objects = bucket.objects.filter(Prefix=prefix)
csv_files = [obj.key for obj in objects if obj.key.endswith('.csv')]

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the partition holds no CSV files.
if not csv_files:
    raise ValueError(f"No CSV files found under s3://{bucket_name}/{prefix}")

# Read each CSV straight from the S3 response body; the first column of every
# file is used as the index and then discarded by ignore_index=True.
dfs = [
    pd.read_csv(s3.Object(bucket_name, file_name).get()['Body'], index_col=0)
    for file_name in csv_files
]
combined_df = pd.concat(dfs, ignore_index=True)

# Parse each distinct user-agent string once and map the result back onto the
# rows, rather than re-parsing the same string for every row.
parsed_by_ua = {ua: parse(ua) for ua in combined_df['user_agent'].unique()}
ua_series = combined_df['user_agent'].map(parsed_by_ua.get)
combined_df['device_type'] = ua_series.apply(lambda ua: ua.device.family)
combined_df['browser'] = ua_series.apply(lambda ua: ua.browser.family)

In [14]:
# Display the combined frame, including the derived device_type and browser columns.
combined_df

Unnamed: 0,timestamp,user_cookie,site,user_agent,device_type,browser
0,2023-02-24T06:03:30.071036,59c3f2ef547e479cac354c9166e7f2b4,sport,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Other,Edge
1,2023-02-24T06:08:13.015223,f984861dfa1e4b029f49d84d6c5f597c,sport,"Mozilla/5.0 (iPhone14,3; U; CPU iPhone OS 15_0...",iPhone,Mobile Safari
2,2023-02-24T06:15:52.061433,2d417fb42ce24b8598cc640a8a6a76ae,business,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15....,Other,Firefox
3,2023-02-24T06:02:29.062569,e7ba3993063d4a4a9c35becd6581a375,business,Mozilla/5.0 (Linux; Android 10; SM-G996U Build...,Samsung SM-G996U,Android
4,2023-02-24T06:30:12.098848,6748e78aaa0f4cfcab398ab13c672556,main,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,Mac,Safari
...,...,...,...,...,...,...
546088,2023-02-24T06:41:07.078188,db74373971e3499384195541a4c692f4,fashion,Mozilla/5.0 (Linux; Android 10; SM-G996U Build...,Samsung SM-G996U,Android
546089,2023-02-24T06:58:49.031406,bb277afc0fa14e3da076a804e8af7fff,main,"Mozilla/5.0 (iPhone14,3; U; CPU iPhone OS 15_0...",iPhone,Mobile Safari
546090,2023-02-24T06:24:18.026248,2d417fb42ce24b8598cc640a8a6a76ae,health,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,Other,Chrome
546091,2023-02-24T06:41:08.090589,c2df738b738e46d78c30632b243d83fa,main,Mozilla/5.0 (Linux; Android 9; SM-G973U Build/...,Samsung SM-G973U,Chrome Mobile


In [15]:
# Aggregate per (site, device_type, browser) group:
#   user_cookie -> number of distinct cookie values in the group
#   timestamp   -> number of non-null timestamp values in the group
metrics_df = (
    combined_df
    .groupby(['site', 'device_type', 'browser'])
    .agg({'user_cookie': 'nunique', 'timestamp': 'count'})
)


In [16]:
# Turn the group keys (site, device_type, browser) back into ordinary columns.
# Guarded so that re-running this cell is a no-op: calling reset_index() a second
# time on the already-flat frame would add a spurious "index" column.
if isinstance(metrics_df.index, pd.MultiIndex):
    metrics_df = metrics_df.reset_index()

In [17]:
# Display the aggregated frame after the group keys were moved back into columns.
metrics_df

Unnamed: 0,site,device_type,browser,user_cookie,timestamp
0,business,Mac,Safari,5259,7146
1,business,Other,Chrome,5183,7083
2,business,Other,Edge,5156,7046
3,business,Other,Firefox,5209,7071
4,business,Samsung SM-G973U,Chrome Mobile,5287,7211
...,...,...,...,...,...
65,tech,Samsung SM-G996U,Android,5198,7128
66,tech,Samsung SM-S906N,Chrome Mobile WebView,5235,7179
67,tech,Spider,Googlebot,5154,7093
68,tech,iPhone,Chrome Mobile iOS,5276,7218


In [18]:
# Give the aggregated columns names that describe the metric they hold.
metrics_df = metrics_df.rename(
    {'user_cookie': 'unique_users', 'timestamp': 'page_views'},
    axis='columns',
)

In [19]:
# Display the aggregated frame with the renamed metric columns.
metrics_df

Unnamed: 0,site,device_type,browser,unique_users,page_views
0,business,Mac,Safari,5259,7146
1,business,Other,Chrome,5183,7083
2,business,Other,Edge,5156,7046
3,business,Other,Firefox,5209,7071
4,business,Samsung SM-G973U,Chrome Mobile,5287,7211
...,...,...,...,...,...
65,tech,Samsung SM-G996U,Android,5198,7128
66,tech,Samsung SM-S906N,Chrome Mobile WebView,5235,7179
67,tech,Spider,Googlebot,5154,7093
68,tech,iPhone,Chrome Mobile iOS,5276,7218


In [20]:
# Connection settings for the PostgreSQL target.
# NOTE(review): host and credentials are hardcoded; move them to environment
# variables or a secrets store before sharing this notebook.
db_host = "172.26.0.2"
db_name = "mydb"
db_user = "myuser"
db_password = "mypassword"
db_port = "5432"

# Destination table. This name was not defined anywhere in the notebook, so the
# cell raised NameError on a fresh kernel.
# TODO(review): "site_metrics" is a placeholder -- confirm the intended table name.
table_name = "site_metrics"

engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')

try:
    # engine.begin() commits the DDL on success and rolls back on error.
    # IF NOT EXISTS keeps the cell re-runnable, matching if_exists='append' below;
    # text() is required because newer SQLAlchemy rejects raw SQL strings.
    with engine.begin() as con:
        con.execute(text(
            f"CREATE TABLE IF NOT EXISTS {table_name} (site VARCHAR (255), device_type VARCHAR (255), browser VARCHAR (255), unique_users INTEGER, page_views INTEGER)"
        ))

    # Append the aggregated rows; the frame's index is not written as a column.
    metrics_df.to_sql(table_name, engine, if_exists='append', index=False)
finally:
    # Release pooled connections even if the create or the insert fails.
    engine.dispose()