In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from pathlib import Path
import yaml

In [3]:
# Settings file; a relative path, so it is resolved against the current working directory.
CONFIG_PATH = "config.yaml"

# NOTE(review): yaml.FullLoader is kept as-is. If config.yaml could ever come
# from an untrusted source, switch to yaml.safe_load instead.
with Path(CONFIG_PATH).open("r", encoding="utf-8") as config_file:
    CONFIG = yaml.load(config_file, Loader=yaml.FullLoader)

# Engine built from the `database_url` config entry; used as the default
# connection by `select`.
CON = create_engine(CONFIG['database_url'])

def select(query, con=CON):
    """Run a SQL query and return its result.

    Args:
        query: SQL passed to ``pd.read_sql``.
        con: Connection or engine to execute against; defaults to the
            module-level ``CON``.

    Returns:
        The value returned by ``pd.read_sql`` for ``query``.
    """
    frame = pd.read_sql(sql=query, con=con)
    return frame

In [4]:
# Load the whole `public.user` table (all columns, no row filter).
q = """
SELECT *
FROM public.user
"""

user_data = select(query=q)

In [5]:
# Preview the first five user rows.
user_data.head(n=5)

Unnamed: 0,id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


In [4]:
# Save the user table to CSV in the working directory, without the index column.
user_data.to_csv(path_or_buf='user_data.csv', index=False)

In [5]:
# Load the whole `public.post` table (all columns, no row filter).
q = """
SELECT *
FROM public.post
"""

post_data = select(query=q)

In [6]:
# Save the post table to CSV in the working directory, without the index column.
post_data.to_csv(path_or_buf='post_data.csv', index=False)

In [7]:
# Number of most recent 'view' events kept per user.
VIEWS_PER_USER = 35

# ROW_NUMBER() ranks each user's 'view' events newest-first, and only the top
# VIEWS_PER_USER ranks are kept.
# `post_id` is a secondary sort key: without it, rows sharing a timestamp are
# ranked in arbitrary order, so the rows that survive the cut-off could change
# between runs. Ties on (timestamp, post_id) for one user remain arbitrary.
# The interpolated value is a local integer constant, not user input.
q = f"""
WITH q1 (timestamp, user_id, post_id, action, target, num) AS
(
    SELECT
        timestamp, user_id, post_id, action, target,
        ROW_NUMBER() OVER (
            PARTITION BY user_id
            ORDER BY timestamp DESC, post_id
        ) AS num
    FROM public.feed_data
    WHERE action = 'view'
)
SELECT timestamp, user_id, post_id, target
FROM q1
WHERE num <= {int(VIEWS_PER_USER)}
"""
feed_data = select(q)

In [8]:
# Save the per-user view sample to CSV in the working directory, without the index column.
feed_data.to_csv(path_or_buf='feed_data.csv', index=False)

In [9]:
# Preview the first five feed rows.
feed_data.head(n=5)

Unnamed: 0,timestamp,user_id,post_id,target
0,2021-12-29 15:24:59,200,1773,0
1,2021-12-29 15:24:31,200,2213,1
2,2021-12-29 15:23:54,200,1122,0
3,2021-12-29 15:23:29,200,1362,0
4,2021-12-29 15:21:53,200,1541,0


In [10]:
# Attach user attributes and then post attributes to every feed row.
# - how='inner' is the pandas default, spelled out because feed rows with no
#   matching user or post are dropped from the result.
# - validate='many_to_one' makes pandas raise MergeError if an id is duplicated
#   on the user/post side, instead of silently multiplying feed rows.
data = (
    feed_data
    .merge(
        user_data.rename(columns={'id': 'user_id'}),
        on='user_id',
        how='inner',
        validate='many_to_one',
    )
    .merge(
        post_data.rename(columns={'id': 'post_id'}),
        on='post_id',
        how='inner',
        validate='many_to_one',
    )
)

In [11]:
# Save the merged feed/user/post table to CSV in the working directory, without the index column.
data.to_csv(path_or_buf='data.csv', index=False)