In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv

load_dotenv()


def batch_load_sql(query: str, chunksize: int = 200000) -> pd.DataFrame:
    """Run ``query`` against the configured database and return the full result.

    The connection URL is read from the ``DATABASE`` environment variable.
    Rows are read through ``pd.read_sql`` in chunks of ``chunksize`` on a
    connection opened with ``stream_results=True``, then concatenated.

    Args:
        query: SQL text handed to ``pd.read_sql``.
        chunksize: Maximum number of rows per chunk read from the database.

    Returns:
        One DataFrame holding every chunk, re-indexed from 0.

    Raises:
        RuntimeError: If the ``DATABASE`` environment variable is not set.
    """
    database_url = os.getenv('DATABASE')
    if not database_url:
        # Fail early with a readable message instead of an opaque error
        # from create_engine(None).
        raise RuntimeError("Environment variable 'DATABASE' is not set")

    engine = create_engine(database_url)
    try:
        # The context manager closes the connection even when reading fails.
        with engine.connect().execution_options(stream_results=True) as conn:
            chunks = list(pd.read_sql(query, conn, chunksize=chunksize))
    finally:
        # A fresh engine is created on every call; dispose it so its
        # connection pool does not linger after the function returns.
        engine.dispose()
    return pd.concat(chunks, ignore_index=True)

def load_features(select) -> pd.DataFrame:
    """Execute the ``select`` statement via ``batch_load_sql`` and return its rows as a DataFrame."""
    features = batch_load_sql(select)
    return features


In [2]:
# Load every column of the post_text_df table into posts_df; this frame is
# the base that the feature columns below are attached to.
query = """
SELECT *
FROM post_text_df
"""
posts_df = load_features(query)

# Preview the first rows.
posts_df.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [3]:
# Compute the total number of likes and the like rate for every post

In [4]:
# Number of 'like' actions per post, indexed by post_id.
# COUNT(*) is aliased explicitly so that the 'count' column read by the
# following cells does not depend on the database's automatic naming of
# aggregate columns.
query = """
SELECT COUNT(*) AS count, post_id
FROM feed_data
WHERE action = 'like'
GROUP BY post_id;
"""
posts_likes = load_features(query).set_index('post_id')

In [5]:
# Look up each post's like count by post_id; posts that have no row in
# posts_likes come out as NaN.
posts_df['total_likes'] = posts_df['post_id'].map(posts_likes['count'])

In [6]:
# Number of feed actions of any kind per post, indexed by post_id.
# COUNT(*) is aliased explicitly so that the 'count' column read by the
# following cell does not depend on the database's automatic naming of
# aggregate columns.
query = """
SELECT COUNT(*) AS count, post_id
FROM feed_data
GROUP BY post_id;
"""
posts_actions = load_features(query).set_index('post_id')

In [7]:
# like_rate = likes / all actions, aligned on post_id and then mapped
# onto the posts frame.
posts_df['like_rate'] = posts_df['post_id'].map(
    posts_likes['count'].div(posts_actions['count'])
)

In [8]:
# Missing values per column after attaching the like features.
posts_df.isna().sum()

post_id          0
text             0
topic            0
total_likes    192
like_rate      192
dtype: int64

In [9]:
# Replace missing like features with the mean of the respective column.
# NOTE(review): a post missing from the likes aggregate may simply have zero
# likes; mean imputation is kept as written — confirm this is intended.
for column in ('like_rate', 'total_likes'):
    posts_df[column] = posts_df[column].fillna(posts_df[column].mean())

In [10]:
# Add feature - average age of the users who liked each post

In [11]:
# Average age of the users who liked each post, indexed by post_id.
# AVG(...) is aliased explicitly so that the 'avg' column read by the next
# cell does not depend on the database's automatic naming of aggregate
# columns, and `action` is qualified with the feed_data alias so the filter
# stays unambiguous across the three joined tables.
query = """
SELECT AVG(u.age) AS avg, p.post_id
FROM feed_data as f
JOIN user_data as u
ON f.user_id = u.user_id
JOIN post_text_df as p
ON f.post_id = p.post_id
WHERE f.action = 'like'
GROUP BY p.post_id
"""
posts_age_likes = load_features(query).set_index('post_id')

In [12]:
# Look up each post's average liker age by post_id; posts that have no row
# in posts_age_likes come out as NaN.
posts_df['avg_age_likes'] = posts_df['post_id'].map(posts_age_likes['avg'])

In [13]:
# Missing values per column after attaching the average liker age.
posts_df.isna().sum()

post_id            0
text               0
topic              0
total_likes        0
like_rate          0
avg_age_likes    192
dtype: int64

In [14]:
# Fill posts without an average liker age with the mean over all posts.
avg_age_fill = posts_df['avg_age_likes'].mean()
posts_df['avg_age_likes'] = posts_df['avg_age_likes'].fillna(avg_age_fill)

In [15]:
# Add feature - total length of each post (in characters)

In [16]:
# Length of every post's text as computed by the database's LENGTH(),
# indexed by post_id.
query = """
SELECT post_id, LENGTH(text) AS post_len
FROM post_text_df;
"""
posts_len = load_features(query).set_index('post_id')

In [17]:
# Attach the text length to each post by post_id.
posts_df['post_len'] = posts_df['post_id'].map(posts_len['post_len'])

In [18]:
# Display the posts frame with the features added so far.
posts_df

Unnamed: 0,post_id,text,topic,total_likes,like_rate,avg_age_likes,post_len
0,1,UK economy facing major risks\n\nThe UK manufa...,business,1067.0,0.111436,29.099344,1967
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,637.0,0.078333,31.910518,2701
2,3,Asian quake hits European shares\n\nShares in ...,business,1122.0,0.117684,29.262923,3408
3,4,India power shares jump on debut\n\nShares in ...,business,1171.0,0.125523,29.084543,1026
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,1153.0,0.118426,29.479618,889
...,...,...,...,...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie,2619.0,0.133903,22.554792,803
7019,7316,I give this movie 2 stars purely because of it...,movie,677.0,0.093392,31.441654,800
7020,7317,I cant believe this film was allowed to be mad...,movie,731.0,0.097027,31.885089,636
7021,7318,The version I saw of this film was the Blockbu...,movie,680.0,0.091092,32.423529,728


In [19]:
# Add feature - average sentence length (word count / sentence count)

In [19]:
# Word count per post: collapse every whitespace run to a single space, trim,
# split on spaces and take the length of the resulting array.
# The regexp_replace argument is the table's `text` column (the previous
# `text_column` placeholder does not exist in post_text_df).
# A raw string is used so the regex backslash in '\s+' is passed through
# verbatim instead of being an invalid Python escape sequence.
query = r"""
SELECT
    post_id,
    text,
    array_length(regexp_split_to_array(trim(regexp_replace(text, '\s+', ' ', 'g')), ' '), 1)
    AS word_count
FROM post_text_df
"""