In [1]:
# Python
import os
import re  

# Connection to the DB
from sqlalchemy import create_engine

# Data handling and text preprocessing  
import pandas as pd  

# Machine learning and feature extraction  
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.ensemble import RandomForestRegressor  
from sklearn.metrics import mean_squared_error

In [2]:
USER = os.getenv('POSTGRES_USER')
PASSWORD = os.getenv('POSTGRES_PASSWORD')
DATABASE = os.getenv('POSTGRES_WAREHOUSE_DB')
HOST = os.getenv('POSTGRES_HOST')
HOST_PORT = os.getenv('POSTGRES_PORT')

print(
    f"""
    USER: {USER}
    PASSWORD: {PASSWORD}
    DATABASE: {DATABASE}
    HOST: {HOST}
    HOST_PORT: {HOST_PORT}
    """
)


    USER: postgres
    PASSWORD: password123
    DATABASE: smogsense
    HOST: postgres
    HOST_PORT: 5432
    


In [3]:
db_address = f'postgresql://{USER}:{PASSWORD}@{HOST}:{HOST_PORT}/{DATABASE}'
conn = create_engine(db_address)

In [4]:
# --- Data Extraction ---
# SQL query: join tweets with their engagement metrics
query = """
SELECT t.text, 
       e.like_count, 
       e.retweet_count, 
       e.quote_count, 
       e.reply_count, 
       e.view_count
FROM social_media_dim.tweet t
JOIN social_media_dim.engagement e ON t.id_tweet = e.id_tweet
"""
df = pd.read_sql(query, conn)

In [5]:
# Define the engagement calculation function
def calculate_engagement(row):
    """
    Calculate a weighted engagement score for a given row of tweet data.
    
    Parameters:
        row (pd.Series): A row of the DataFrame containing tweet metrics.
    
    Returns:
        float: The weighted engagement score.
    """
    return (
        row['like_count'] * 1 +          # Basic approval (lowest weight)
        row['retweet_count'] * 5 +        # Amplification through sharing
        row['quote_count'] * 10 +         # Highest value - adds commentary & reach
        row['reply_count'] * 3 +          # Direct conversation engagement
        row['view_count'] * 0.001         # Passive impression (minimal weight)
    )

# Apply the function to the DataFrame
df['engagement'] = df.apply(calculate_engagement, axis=1)

In [6]:
df.sample(5)

Unnamed: 0,text,like_count,retweet_count,quote_count,reply_count,view_count,engagement
121,@Pawel_Lachman Mamy kryzys jakiejkolwiek formy...,4,1,0,1,194,12.194
45,"""...nasz inteligentny smog zatrzymuje się na g...",0,0,0,5,22,15.022
43,Trwa masakrowanie Europy #smog-iem przez Belgi...,1,3,1,0,196,26.196
1,Dlaczego auta elektryczne są dobre? Bo poprawi...,47,4,2,18,5262,146.262
53,Nowy Rok. Warszawa. \nDla pasażerów transportu...,138,4,0,14,4210,204.21


In [7]:
from user_code.stopwords import polish_stopwords
print(len(polish_stopwords))

382


In [9]:
# --- Text Preprocessing ---
# Clean the tweet text: lowercase and remove punctuation
df['cleaned_text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

# --- Feature Extraction: TF-IDF Vectorization ---
vectorizer = TfidfVectorizer(stop_words=polish_stopwords, max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['engagement']

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train.shape[0]

112

In [13]:
X_test.shape[0]

28

In [8]:
# --- Model Training: Regression ---
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Model Evaluation ---
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# --- Define Engagement Brackets ---
def categorize_engagement(engagement_score):
    if engagement_score < 100:
        return "Low"
    elif 100 <= engagement_score < 500:
        return "Medium"
    else:
        return "High"

# --- Example Prediction ---
sample_tweet = "Krakowska maszyna do produkcji smogu! Szczotki powietrza w akcji."
sample_cleaned = re.sub(r'[^\w\s]', '', sample_tweet.lower())
sample_vector = vectorizer.transform([sample_cleaned])
predicted_engagement = model.predict(sample_vector)[0]
engagement_bracket = categorize_engagement(predicted_engagement)

print("Predicted Engagement:", predicted_engagement)
print("Engagement Bracket:", engagement_bracket)

Mean Squared Error: 315518.88385155675
Predicted Engagement: 4.102489999999996
Engagement Bracket: Low
