In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas warnings that can point
# at real problems; consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Load the feature file. header=None tells pandas the CSV has no header row,
# so the columns start out with integer labels.
data = pd.read_csv("Features_Variant_1.csv", header=None)

In [18]:
# Column labels for the headerless CSV, listed in file order (54 names in total).
_weekday_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

column_names_updated = (
    # Page-level attributes
    ["Page_Popularity_Likes", "Page_Checkins", "Page_Talking_About", "Page_Category"]
    # Derived_1 ... Derived_25
    + [f"Derived_{i}" for i in range(1, 26)]
    # Comment counts and post-level attributes
    + [
        "CC1_Total_Comments", "CC2_Comments_Last_24h", "CC3_Comments_48_to_24h",
        "CC4_Comments_First_24h", "CC5_CC2_CC3_Difference", "Base_Time",
        "Post_Length", "Post_Share_Count", "Post_Promotion_Status", "H_Local",
    ]
    # One-hot weekday flags: publication day, then base-date day
    + [f"Published_{day}" for day in _weekday_order]
    + [f"BaseDate_{day}" for day in _weekday_order]
    # Prediction target
    + ["Target_Comment_Volume"]
)

data.columns = column_names_updated

In [19]:
# Remove two columns in a single call: Post_Promotion_Status (noted as always 0)
# and CC1_Total_Comments.
data = data.drop(columns=["Post_Promotion_Status", "CC1_Total_Comments"])

In [20]:
# One-hot flag column -> weekday name, for the publication day and the base-date day.
# Key order (Sunday first) is kept because idxmax breaks ties by column order.
_weekday_names = ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")

published_day_mapping = {f"Published_{name}": name for name in _weekday_names}
basedate_day_mapping = {f"BaseDate_{name}": name for name in _weekday_names}

published_flag_cols = list(published_day_mapping)
basedate_flag_cols = list(basedate_day_mapping)

# Collapse each group of seven flags into one weekday-name column.
# NOTE: idxmax returns the first column holding the row maximum, so a row with
# no flag set would be labelled Sunday.
published_flag_label = data[published_flag_cols].idxmax(axis=1)
data['Published_Day'] = published_flag_label.map(published_day_mapping)

basedate_flag_label = data[basedate_flag_cols].idxmax(axis=1)
data['BaseDate_Day'] = basedate_flag_label.map(basedate_day_mapping)

# The flag columns are now redundant.
data.drop(columns=published_flag_cols + basedate_flag_cols, inplace=True)

# Cyclic (sin/cos) encoding is used instead of one-hot: it needs two columns per
# weekday field and places Sunday right next to Monday on the circle.
def encode_day(day_name):
    """Map a weekday name to its (sin, cos) position on a 7-day circle.

    Parameters
    ----------
    day_name : str
        Full English weekday name, e.g. ``'Monday'`` (case-sensitive).

    Returns
    -------
    tuple
        ``(sin(2*pi*i/7), cos(2*pi*i/7))`` where ``i`` runs from 0 for Monday
        to 6 for Sunday.

    Raises
    ------
    ValueError
        If ``day_name`` is not one of the seven weekday names. Previously this
        surfaced as an opaque ``TypeError`` from arithmetic on ``None``.
    """
    day_map = {
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2,
        'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6
    }
    day_idx = day_map.get(day_name)
    if day_idx is None:
        raise ValueError(
            f"Unknown day name {day_name!r}; expected one of {list(day_map)}"
        )
    # Compute the angle once and reuse it for both components.
    angle = 2 * np.pi * day_idx / 7
    return (np.sin(angle), np.cos(angle))

# Replace each weekday-name column with its sine/cosine pair.
for col in ('Published_Day', 'BaseDate_Day'):
    sin_cos_pairs = [encode_day(day) for day in data[col]]
    encoded = pd.DataFrame(sin_cos_pairs, index=data.index)
    data[[f"{col}_Sin", f"{col}_Cos"]] = encoded

# The name columns were only an intermediate step.
data.drop(columns=['Published_Day', 'BaseDate_Day'], inplace=True)

In [21]:
# Hourly comment rate: CC2_Comments_Last_24h divided by Base_Time limited to the
# range [1, 24]. The lower bound of 1 prevents division by zero.
data['Average_Comment_Last24h'] = data['CC2_Comments_Last_24h'].div(
    data['Base_Time'].clip(lower=1, upper=24)
)

In [22]:
# Write the processed frame to a pickle file in the current working directory.
data.to_pickle('raw_features.pkl')