# Feature Engineering
In this notebook, we present our final feature engienering proposals joining together the notebooks from Joel and Emre. In here, we propose differnt choices where each choice has different combinations of columns from the original database.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

Here we define columns fo rthe dataset

In [2]:
# emre column names
column_names_updated = [
    "Page_Popularity_Likes", "Page_Checkins", "Page_Talking_About", "Page_Category",
    "Derived_1", "Derived_2", "Derived_3", "Derived_4", "Derived_5",
    "Derived_6", "Derived_7", "Derived_8", "Derived_9", "Derived_10",
    "Derived_11", "Derived_12", "Derived_13", "Derived_14", "Derived_15",
    "Derived_16", "Derived_17", "Derived_18", "Derived_19", "Derived_20",
    "Derived_21", "Derived_22", "Derived_23", "Derived_24", "Derived_25",
    "CC1_Total_Comments", "CC2_Comments_Last_24h", "CC3_Comments_48_to_24h",
    "CC4_Comments_First_24h", "CC5_CC2_CC3_Difference", "Base_Time",
    "Post_Length", "Post_Share_Count", "Post_Promotion_Status", "H_Local",
    "Published_Sunday", "Published_Monday", "Published_Tuesday",
    "Published_Wednesday", "Published_Thursday", "Published_Friday",
    "Published_Saturday", "BaseDate_Sunday", "BaseDate_Monday",
    "BaseDate_Tuesday", "BaseDate_Wednesday", "BaseDate_Thursday",
    "BaseDate_Friday", "BaseDate_Saturday", "Target_Comment_Volume"
]

columns = [
    'Page Popularity/likes', 'Page Checkins', 'Page talking about', 'Page Category', 
    'min1', 'min2', 'min3', 'min4', 'min5',
    'max1', 'max2', 'max3', 'max4', 'max5',
    'avg1', 'avg2', 'avg3', 'avg4', 'avg5',
    'median1', 'median2', 'median3', 'median4', 'median5',
    'std1', 'std2', 'std3', 'std4', 'std5',
    'CC1', 'CC2', 'CC3', 'CC4', 'CC5', 
    'Base time', 'Post length', 'Post Share Count', 'Post Promotion Status',
    'H Local', 
    'Post Published Sunday', 'Post Published Monday', 'Post Published Tuesday',  
    'Post Published Wednesday', 'Post Published Thursday', 
    'Post Published Friday', 'Post Published Saturday', 
    'Base DateTime Sunday', 'Base DateTime Monday', 'Base DateTime Tuesday',
    'Base DateTime Wednesday', 'Base DateTime Thursday', 
    'Base DateTime Friday', 'Base DateTime Saturday', 
    'Target Variable'
]
# joel column names

In [4]:
path_to_dataset = "../Dataset/Training/Features_Variant_5.csv"

data = pd.read_csv(path_to_dataset, header=None, names=column_names_updated)
df = pd.read_csv(path_to_dataset, sep=',', header=None, names=columns)

## Proposal 1

In [5]:
published_day_mapping = {
    "Published_Sunday": "Sunday",
    "Published_Monday": "Monday",
    "Published_Tuesday": "Tuesday",
    "Published_Wednesday": "Wednesday",
    "Published_Thursday": "Thursday",
    "Published_Friday": "Friday",
    "Published_Saturday": "Saturday"
}

basedate_day_mapping = {
    "BaseDate_Sunday": "Sunday",
    "BaseDate_Monday": "Monday",
    "BaseDate_Tuesday": "Tuesday",
    "BaseDate_Wednesday": "Wednesday",
    "BaseDate_Thursday": "Thursday",
    "BaseDate_Friday": "Friday",
    "BaseDate_Saturday": "Saturday"
}

data['Published_Day'] = data[[*published_day_mapping.keys()]].idxmax(axis=1).map(published_day_mapping)
data['BaseDate_Day'] = data[[*basedate_day_mapping.keys()]].idxmax(axis=1).map(basedate_day_mapping)
data.drop(columns=list(published_day_mapping.keys()) + list(basedate_day_mapping.keys()), inplace=True)

In [6]:
# Drop Post Promotion Status always 0
data = data.drop("Post_Promotion_Status", axis=1)

In [7]:
# Drop CC1 since it's highly correlated with CC4 also remove its derived features 
data = data.drop("CC1_Total_Comments", axis=1)
data = data.drop("Derived_1", axis=1)
data = data.drop("Derived_2", axis=1)
data = data.drop("Derived_3", axis=1)
data = data.drop("Derived_4", axis=1)
data = data.drop("Derived_5", axis=1)

In [8]:
# We think that min, max and median CCs values are unnecessary
data = data.drop("Derived_6", axis=1)
data = data.drop("Derived_7", axis=1)
data = data.drop("Derived_9", axis=1)
data = data.drop("Derived_11", axis=1)
data = data.drop("Derived_12", axis=1)
data = data.drop("Derived_14", axis=1)
data = data.drop("Derived_16", axis=1)
data = data.drop("Derived_17", axis=1)
data = data.drop("Derived_19", axis=1)
data = data.drop("Derived_21", axis=1)
data = data.drop("Derived_22", axis=1)
data = data.drop("Derived_24", axis=1)

In [9]:
# I think cyclic encoding is more efficient way than one-hot encoding
def encode_day(day_name):
    day_map = {
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2,
        'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6
    }
    day_idx = day_map.get(day_name, None)
    return (
        np.sin(2 * np.pi * day_idx / 7),
        np.cos(2 * np.pi * day_idx / 7)
    ) 

for col in ['Published_Day', 'BaseDate_Day']:
    data[[f"{col}_Sin", f"{col}_Cos"]] = pd.DataFrame(
        data[col].apply(encode_day).tolist(),
        index=data.index
    )

data.drop(['Published_Day', 'BaseDate_Day'], axis=1, inplace=True)

In [10]:
# Post Length can be treated as a categorical feature and use value encoding
bins = [0, 10, 50, 100, 200, 500, 1000, 5000, np.inf]
labels = [
    '0-10 (micro)', 
    '11-50 (short)', 
    '51-100 (medium)', 
    '101-200 (long)', 
    '201-500 (essay)', 
    '501-1000 (article)', 
    '1001-5000 (long_doc)', 
    '5000+ (extreme)'
]

data['Post_Length_Category'] = pd.cut(
    data['Post_Length'],
    bins=bins,
    labels=labels,
    include_lowest=True
)

ordinal_mapping = {
    '0-10 (micro)': 1,
    '11-50 (short)': 2,
    '51-100 (medium)': 3,
    '101-200 (long)': 4,
    '201-500 (essay)': 5,
    '501-1000 (article)': 6,
    '1001-5000 (long_doc)': 7, 
    '5000+ (extreme)': 8
}
data['Post_Length_Encoded'] = data['Post_Length_Category'].map(ordinal_mapping).astype('int64')

data = data.drop("Post_Length_Category", axis=1)
data = data.drop("Post_Length", axis=1)

In [11]:
# This column calculates the average number of comments per hour in the last 24 hours before the Base Time
data['Average_Comment_Last24h'] = (
    data['CC2_Comments_Last_24h'] / 
    np.maximum(np.minimum(data['Base_Time'], 24), 1) 
)

In [12]:
# The percent_change column quantifies the percentage growth or decline in engagement between two time windows, depending on the post's age (Base_Time)
data['CC3_safe'] = np.maximum(data['CC3_Comments_48_to_24h'], 1) 
data['CC4_safe'] = np.maximum(data['CC4_Comments_First_24h'], 1)

data['engagement_ratio'] = np.where(
    data['Base_Time'] >= 48,
    np.log1p(data['CC2_Comments_Last_24h']) - np.log1p(data['CC3_safe']),  # log(CC2/CC3)
    np.where(
        data['Base_Time'] >= 24,
        np.log1p(data['CC2_Comments_Last_24h']) - np.log1p(data['CC4_safe']),  # log(CC2/CC4)
        0
    )
)
data['percent_change'] = np.expm1(data['engagement_ratio']) * 100

data = data.drop("CC3_safe", axis=1)
data = data.drop("CC4_safe", axis=1)
data = data.drop("engagement_ratio", axis=1)

In [13]:
# I think after these 2 columns we don't need CC5, these columns created are alternative to CC5
data = data.drop("CC5_CC2_CC3_Difference", axis=1)

In [14]:
target_col = data.pop('Target_Comment_Volume')
data['Target_Comment_Volume'] = target_col

In [15]:
data.head()

Unnamed: 0,Page_Popularity_Likes,Page_Checkins,Page_Talking_About,Page_Category,Derived_8,Derived_10,Derived_13,Derived_15,Derived_18,Derived_20,...,Post_Share_Count,H_Local,Published_Day_Sin,Published_Day_Cos,BaseDate_Day_Sin,BaseDate_Day_Cos,Post_Length_Encoded,Average_Comment_Last24h,percent_change,Target_Comment_Volume
0,634995,0,463,1,4.987786,41.250786,5.850382,50.548463,11.080916,75.493579,...,2,24,0.974928,-0.222521,0.433884,-0.900969,4,0.0,-50.0,0
1,634995,0,463,1,4.987786,41.250786,5.850382,50.548463,11.080916,75.493579,...,1,24,0.433884,-0.900969,-0.781831,0.62349,4,0.0,-50.0,0
2,634995,0,463,1,4.987786,41.250786,5.850382,50.548463,11.080916,75.493579,...,2,24,-0.433884,-0.900969,-0.974928,-0.222521,4,0.0,-50.0,0
3,634995,0,463,1,4.987786,41.250786,5.850382,50.548463,11.080916,75.493579,...,1,24,-0.433884,-0.900969,0.0,1.0,4,0.0,-50.0,0
4,634995,0,463,1,4.987786,41.250786,5.850382,50.548463,11.080916,75.493579,...,5,24,0.0,1.0,0.433884,-0.900969,4,0.0,-50.0,0


In [16]:
data.dtypes

Page_Popularity_Likes        int64
Page_Checkins                int64
Page_Talking_About           int64
Page_Category                int64
Derived_8                  float64
Derived_10                 float64
Derived_13                 float64
Derived_15                 float64
Derived_18                 float64
Derived_20                 float64
Derived_23                 float64
Derived_25                 float64
CC2_Comments_Last_24h        int64
CC3_Comments_48_to_24h       int64
CC4_Comments_First_24h       int64
Base_Time                    int64
Post_Share_Count             int64
H_Local                      int64
Published_Day_Sin          float64
Published_Day_Cos          float64
BaseDate_Day_Sin           float64
BaseDate_Day_Cos           float64
Post_Length_Encoded          int64
Average_Comment_Last24h    float64
percent_change             float64
Target_Comment_Volume        int64
dtype: object

## Proposal 2
1. Drop 'Post promotion status' because has only one value
2. Droping min, max, because i think that average and std should be enough. Or can try with median instead of average, if have many outliers.

In [19]:
features1 = df.copy()
features1 = df.drop("Post Promotion Status", axis='columns') # Has only zeros
for i in range(1,6):
    features1 = features1.drop(f'min{i}', axis='columns')
for i in range(1,6):
    features1 = features1.drop(f'max{i}', axis='columns')
for i in range(1,6):
    features1 = features1.drop(f'median{i}', axis='columns')
features1.head()

Unnamed: 0,Page Popularity/likes,Page Checkins,Page talking about,Page Category,avg1,avg2,avg3,avg4,avg5,std1,...,Post Published Friday,Post Published Saturday,Base DateTime Sunday,Base DateTime Monday,Base DateTime Tuesday,Base DateTime Wednesday,Base DateTime Thursday,Base DateTime Friday,Base DateTime Saturday,Target Variable
0,634995,0,463,1,0.0,844.0,5.850382,0.0,50.548463,-505.0,...,0,0,0,0,0,0,1,0,0,0
1,634995,0,463,1,0.0,844.0,5.850382,0.0,50.548463,-505.0,...,0,0,1,0,0,0,0,0,0,0
2,634995,0,463,1,0.0,844.0,5.850382,0.0,50.548463,-505.0,...,1,0,0,0,0,0,0,0,1,0
3,634995,0,463,1,0.0,844.0,5.850382,0.0,50.548463,-505.0,...,1,0,0,1,0,0,0,0,0,0
4,634995,0,463,1,0.0,844.0,5.850382,0.0,50.548463,-505.0,...,0,0,0,0,0,0,1,0,0,0


## Proposal 3
- Here we start with temporal features which are the main features that the model uses
- Then we join the 'days of the week' which is the time the post was published and then 'day of week base' which is the days that were used as based. So, we convert 14 columns into 2
- Create three extra variables based on CC1(2/3/4/5)

In [20]:
# Start with selected original features
temporal_features = [
    'Page Popularity/likes', 'Page talking about', 'Page Category',
    'CC1', 'CC2', 'CC3', 'CC4', 'CC5', 
    'Base time', 'Post length', 'Post Share Count', 'H Local'
]

df2 = df[temporal_features].copy()

# Reduce the days of the weeks and make them categorical: 
df2['comment_velocity'] = df['CC1'] / (df['Base time'] + 1) # What it measures: The average rate of comments per hour since publication.
df2['day_of_week'] = df[['Post Published Sunday', 'Post Published Monday', 'Post Published Tuesday', 
                         'Post Published Wednesday', 'Post Published Thursday', 'Post Published Friday', 
                         'Post Published Saturday']].idxmax(axis=1)
df2['day_of_week_base'] = df[['Base DateTime Sunday', 'Base DateTime Monday', 'Base DateTime Tuesday',
                             'Base DateTime Wednesday', 'Base DateTime Thursday', 'Base DateTime Friday', 
                             'Base DateTime Saturday']].idxmax(axis=1)

df2['comment_momentum'] = df['CC2'] / (df['CC3'] + 1) # The ratio of very recent comments (last 24 hours) to previous period comments (24-48 hours ago)
df2['first_day_ratio'] = df['CC4'] / (df['CC1'] + 1) # What it measures: The proportion of total comments that came in during the first 24 hours after publishing.
df2['decay_factor'] = (df['CC1'] - df['CC4']) / (df['Base time'] - 24 + 1) # What it measures: The rate of new comments after the first 24 hours (comments per hour).


# Convert categorical features to numeric using label encoding
df2['day_of_week'] = df2['day_of_week'].astype('category').cat.codes
df2['day_of_week_base'] = df2['day_of_week_base'].astype('category').cat.codes

df2.head()

Unnamed: 0,Page Popularity/likes,Page talking about,Page Category,CC1,CC2,CC3,CC4,CC5,Base time,Post length,Post Share Count,H Local,comment_velocity,day_of_week,day_of_week_base,comment_momentum,first_day_ratio,decay_factor
0,634995,463,1,0,0,0,0,0,25,166,2,24,0.0,6,4,0.0,0.0,0.0
1,634995,463,1,0,0,0,0,0,70,132,1,24,0.0,4,3,0.0,0.0,0.0
2,634995,463,1,0,0,0,0,0,26,133,2,24,0.0,0,2,0.0,0.0,0.0
3,634995,463,1,7,0,0,7,0,67,131,1,24,0.102941,0,1,0.0,0.875,0.0
4,634995,463,1,1,0,0,1,0,65,142,5,24,0.015152,1,4,0.0,0.5,0.0


## Propose 4
Having all the dataset, except post promotion status which doesn't contain any usefull information

In [22]:
features3 = df.copy()
features3 = df.drop("Post Promotion Status", axis='columns') # Has only zeros
features3.head()

Unnamed: 0,Page Popularity/likes,Page Checkins,Page talking about,Page Category,min1,min2,min3,min4,min5,max1,...,Post Published Friday,Post Published Saturday,Base DateTime Sunday,Base DateTime Monday,Base DateTime Tuesday,Base DateTime Wednesday,Base DateTime Thursday,Base DateTime Friday,Base DateTime Saturday,Target Variable
0,634995,0,463,1,0.0,1280.0,13.158779,1.0,94.99364,0.0,...,0,0,0,0,0,0,1,0,0,0
1,634995,0,463,1,0.0,1280.0,13.158779,1.0,94.99364,0.0,...,0,0,1,0,0,0,0,0,0,0
2,634995,0,463,1,0.0,1280.0,13.158779,1.0,94.99364,0.0,...,1,0,0,0,0,0,0,0,1,0
3,634995,0,463,1,0.0,1280.0,13.158779,1.0,94.99364,0.0,...,1,0,0,1,0,0,0,0,0,0
4,634995,0,463,1,0.0,1280.0,13.158779,1.0,94.99364,0.0,...,0,0,0,0,0,0,1,0,0,0
