In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Silence all Python warnings for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from pandas/numpy —
# consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')
# Disable truncation when displaying DataFrames/Series: show every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [20]:
# Load the raw CSV; header=None because the file has no header row, so pandas
# assigns integer column labels until names are set explicitly.
data = pd.read_csv("Features_Variant_1.csv", header=None)

The original dataset did not contain column headers. We manually assigned meaningful column names based on the dataset's structure and feature definitions.

In [21]:
# Weekday order used by both groups of one-hot day indicator columns.
_weekday_order = [
    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
]

# Column labels, listed in the positional order in which they are applied to the frame.
column_names_updated = (
    # Page-level attributes.
    ["Page_Popularity_Likes", "Page_Checkins", "Page_Talking_About", "Page_Category"]
    # Derived_1 ... Derived_25.
    + [f"Derived_{number}" for number in range(1, 26)]
    # Comment-count features.
    + [
        "CC1_Total_Comments",
        "CC2_Comments_Last_24h",
        "CC3_Comments_48_to_24h",
        "CC4_Comments_First_24h",
        "CC5_CC2_CC3_Difference",
    ]
    # Post-level attributes.
    + ["Base_Time", "Post_Length", "Post_Share_Count", "Post_Promotion_Status", "H_Local"]
    # One-hot weekday indicators for the publication day and for the base date.
    + [f"Published_{weekday}" for weekday in _weekday_order]
    + [f"BaseDate_{weekday}" for weekday in _weekday_order]
    # Prediction target.
    + ["Target_Comment_Volume"]
)

data.columns = column_names_updated

Map page category IDs to readable labels, then one-hot encode the column for modeling.


In [22]:
# Lookup table from numeric page-category ID (keys 1 through 106) to a readable label.
# Used to translate the Page_Category column before it is one-hot encoded.
category_mapping = {
    1: "Product/service", 2: "Public figure", 3: "Retail and consumer merchandise",
    4: "Athlete", 5: "Education website", 6: "Arts/entertainment/nightlife",
    7: "Aerospace/defense", 8: "Actor/director", 9: "Professional sports team",
    10: "Travel/leisure", 11: "Arts/humanities website", 12: "Food/beverages",
    13: "Record label", 14: "Movie", 15: "Song", 16: "Community",
    17: "Company", 18: "Artist", 19: "Non-governmental organization (NGO)",
    20: "Media/news/publishing", 21: "Cars", 22: "Clothing", 23: "Local business",
    24: "Musician/band", 25: "Politician", 26: "News/media website",
    27: "Education", 28: "Author", 29: "Sports event", 30: "Restaurant/cafe",
    31: "School sports team", 32: "University", 33: "TV show",
    34: "Website", 35: "Outdoor gear/sporting goods", 36: "Political party",
    37: "Sports league", 38: "Entertainer", 39: "Church/religious organization",
    40: "Non-profit organization", 41: "Automobiles and parts", 42: "TV channel",
    43: "Telecommunication", 44: "Entertainment website", 45: "Shopping/retail",
    46: "Personal blog", 47: "App page", 48: "Vitamins/supplements",
    49: "Professional services", 50: "Movie theater", 51: "Software",
    52: "Magazine", 53: "Electronics", 54: "School", 55: "Just for fun",
    56: "Club", 57: "Comedian", 58: "Sports venue", 59: "Sports/recreation/activities",
    60: "Publisher", 61: "TV network", 62: "Health/medical/pharmacy",
    63: "Studio", 64: "Home decor", 65: "Jewelry/watches", 66: "Writer",
    67: "Health/beauty", 68: "Music video", 69: "Appliances", 70: "Computers/technology",
    71: "Insurance company", 72: "Music award", 73: "Recreation/sports website",
    74: "Reference website", 75: "Business/economy website", 76: "Bar",
    77: "Album", 78: "Games/toys", 79: "Camera/photo", 80: "Book",
    81: "Producer", 82: "Landmark", 83: "Cause", 84: "Organization",
    85: "TV/movie award", 86: "Hotel", 87: "Health/medical/pharmaceuticals",
    88: "Transportation", 89: "Local/travel website", 90: "Musical instrument",
    91: "Radio station", 92: "Other", 93: "Computers", 94: "Phone/tablet",
    95: "Coach", 96: "Tools/equipment", 97: "Internet/software",
    98: "Bank/financial institution", 99: "Society/culture website",
    100: "Small business", 101: "News personality", 102: "Teens/kids website",
    103: "Government official", 104: "Photographer", 105: "Spas/beauty/personal care",
    106: "Video game"
}

# Translate numeric category IDs into labels. Series.map returns NaN for any ID
# that is not a key of category_mapping.
page_category_labels = data["Page_Category"].map(category_mapping)

# Fail loudly on unmapped IDs: get_dummies encodes NaN as an all-zero row by default,
# which with drop_first=True cannot be told apart from the dropped baseline category.
unmapped_category_ids = data.loc[page_category_labels.isna(), "Page_Category"].unique().tolist()
if unmapped_category_ids:
    raise ValueError(
        f"Page_Category contains IDs missing from category_mapping: {unmapped_category_ids}"
    )

data["Page_Category"] = page_category_labels
# drop_first=True omits the first category level's indicator to avoid a redundant column.
data = pd.get_dummies(data, columns=["Page_Category"], drop_first=True)

Drop the `Post_Promotion_Status` column, since its value is always 0 and therefore carries no information.


In [23]:
# Remove the promotion-status column by name.
data = data.drop(columns=["Post_Promotion_Status"])

We replaced the one-hot encoding of the day of the week (both the `Published_*` and `BaseDate_*` indicator columns) with a cyclical encoding based on sine and cosine transformations. This representation is more compact (four columns instead of fourteen) and captures the natural periodicity of the week — for example, it places Sunday and Monday next to each other. Unlike one-hot encoding, cyclical encoding preserves the relative distance and circular continuity between days of the week, which can lead to better performance in models that assume numeric relationships between features.

In [24]:
# Weekday names in the order used by the one-hot indicator columns.
weekday_names = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

# Indicator column name -> weekday name, for the publication day and the base date.
published_day_mapping = {f"Published_{day}": day for day in weekday_names}
basedate_day_mapping = {f"BaseDate_{day}": day for day in weekday_names}

# Collapse each group of seven indicator columns into a single weekday-name column.
for day_label_col, flag_to_day in (
    ("Published_Day", published_day_mapping),
    ("BaseDate_Day", basedate_day_mapping),
):
    day_flags = data[list(flag_to_day)]

    # idxmax returns the first column on ties, so a row with no flag set would be
    # silently labelled "Sunday" and a row with several flags would keep only the
    # earliest one. Require a valid one-hot row (values in {0, 1}, exactly one set).
    is_valid_one_hot = day_flags.isin([0, 1]).all(axis=1) & day_flags.sum(axis=1).eq(1)
    if not is_valid_one_hot.all():
        raise ValueError(
            f"{int((~is_valid_one_hot).sum())} row(s) do not have exactly one "
            f"{day_label_col} indicator set"
        )

    data[day_label_col] = day_flags.idxmax(axis=1).map(flag_to_day)

# The indicator columns are now redundant; rebind instead of mutating in place.
data = data.drop(columns=[*published_day_mapping, *basedate_day_mapping])

# Cyclical (sine/cosine) encoding keeps neighbouring weekdays close together and needs
# two columns per day feature instead of seven one-hot indicators.
def encode_day(day_name):
    """Return the cyclical ``(sin, cos)`` encoding of a weekday name.

    Weekdays are indexed Monday=0 through Sunday=6 and mapped to the angle
    ``2 * pi * index / 7``, so the week wraps around and Sunday sits next to Monday.

    Parameters
    ----------
    day_name : str
        Full English weekday name with a leading capital, e.g. ``"Monday"``.

    Returns
    -------
    tuple
        ``(sin(angle), cos(angle))`` for the weekday's angle.

    Raises
    ------
    ValueError
        If ``day_name`` is not one of the seven recognised weekday names
        (this includes ``None`` and NaN).
    """
    day_map = {
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2,
        'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6
    }
    try:
        day_idx = day_map[day_name]
    except KeyError:
        # Without this guard an unknown name would surface as an opaque TypeError
        # from arithmetic on None.
        raise ValueError(
            f"Unrecognised weekday name: {day_name!r}; expected one of {list(day_map)}"
        ) from None

    angle = 2 * np.pi * day_idx / 7
    return (np.sin(angle), np.cos(angle))

# Add <name>_Sin / <name>_Cos columns for each weekday-name column.
for day_col in ('Published_Day', 'BaseDate_Day'):
    # Each element is the (sin, cos) tuple returned by encode_day for that row.
    sin_cos_pairs = data[day_col].apply(encode_day).tolist()
    data[f"{day_col}_Sin"] = [pair[0] for pair in sin_cos_pairs]
    data[f"{day_col}_Cos"] = [pair[1] for pair in sin_cos_pairs]

# The weekday-name columns are no longer needed once encoded.
data.drop(columns=['Published_Day', 'BaseDate_Day'], inplace=True)

In [25]:
# Move 'Target_Comment_Volume' to the end of the DataFrame: pop() removes the column
# and returns it, and re-assigning it under the same name appends it as the last column.
target_col = data.pop("Target_Comment_Volume")
data["Target_Comment_Volume"] = target_col
# Preview the first rows of the reordered frame.
data.head()

Unnamed: 0,Page_Popularity_Likes,Page_Checkins,Page_Talking_About,Derived_1,Derived_2,Derived_3,Derived_4,Derived_5,Derived_6,Derived_7,Derived_8,Derived_9,Derived_10,Derived_11,Derived_12,Derived_13,Derived_14,Derived_15,Derived_16,Derived_17,Derived_18,Derived_19,Derived_20,Derived_21,Derived_22,Derived_23,Derived_24,Derived_25,CC1_Total_Comments,CC2_Comments_Last_24h,CC3_Comments_48_to_24h,CC4_Comments_First_24h,CC5_CC2_CC3_Difference,Base_Time,Post_Length,Post_Share_Count,H_Local,Page_Category_Album,Page_Category_App page,Page_Category_Artist,Page_Category_Arts/entertainment/nightlife,Page_Category_Arts/humanities website,Page_Category_Athlete,Page_Category_Author,Page_Category_Bar,Page_Category_Book,Page_Category_Business/economy website,Page_Category_Camera/photo,Page_Category_Cars,Page_Category_Cause,Page_Category_Church/religious organization,Page_Category_Clothing,Page_Category_Club,Page_Category_Comedian,Page_Category_Community,Page_Category_Company,Page_Category_Computers,Page_Category_Education,Page_Category_Education website,Page_Category_Entertainer,Page_Category_Entertainment website,Page_Category_Food/beverages,Page_Category_Health/beauty,Page_Category_Health/medical/pharmaceuticals,Page_Category_Health/medical/pharmacy,Page_Category_Just for fun,Page_Category_Landmark,Page_Category_Local business,Page_Category_Local/travel website,Page_Category_Media/news/publishing,Page_Category_Movie,Page_Category_Movie theater,Page_Category_Music award,Page_Category_Music video,Page_Category_Musical instrument,Page_Category_Musician/band,Page_Category_News personality,Page_Category_News/media website,Page_Category_Non-governmental organization (NGO),Page_Category_Non-profit organization,Page_Category_Other,Page_Category_Outdoor gear/sporting goods,Page_Category_Personal blog,Page_Category_Political party,Page_Category_Politician,Page_Category_Producer,Page_Category_Product/service,Page_Category_Professional services,Page_Category_Professional sports 
team,Page_Category_Public figure,Page_Category_Publisher,Page_Category_Radio station,Page_Category_Record label,Page_Category_Recreation/sports website,Page_Category_Restaurant/cafe,Page_Category_Retail and consumer merchandise,Page_Category_School,Page_Category_School sports team,Page_Category_Shopping/retail,Page_Category_Small business,Page_Category_Software,Page_Category_Song,Page_Category_Spas/beauty/personal care,Page_Category_Sports event,Page_Category_Sports venue,Page_Category_Sports/recreation/activities,Page_Category_Studio,Page_Category_TV channel,Page_Category_TV network,Page_Category_TV show,Page_Category_TV/movie award,Page_Category_Tools/equipment,Page_Category_Travel/leisure,Page_Category_University,Page_Category_Video game,Page_Category_Website,Page_Category_Writer,Published_Day_Sin,Published_Day_Cos,BaseDate_Day_Sin,BaseDate_Day_Cos,Target_Comment_Volume
0,634995,0,463,0.0,806.0,11.291045,1.0,70.495138,0.0,806.0,7.574627,0.0,69.435826,0.0,76.0,2.604478,0.0,8.505502,0.0,806.0,10.649254,1.0,70.254788,-69.0,806.0,4.970149,0.0,69.85058,0,0,0,0,0,65,166,2,24,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.974928,-0.222521,-0.974928,-0.222521,0
1,634995,0,463,0.0,806.0,11.291045,1.0,70.495138,0.0,806.0,7.574627,0.0,69.435826,0.0,76.0,2.604478,0.0,8.505502,0.0,806.0,10.649254,1.0,70.254788,-69.0,806.0,4.970149,0.0,69.85058,0,0,0,0,0,10,132,1,24,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.433884,-0.900969,-0.433884,-0.900969,0
2,634995,0,463,0.0,806.0,11.291045,1.0,70.495138,0.0,806.0,7.574627,0.0,69.435826,0.0,76.0,2.604478,0.0,8.505502,0.0,806.0,10.649254,1.0,70.254788,-69.0,806.0,4.970149,0.0,69.85058,0,0,0,0,0,14,133,2,24,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.433884,-0.900969,-0.974928,-0.222521,0
3,634995,0,463,0.0,806.0,11.291045,1.0,70.495138,0.0,806.0,7.574627,0.0,69.435826,0.0,76.0,2.604478,0.0,8.505502,0.0,806.0,10.649254,1.0,70.254788,-69.0,806.0,4.970149,0.0,69.85058,7,0,3,7,-3,62,131,1,24,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.433884,-0.900969,0.0,1.0,0
4,634995,0,463,0.0,806.0,11.291045,1.0,70.495138,0.0,806.0,7.574627,0.0,69.435826,0.0,76.0,2.604478,0.0,8.505502,0.0,806.0,10.649254,1.0,70.254788,-69.0,806.0,4.970149,0.0,69.85058,1,0,0,1,0,58,142,5,24,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,1.0,0.974928,-0.222521,0


In [29]:
# Print a structural summary of the frame (index range, column count, dtypes, memory usage).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Columns: 122 entries, Page_Popularity_Likes to Target_Comment_Volume
dtypes: bool(80), float64(29), int64(13)
memory usage: 16.2 MB


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Page_Popularity_Likes,Page_Checkins,Page_Talking_About,Derived_1,Derived_2,Derived_3,Derived_4,Derived_5,Derived_6,Derived_7,Derived_8,Derived_9,Derived_10,Derived_11,Derived_12,Derived_13,Derived_14,Derived_15,Derived_16,Derived_17,Derived_18,Derived_19,Derived_20,Derived_21,Derived_22,Derived_23,Derived_24,Derived_25,CC1_Total_Comments,CC2_Comments_Last_24h,CC3_Comments_48_to_24h,CC4_Comments_First_24h,CC5_CC2_CC3_Difference,Base_Time,Post_Length,Post_Share_Count,H_Local,Published_Day_Sin,Published_Day_Cos,BaseDate_Day_Sin,BaseDate_Day_Cos,Target_Comment_Volume
count,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0
mean,1313814.0,4676.133752,44800.25,1.586241,443.333854,55.720384,35.645535,67.464151,0.219468,285.187428,22.186647,7.503724,40.474206,0.024103,268.358275,19.649686,4.921537,38.728848,1.497253,415.395297,52.631591,34.04237,63.154906,-220.046619,275.403722,2.536961,-2.020904,55.840996,55.720384,22.186647,19.649686,52.631591,2.536961,35.322035,163.65247,117.249823,23.767833,0.040547,-0.014017,0.003507,-0.024221,7.322889
std,6785752.0,20593.184863,110933.8,20.753174,496.695198,86.933548,69.960232,81.568249,10.055146,374.441728,36.930662,21.778756,54.277774,1.98136,327.063844,31.094112,13.245799,50.846434,18.715475,472.380251,81.264281,66.153081,76.403985,281.814185,373.330611,17.544907,14.720873,73.811134,136.975705,77.124263,71.078576,128.17992,94.202974,20.916864,376.264387,945.006667,1.919829,0.706509,0.706419,0.710738,0.703048,35.49455
min,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1366.0,-204.0,-210.5,-288.0,0.0,0.0,0.0,0.0,0.0,-1366.0,0.0,0.0,1.0,1.0,-0.974928,-0.900969,-0.974928,-0.900969,0.0
25%,36734.0,0.0,698.0,0.0,45.0,5.527273,2.0,8.278756,0.0,26.0,1.91129,0.0,4.109465,0.0,26.0,2.030303,0.0,4.09458,0.0,42.0,5.218182,2.0,7.600215,-310.0,23.0,-0.483221,-2.0,5.99095,2.0,0.0,0.0,2.0,-6.0,17.0,38.0,2.0,24.0,-0.781831,-0.900969,-0.781831,-0.900969,0.0
50%,292911.0,0.0,7045.0,0.0,241.0,23.374101,12.0,35.06914,0.0,118.0,8.4375,2.0,17.382709,0.0,116.0,8.58427,1.0,18.639984,0.0,224.0,21.859375,12.0,32.368848,-92.0,109.0,0.27381,0.0,25.547172,11.0,2.0,0.0,10.0,0.0,35.0,97.0,13.0,24.0,0.0,-0.222521,0.0,-0.222521,0.0
75%,1204214.0,99.0,50264.0,0.0,717.0,71.828829,42.0,102.554954,0.0,405.0,29.005525,8.0,60.760334,0.0,381.0,24.84252,5.0,54.523165,0.0,676.0,67.913793,40.0,96.266919,-21.0,379.0,2.974684,0.0,81.209289,46.0,12.0,9.0,44.0,3.0,53.0,172.0,61.0,24.0,0.781831,0.62349,0.781831,0.62349,3.0
max,486972300.0,186370.0,6089942.0,2341.0,2341.0,2341.0,2341.0,731.394558,1923.0,2079.0,1923.0,1923.0,469.538781,324.0,1605.0,437.684211,433.0,533.638557,1923.0,2184.0,1923.0,1923.0,703.14405,1923.0,2079.0,1923.0,1923.0,749.7096,2341.0,2079.0,1605.0,2184.0,2079.0,72.0,21480.0,144860.0,24.0,0.974928,1.0,0.974928,1.0,1305.0


In [30]:
# Count missing values per column.
data.isnull().sum()

Page_Popularity_Likes                                0
Page_Checkins                                        0
Page_Talking_About                                   0
Derived_1                                            0
Derived_2                                            0
Derived_3                                            0
Derived_4                                            0
Derived_5                                            0
Derived_6                                            0
Derived_7                                            0
Derived_8                                            0
Derived_9                                            0
Derived_10                                           0
Derived_11                                           0
Derived_12                                           0
Derived_13                                           0
Derived_14                                           0
Derived_15                                           0
Derived_16

In [27]:
# Write the preprocessed frame to CSV; index=False leaves the row index out of the file.
data.to_csv("Preprocessed_Data.csv", index=False)