In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
import numpy as np
import calendar

import warnings
warnings.filterwarnings('ignore')

# Load the raw Facebook post metrics and keep only the posting attributes
# (features) plus the single engagement metric used as the target.
df = pd.read_csv("facebook_original.csv")
df = df.drop(columns=['Page total likes','Lifetime Post Total Reach', 'Lifetime Post Total Impressions', 'Lifetime Engaged Users',
                      'Lifetime Post Consumptions', 'Lifetime Post Impressions by people who have liked your Page',
                      'Lifetime Post reach by people who like your Page', 'Lifetime People who have liked your Page and engaged with your post',
                      'comment', 'like', 'share', 'Total Interactions'])

# Positional rename: relies on exactly seven columns remaining, in this order.
df.columns = ['type', 'category', 'month', 'weekday', 'hour', 'paid', 'post_consumers']

# Replace the numeric category codes with readable labels so that the column is
# treated as categorical (and one-hot encoded) further down.
category_vals = {1: "action", 2: "product", 3:"inspiration"}
df = df.replace({"category": category_vals})

# Month number -> abbreviated month name ("Jan", "Feb", ...).
df['month'] = df['month'].apply(lambda x: calendar.month_abbr[x])

# Weekday number -> day name. calendar.day_name[0] is Monday, so code 1 maps to
# Monday here.
# NOTE(review): assumes the dataset numbers weekdays starting at Monday = 1 -
# TODO confirm against the dataset documentation.
df['weekday'] = df['weekday'].apply(lambda x: calendar.day_name[x-1])

# Hours are stored as strings so each hour becomes its own category.
df['hour'] = df['hour'].apply(str)


target_col = 'post_consumers'

feature_names = [col for col in df.columns if col != target_col]

# Rows with any missing value are removed; the surviving rows keep their
# original index labels, so the index may contain gaps.
pro_data = df.dropna()

# Split the features by dtype. Testing for "numeric" rather than comparing the
# dtype with "O" also classifies columns stored with pandas' dedicated string
# or categorical dtypes as non-numeric, instead of silently treating them as
# numbers and skipping their encoding.
num_cols = [column for column in feature_names if pd.api.types.is_numeric_dtype(pro_data[column])]
non_num_cols = [column for column in feature_names if column not in num_cols]

num_data = pro_data[num_cols]
cat_data = pro_data[non_num_cols]

In [2]:
# Inspect the cleaned frame (rows containing missing values were removed by dropna above).
pro_data

Unnamed: 0,type,category,month,weekday,hour,paid,post_consumers
0,Photo,product,Dec,Thursday,3,0.0,109
1,Status,product,Dec,Wednesday,10,0.0,1361
2,Photo,inspiration,Dec,Wednesday,3,0.0,113
3,Photo,product,Dec,Tuesday,10,1.0,790
4,Photo,product,Dec,Tuesday,3,0.0,410
...,...,...,...,...,...,...,...
494,Photo,inspiration,Jan,Sunday,10,0.0,756
495,Photo,inspiration,Jan,Sunday,2,0.0,708
496,Photo,product,Jan,Friday,8,0.0,508
497,Photo,action,Jan,Friday,2,0.0,572


In [3]:
# One-hot encode every categorical feature column.
# OneHotEncoder returns a SciPy sparse matrix by default, so the `sparse=`
# keyword (renamed to `sparse_output` and no longer accepted by recent
# scikit-learn releases) is omitted to keep the cell version-agnostic.
encoder = OneHotEncoder()
enc_data = encoder.fit_transform(cat_data)

# `get_feature_names` was replaced by `get_feature_names_out`; prefer the new
# method and fall back to the old one only on releases that lack it.
if hasattr(encoder, "get_feature_names_out"):
    enc_feature_names = encoder.get_feature_names_out(cat_data.columns)
else:
    enc_feature_names = encoder.get_feature_names(cat_data.columns)

# Wrap the sparse matrix in a sparse-backed DataFrame with "<column>_<value>" headers.
enc_df = pd.DataFrame.sparse.from_spmatrix(enc_data, columns = enc_feature_names)
enc_df

Unnamed: 0,type_Link,type_Photo,type_Status,type_Video,category_action,category_inspiration,category_product,month_Apr,month_Aug,month_Dec,...,hour_20,hour_22,hour_23,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
495,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
497,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Place the numeric features and the one-hot columns side by side. Both frames
# are renumbered 0..n-1 first so that they line up row by row regardless of the
# index labels left behind by dropna.
fin_data = pd.concat(
    [num_data.reset_index(drop = True), enc_df.reset_index(drop = True)],
    axis = 1,
    join = 'inner',
)

# Attach the target positionally. Assigning the Series itself would align on
# index labels, and pro_data still carries its pre-reset labels: any dropped row
# other than the last one would shift targets onto the wrong rows or leave NaN.
fin_data[target_col] = pro_data[target_col].to_numpy()

# Persist the model-ready table without the index column.
fin_data.to_csv("facebook.csv", index = False)

In [5]:
# Show the assembled table: numeric features, one-hot columns and the target column.
fin_data

Unnamed: 0,paid,type_Link,type_Photo,type_Status,type_Video,category_action,category_inspiration,category_product,month_Apr,month_Aug,...,hour_22,hour_23,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,post_consumers
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,109
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1361
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,113
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,790
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,756
495,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,708
496,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,508
497,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,572


In [6]:
import json

# Describe the column roles of the exported table: there are no continuous
# features, and every column except the target is listed as discrete.
# Note: this rebinds `num_cols`, which held a list of column names earlier.
num_cols = None
one_hot_cols = list(fin_data.columns.drop('post_consumers'))

col_dict = {"continuous": num_cols, "discrete": one_hot_cols}

# Serialise the mapping and write it with a context manager so the handle is
# closed even if the write fails.
form = json.dumps(col_dict)
with open("col_dict.json", "w") as f:
    f.write(form)

# Read the file back as a round-trip check; the `with` block closes the file,
# so no explicit close() is needed afterwards.
with open('col_dict.json') as file:
    col_dict = json.load(file)
col_dict

{'continuous': None,
 'discrete': ['paid',
  'type_Link',
  'type_Photo',
  'type_Status',
  'type_Video',
  'category_action',
  'category_inspiration',
  'category_product',
  'month_Apr',
  'month_Aug',
  'month_Dec',
  'month_Feb',
  'month_Jan',
  'month_Jul',
  'month_Jun',
  'month_Mar',
  'month_May',
  'month_Nov',
  'month_Oct',
  'month_Sep',
  'weekday_Friday',
  'weekday_Monday',
  'weekday_Saturday',
  'weekday_Sunday',
  'weekday_Thursday',
  'weekday_Tuesday',
  'weekday_Wednesday',
  'hour_1',
  'hour_10',
  'hour_11',
  'hour_12',
  'hour_13',
  'hour_14',
  'hour_15',
  'hour_16',
  'hour_17',
  'hour_18',
  'hour_19',
  'hour_2',
  'hour_20',
  'hour_22',
  'hour_23',
  'hour_3',
  'hour_4',
  'hour_5',
  'hour_6',
  'hour_7',
  'hour_8',
  'hour_9']}