In [3]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler


In [4]:
df = pd.read_csv("dataset_Facebook.csv", sep=';')
df.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393


In [5]:
# dropping comments, likes and share column
df.drop(df.iloc[:, 7:18], inplace = True, axis = 1)
df

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Total Interactions
0,139441,Photo,2,12,4,3,0.0,100
1,139441,Status,2,12,3,10,0.0,164
2,139441,Photo,3,12,3,3,0.0,80
3,139441,Photo,2,12,2,10,1.0,1777
4,139441,Photo,2,12,2,3,0.0,393
...,...,...,...,...,...,...,...,...
495,85093,Photo,3,1,7,2,0.0,84
496,81370,Photo,2,1,5,8,0.0,75
497,81370,Photo,1,1,5,2,0.0,115
498,81370,Photo,3,1,4,11,0.0,136


In [6]:
# There is one NaN value in Paid column and I will fill it with 0
df['Paid'] = df['Paid'].fillna(0)
df.isnull().any()

Page total likes      False
Type                  False
Category              False
Post Month            False
Post Weekday          False
Post Hour             False
Paid                  False
Total Interactions    False
dtype: bool

In [7]:
# function to convert week days into integers from 1 to 7
def convert_weekdays(x):
    if x == 1:
        return 'Sunday'
    elif x == 2:
        return 'Monday'
    elif x == 3:
        return 'Tuesday'
    elif x == 4:
        return 'Wednesday'
    elif x == 5:
        return 'Thursday'
    elif x == 6:
        return 'Friday'
    elif x == 7:
        return 'Saturday'

df['Weekday'] = df['Post Weekday'].apply(lambda x: convert_weekdays(x))

In [8]:
dayDf = pd.get_dummies(df['Weekday'])
dayDf

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
495,0,0,1,0,0,0,0
496,0,0,0,0,1,0,0
497,0,0,0,0,1,0,0
498,0,0,0,0,0,0,1


In [9]:
df = pd.concat([df, dayDf], axis = 1)

In [10]:
hours = list(range(0, 18))
for i in hours:
    hours[i] = str(hours[i])
    hours[i] = 'Hour_'+ hours[i]

In [12]:
hourDf = pd.get_dummies(df['Post Hour'], prefix='hr_')
df = pd.concat([df, hourDf], axis=1)
monthDf = pd.get_dummies(df['Post Month'], prefix='Mo')
df = pd.concat([df, monthDf], axis = 1)
df['Video'] = pd.get_dummies(df['Type'])['Video']
df['Status'] = pd.get_dummies(df['Type'])['Status']
df['Photo'] = pd.get_dummies(df['Type'])['Photo']
df['Category_1'] = pd.get_dummies(df['Category'])[1]
df['Category_2'] = pd.get_dummies(df['Category'])[2]
df['Category_3'] = pd.get_dummies(df['Category'])[3]
df

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Total Interactions,Weekday,Friday,...,Mo_6,Mo_7,Mo_8,Mo_9,Mo_10,Mo_11,Mo_12,Category_1,Category_2,Category_3
0,139441,Photo,2,12,4,3,0.0,100,Wednesday,0,...,0,0,0,0,0,0,1,0,1,0
1,139441,Status,2,12,3,10,0.0,164,Tuesday,0,...,0,0,0,0,0,0,1,0,1,0
2,139441,Photo,3,12,3,3,0.0,80,Tuesday,0,...,0,0,0,0,0,0,1,0,0,1
3,139441,Photo,2,12,2,10,1.0,1777,Monday,0,...,0,0,0,0,0,0,1,0,1,0
4,139441,Photo,2,12,2,3,0.0,393,Monday,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,85093,Photo,3,1,7,2,0.0,84,Saturday,0,...,0,0,0,0,0,0,0,0,0,1
496,81370,Photo,2,1,5,8,0.0,75,Thursday,0,...,0,0,0,0,0,0,0,0,1,0
497,81370,Photo,1,1,5,2,0.0,115,Thursday,0,...,0,0,0,0,0,0,0,1,0,0
498,81370,Photo,3,1,4,11,0.0,136,Wednesday,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Drop Type, Post Month, Post Weekday and Post hour columns from the dataframe
df.drop(['Category', 'Type', 'Post Month', 'Post Hour', 'Post Weekday', 'Weekday'], axis=1, inplace = True)
df

Unnamed: 0,Page total likes,Paid,Total Interactions,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,...,Mo_6,Mo_7,Mo_8,Mo_9,Mo_10,Mo_11,Mo_12,Category_1,Category_2,Category_3
0,139441,0.0,100,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1,139441,0.0,164,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
2,139441,0.0,80,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,139441,1.0,1777,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,139441,0.0,393,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,85093,0.0,84,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
496,81370,0.0,75,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
497,81370,0.0,115,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
498,81370,0.0,136,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [17]:
# getting the edge value of total interactions
outlier = np.percentile(df['Total Interactions'], 90)
outlier

409.1

In [18]:
df = df[df['Total Interactions']<outlier]

In [19]:
scaler = StandardScaler()
likes = df['Page total likes']
likes = likes.values.reshape(-1, 1)
scaler.fit(likes)
df['Page total likes'] = scaler.transform(likes)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Page total likes'] = scaler.transform(likes)


Unnamed: 0,Page total likes,Paid,Total Interactions,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,...,Mo_6,Mo_7,Mo_8,Mo_9,Mo_10,Mo_11,Mo_12,Category_1,Category_2,Category_3
0,1.013546,0.0,100,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1,1.013546,0.0,164,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
2,1.013546,0.0,80,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,1.013546,0.0,393,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
5,1.013546,0.0,186,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-2.288209,0.0,84,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
496,-2.514389,0.0,75,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
497,-2.514389,0.0,115,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
498,-2.514389,0.0,136,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


Data splitting into Train and Test.
I will use 80% for training and 20% for testing

In [20]:
df.columns

Index(['Page total likes', 'Paid', 'Total Interactions', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'hr__1',
       'hr__2', 'hr__3', 'hr__4', 'hr__5', 'hr__6', 'hr__7', 'hr__8', 'hr__9',
       'hr__10', 'hr__11', 'hr__12', 'hr__13', 'hr__14', 'hr__15', 'hr__16',
       'hr__17', 'hr__18', 'hr__19', 'hr__20', 'hr__22', 'hr__23', 'Mo_1',
       'Mo_2', 'Mo_3', 'Mo_4', 'Mo_5', 'Mo_6', 'Mo_7', 'Mo_8', 'Mo_9', 'Mo_10',
       'Mo_11', 'Mo_12', 'Video', 'Status', 'Photo', 'hr__1', 'hr__2', 'hr__3',
       'hr__4', 'hr__5', 'hr__6', 'hr__7', 'hr__8', 'hr__9', 'hr__10',
       'hr__11', 'hr__12', 'hr__13', 'hr__14', 'hr__15', 'hr__16', 'hr__17',
       'hr__18', 'hr__19', 'hr__20', 'hr__22', 'hr__23', 'Mo_1', 'Mo_2',
       'Mo_3', 'Mo_4', 'Mo_5', 'Mo_6', 'Mo_7', 'Mo_8', 'Mo_9', 'Mo_10',
       'Mo_11', 'Mo_12', 'Category_1', 'Category_2', 'Category_3'],
      dtype='object')

In [22]:
x = df.drop(['Total Interactions'], axis=1).values
y = df['Total Interactions'].values

In [24]:
x

array([[ 1.01354643,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.01354643,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.01354643,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-2.51438869,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-2.51438869,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-2.51438869,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [25]:
y

array([100, 164,  80, 393, 186, 279, 339, 192, 142, 252, 106, 104, 152,
       108,  54,  75,  42,  81,   0,  18,  96, 121, 106, 117, 174, 162,
        39, 204, 125,  75, 199, 113, 100, 269,  59, 262, 211,  36,  46,
        58,  71, 214,  32,  92,  14,  58, 337, 242,  87,  83,  81, 220,
        53,  90, 160, 123, 162, 202, 245,  69, 119,  55,  70, 119, 162,
       273, 233, 114,   0,  97,  64,  38, 101, 252, 122, 117, 158, 163,
       311,  39, 126,  91, 342, 129, 148, 185,  65, 142, 378,   0,  79,
        17,  72, 222, 238, 148, 271,  34,   0, 139,  97,  17,   6,   7,
         9,   8,   2, 232,   4,   6,   7,   9, 242,  10,   5,   3,  10,
        10,  10,  12,  34,  68,  48,  39, 154, 100, 202, 265,  53,  96,
       204,  94, 179,  35, 367, 200, 193, 387,  70, 163, 137, 252,  98,
       409, 282, 288,  18, 132,  84, 121, 217,  38, 228, 225, 243, 189,
       276, 299, 194, 199, 114, 271,  90,  72, 263,  67, 117, 164, 119,
        70, 133, 330,  73,  60,  32, 166,  45,  70, 243, 132,  4