# Data upload

In [0]:

# Colab setup: uncomment to install the downloader package if it is missing.
#!pip install googledrivedownloader
import os.path

# Create the download directory with an absolute path and exist_ok=True so the
# cell can be re-run: a relative `mkdir data_files` would create a nested
# data_files/data_files folder once the working directory has been changed below.
os.makedirs('/content/data_files', exist_ok=True)
os.chdir('/content/data_files')

from google_drive_downloader import GoogleDriveDownloader as gdd
# Download the zipped dataset by its Google Drive file id and unzip it (unzip=True).
gdd.download_file_from_google_drive(file_id='1Vy6G1IFiWdTGoHo7YaNVYw7v9Xw3XnuT',
                                    dest_path='/content/data_files/Features_Variant_1.zip',
                                    unzip=True)


Downloading 1Vy6G1IFiWdTGoHo7YaNVYw7v9Xw3XnuT into /content/data_files/Features_Variant_1.zip... Done.
Unzipping...Done.


# DataFrame Preparation

In [0]:
import pandas as pd
import numpy as np
import math

# Load the unzipped CSV without treating any row as a header (header=None) and
# drop exact duplicate rows.
data=pd.read_csv('/content/data_files/Features_Variant_1.csv', header=None).drop_duplicates()

In [0]:
# Build the 54 column labels of the dataset, in file order.

# Columns 1-4: single-valued page features.
leading_names = ['page_likes_num',  # 1
                 'page_checkins',  # 2
                 'page_talking_about',  # 3
                 'page_cat']  # 4

# Columns 5-29: page statistics (mean, avg etc.). The first keeps the bare
# name, the following 24 get a running numeric suffix.
statistics_names = ['page_statistics'] + ['page_statistics_' + str(number) for number in range(1, 25)]

# Columns 30-39: comment counts and post attributes.
post_names = ['comments_num_before_base_time',  # 30
              'comments_num_in_last_24_hours',  # 31, last day
              'comments_num_in_last_48_to_24_hours',  # 32, day before last
              'comments_num_in_first_24_hours',  # 33
              'comments_difference_in_last_two_days',  # 34 (32-31)
              'base_time',  # 35
              'character_num_in_post',  # 36
              'share_num',  # 37
              'post_promotion',  # 38, binary
              'h_local']  # 39, the H hrs for which the target (comments received) is given

weekday=('sunday', 'monday','tuesday', 'wednesday', 'thursday', 'friday', 'saturday')

# Columns 40-46: one indicator per weekday (Sunday...Saturday) on which the post was published.
published_names = ['post_published_weekday' + '_' + day for day in weekday]
# Columns 47-53: one indicator per weekday (Sunday...Saturday) of the selected base date/time.
base_names = ['base_ditetime_weekday' + '_' + day for day in weekday]

features_names=np.array(
    leading_names
    + statistics_names
    + post_names
    + published_names
    + base_names
    + ['target']  # 54, the number of comments in the next H hrs (H is feature 39)
)

# Label the DataFrame columns with the generated feature names.
data.columns=features_names

In [0]:
# Count the distinct values in every column and sort ascending, so columns
# with the fewest distinct values come first.
y=data.nunique(axis=0).sort_values()
print(y.head())
# post_promotion has a single distinct value (only zeros, per the original note),
# so it is not informative and the column is dropped.
data=data.drop(columns=['post_promotion'])

post_promotion                      1
post_published_weekday_sunday       2
post_published_weekday_monday       2
post_published_weekday_tuesday      2
post_published_weekday_wednesday    2
dtype: int64


# Data preparation

In [0]:
#'page_cat' to dummy because it is a category
#data=pd.concat([data,pd.get_dummies(data['page_cat'], prefix='page_cat')], axis=1)
#data=data.drop(columns=['page_cat'])

In [0]:
# Shuffle the rows and renumber the index from 0.
# NOTE(review): no random_state is passed to shuffle, so the row order (and
# every result computed from it) differs between runs.
from sklearn.utils import shuffle
data=shuffle(data).reset_index(drop=True)

In [0]:
def normalize(df):
    """Standardize each column of a DataFrame: (x - column mean) / column std.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to standardize. It is not modified.

    Returns
    -------
    tuple
        ``(normalized, mean, std)`` where ``normalized`` is a new DataFrame and
        ``mean`` / ``std`` are Series indexed by column name, suitable for
        passing to ``normalize_with_params``.
    """
    # Ask pandas for per-column statistics explicitly. np.mean(df)/np.std(df)
    # forward axis=None, which newer pandas interprets as "reduce over the whole
    # frame" and returns a single scalar instead of one value per column.
    mean = df.mean(axis=0)
    # ddof=0 keeps the population standard deviation that np.std used by default.
    # A zero deviation (constant column) is replaced by 1 to avoid dividing by zero.
    std = df.std(axis=0, ddof=0).replace(0, 1)
    return (df - mean) / std, mean, std

def normalize_with_params(df, mean, std):
    """Standardize ``df`` with an already computed mean and standard deviation.

    Returns ``(df - mean) / std``; the inputs are left untouched.
    """
    centered = df - mean
    return centered / std

# Learning

In [0]:
def custom_RMSE(target, pred, gamma, w):
    """Square root of the mean squared error plus a weight-norm penalty.

    Computes ``sqrt(sum((target - pred)**2) / len(pred) + gamma * ||w||)``,
    where ``||w||`` is ``np.linalg.norm(w)``.
    """
    mean_squared_error = np.sum((target - pred) ** 2) / len(pred)
    penalty = gamma * np.linalg.norm(w)
    return np.sqrt(mean_squared_error + penalty)

def get_para_w(old_w, df, pred, lam, gamma, batch_counter,terms_num):
    """Return the weight vector after one update on a single batch.

    The batch is the row slice ``[batch_counter*terms_num, (batch_counter+1)*terms_num)``
    of ``df``; the last column of ``df`` is the target and the others are features.
    The step is ``2 * lam * (residuals . features) / len(pred)``, i.e. it is divided
    by the length of the full prediction array, not by the batch size.
    ``gamma`` is accepted but not used here. ``old_w`` is not modified.
    """
    start = batch_counter * terms_num
    stop = start + terms_num
    batch = df.iloc[start:stop]
    # Residuals (target - prediction) of the batch rows as a plain array.
    residuals = (batch.iloc[:, -1] - pred[start:stop]).values
    step = np.dot(residuals, batch.iloc[:, :-1].values) / len(pred)
    return old_w + 2 * lam * step

def get_para_w0(old_w0, df, pred, lam, gamma, batch_counter,terms_num):
    """Return the bias term after one update on a single batch.

    Uses the same row slice as ``get_para_w``: the summed residuals
    (last column of ``df`` minus ``pred``) of the batch, divided by ``len(pred)``
    and scaled by ``2 * lam``, are added to ``old_w0``. ``gamma`` is not used.
    """
    start = batch_counter * terms_num
    stop = start + terms_num
    residual_sum = np.sum(df.iloc[start:stop, -1] - pred[start:stop])
    return old_w0 + 2 * lam * (residual_sum / len(pred))

def get_prediction(w,w0,x_df):
    """Linear model output: ``np.dot(x_df, w) + w0`` for every row of ``x_df``."""
    weighted_sum = np.dot(x_df, w)
    return weighted_sum + w0
    
def gradient_descend(df, lam, gamma, terms_num, max_iter):
    """Fit a linear model by batched gradient steps and return the best parameters seen.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; the last column (named 'target') is the target and all
        other columns are features.
    lam : float
        Step-size factor passed to the update helpers.
    gamma : float
        Weight of the norm penalty in ``custom_RMSE``. It is forwarded to
        ``get_para_w`` / ``get_para_w0``, which do not use it.
    terms_num : int
        Number of rows per batch.
    max_iter : int
        Maximum number of passes over the (re-shuffled) data.

    Returns
    -------
    numpy.ndarray
        The weights followed by the bias (``np.append(w, w0)``) that gave the
        lowest ``custom_RMSE`` on ``df`` at any point during training.

    Notes
    -----
    Weights are initialised with ``np.random.rand`` and rows are shuffled each
    pass, so results vary between runs. Prints the iteration count, the best
    RMSE and the R2 of the best parameters on ``df``.
    """
    df=df.reset_index(drop=True)
    w=np.random.rand(df.shape[1]-1)#one column is the target
    w0=0.0
    prediction=get_prediction(w,w0,df.iloc[:,:-1])
    minRMSE=custom_RMSE(df['target'], prediction, gamma, w)

    best_params=np.append(w,w0)

    curr_err=10  # any value above `err`, so the first pass always runs
    err=1e-4  # stop once predictions change by less than this (L2 norm) over a pass

    iter_num=0
    while curr_err>err and iter_num<max_iter:
        iter_num+=1
        # Row order changes every pass; the index is deliberately not reset.
        df=shuffle(df)
        prediction=get_prediction(w,w0,df.iloc[:,:-1])
        pass_start_prediction=prediction.copy()

        for batch_counter in range(0, math.ceil(df.shape[0]/terms_num)):
            w=get_para_w(w, df, prediction, lam, gamma, batch_counter, terms_num)
            w0=get_para_w0(w0, df, prediction, lam, gamma, batch_counter, terms_num)
            prediction=get_prediction(w,w0,df.iloc[:,:-1])

            # Evaluate once per batch and remember the best parameters so far.
            batch_rmse=custom_RMSE(df['target'], prediction, gamma, w)
            if batch_rmse<minRMSE:
                minRMSE=batch_rmse
                best_params=np.append(w,w0)
        curr_err=np.linalg.norm(pass_start_prediction-prediction)

    print('Iterations number is:' +str(iter_num))
    if iter_num==max_iter:
        print('The maximum number of iterations was reached.')
    print('RMSE is:'+str(minRMSE))
    # Recompute the best predictions on the current row order. Keeping the array
    # from the pass in which the best RMSE was found would pair it with targets
    # in a different (later) shuffle order and report a wrong R2.
    best_pred=get_prediction(best_params[:-1],best_params[-1],df.iloc[:,:-1])
    print('R2 is:'+str(custom_R2(df.iloc[:,-1], best_pred)))

    return best_params

In [0]:
#set algo params:
lam=1e-4 # step-size factor applied in every weight / bias update
gamma=0 # weight of the norm penalty in custom_RMSE; 0 switches it off (not needed for now)
terms_num=100 # number of rows taken per batch
max_iter=50 # upper bound on the number of passes in gradient_descend

# Cross-validation

In [0]:
def custom_R2(target, pred):
    """Coefficient of determination: 1 - (residual sum of squares / total sum of squares)."""
    residual_ss = np.sum((target - pred) ** 2)
    total_ss = np.sum((target - np.mean(target)) ** 2)
    return 1 - residual_ss / total_ss

In [0]:
# Standardize every feature column (all but the last, target, column).
# The result is combined with the untouched target column into a new frame
# instead of being written back through .iloc: assigning float values in place
# into an integer-typed column is deprecated upcasting in recent pandas.
features_norm, mean, std = normalize(data.iloc[:, :-1])
data = pd.concat([features_norm, data.iloc[:, -1:]], axis=1)
# NOTE(review): mean/std are computed on the whole dataset, so each test fold
# influences the scaling of its training folds. Fitting normalize() on the
# training part and applying normalize_with_params() to the test part inside
# the loop would avoid that - confirm before changing, since it alters results.

# One column per fold: [RMSE, r2, weights..., bias].
res=pd.DataFrame()

# Five folds; idxs holds the positional start of each fold plus the end of the data.
parts=range(1,6)
idxs=[round(data.shape[0]/5)*(i-1) for i in range(1,6)]
idxs.append(data.shape[0])

for part in parts:
    # Rows idxs[part-1]:idxs[part] form the test fold, the rest the training set.
    test_part=data.iloc[idxs[part-1]:idxs[part]]
    train_part=data.drop(data.index[idxs[part-1]:idxs[part]])

    params=gradient_descend(train_part, lam, gamma, terms_num, max_iter) #model
    # params is weights followed by the bias.
    test_pred=get_prediction(params[:-1],params[-1],test_part.iloc[:,:-1])
    RMSE=custom_RMSE(test_part['target'], test_pred, gamma, params[:-1])
    r2=custom_R2(test_part['target'], test_pred)
    # Put the two test metrics in front of the parameters for this fold's column.
    params=np.insert(params, 0 , [RMSE,r2])
    res[part]=params

Iterations number is:50
The maximum number of iterations was reached.
RMSE is:32.806873808890316
R2 is:0.10684338012401384
Iterations number is:50
The maximum number of iterations was reached.
RMSE is:33.46414017423727
R2 is:0.08920418485216919
Iterations number is:50
The maximum number of iterations was reached.
RMSE is:34.10810225861931
R2 is:0.09499597547507233
Iterations number is:50
The maximum number of iterations was reached.
RMSE is:34.394837767726926
R2 is:0.09025211747828188
Iterations number is:50
The maximum number of iterations was reached.
RMSE is:34.24798782592698
R2 is:0.08358066142511944


In [0]:
# Row labels for `res`: the two metric rows, one label per weight, then the bias.
# The labels are assembled as a Python list before conversion: np.insert casts
# inserted values to the existing array's fixed-width string dtype, which
# shortened 'RMSE' to 'RMS' when the array only held labels like 'w12'.
weight_labels=['w'+str(i) for i in range(1,res.shape[0]-2)]
text_idxs=np.array(['RMSE', 'r2']+weight_labels+['w0'])
res.index=text_idxs

# Summarise across the fold columns only, so re-running the cell does not fold
# previously added 'mean' / 'std' columns into the statistics.
fold_columns=[col for col in res.columns if col not in ('mean', 'std')]
res['mean']=res[fold_columns].mean(axis=1)
# ddof=0 matches the population standard deviation that np.std computes by default.
res['std']=res[fold_columns].std(axis=1, ddof=0)

In [25]:
res.iloc[:2]

Unnamed: 0,1,2,3,4,5,mean,std
RMS,36.969736,35.699868,32.172388,31.030538,32.713166,33.717139,2.241546
r2,0.07645,0.077958,0.106228,0.123734,0.093658,0.095605,0.017811


In [26]:
res

Unnamed: 0,1,2,3,4,5,mean,std
RMS,36.969736,35.699868,32.172388,31.030538,32.713166,33.717139,2.241546
r2,0.07645,0.077958,0.106228,0.123734,0.093658,0.095605,0.017811
w1,0.938446,0.325477,0.339117,0.061975,0.310461,0.395095,0.29029
w2,0.354638,0.273066,0.548759,0.592264,0.954975,0.54474,0.23686
w3,0.529024,0.425892,0.174176,0.619457,0.411838,0.432077,0.149312
w4,0.385641,0.625691,0.361679,0.427055,0.050249,0.370063,0.185034
w5,0.081383,0.124455,0.553152,0.906826,0.754582,0.48408,0.331104
w6,0.232897,0.419218,0.104257,0.602522,0.484099,0.368598,0.178278
w7,0.242248,0.308165,0.189814,0.042077,0.611025,0.278666,0.187898
w8,0.836711,0.823026,0.233715,0.765148,0.404579,0.612636,0.246821
