In [1]:
import pandas as pd
import numpy as np

# dataframe preparation

In [2]:
data=pd.read_csv('data/Dataset/Dataset/Training/Features_Variant_1.csv', header=None).drop_duplicates()

In [3]:
#get and fit features names
features_names=np.array(['page_likes_num',#1
               'page_checkins',#2
               'page_talking_about',#3
                'page_cat',#4
                'page_statistics',#5-29 #mean, avg etc.
                'comments_num_before_base_time',#30
                'comments_num_in_last_24_hours',#31 #last day
                'comments_num_in_last_48_to_24_hours',#32 #day before last
                'comments_num_in_first_24_hours',#33
                'comments_difference_in_last_two_days', #34 (32-31)
                'base_time', #35
                'character_num_in_post', #36
                'share_num',#37
                'post_promotion', #38 binary
                'h_local', #39 This describes the H hrs, for which we have the target variable/ comments received. 
                'post_published_weekday', #40-46 This represents the day(Sunday...Saturday) on which the post was published. 
                'base_ditetime_weekday', #47-53 This represents the day(Sunday...Saturday) on selected base Date/Time. 
                'target' #54 The no of comments in next H hrs(H is given in Feature no 39).                
               ])

for index in range(5,29):
    features_names=np.insert(features_names, index, features_names[4]+'_'+str(index-4))
    
weekday=('sunday', 'monday','tuesday', 'wednesday', 'thursday', 'friday', 'saturday')    

for index in range(40,47):
    features_names=np.insert(features_names,index, features_names[39]+'_'+ weekday[index-40])
features_names=np.delete(features_names, 39)

for index in range(47,54):
    features_names=np.insert(features_names,index, features_names[46]+'_'+ weekday[index-47])
features_names=np.delete(features_names, 46)

data.columns=features_names

# data preparation

In [4]:
#'page_cat' to dummy because it is a category
data=pd.concat([data,pd.get_dummies(data['page_cat'], prefix='page_cat')], axis=1)
data=data.drop(columns=['page_cat'])
#'post_promotion' may be a binary category try to get dummy ?
#df=pd.concat([df,pd.get_dummies(df['post_promotion'], prefix='post_promotion')], axis=1)
#df=df.drop(columns=['post_promotion'])

In [5]:
def normalize(df):
    return (df-np.mean(df))/np.std(df, ddof=1)

# learning

In [6]:
def custom_RMSE(target, pred, gamma, w):
    return np.sqrt(
        np.sum((target-pred)**2)/len(pred)+gamma*np.linalg.norm(w)
    )

def get_para_w(old_w, df, pred, lam, gamma, terms_num):
    indices=np.random.choice(df.shape[1]-1, terms_num, replace=False)
    old_w=old_w+2*lam*(
          (np.dot((df.loc[indices,'target']-pred[indices]).values,df.iloc[indices,:-1].values))/len(pred)
        ), indices
    return old_w

def get_para_w0(old_w0, df, pred, lam, gamma, indices):
    return old_w0+2*lam*(
        (np.sum(df.loc[indices,'target']-pred[indices]))/len(pred)
        )

def gradient_descend(df, lam, gamma, terms_num, max_iter):
    df=df.reset_index(drop=True)
    w=np.array([0]*(df.shape[1]-1)) #one column is the target
    w0=0
    prediction=np.dot(df.iloc[:,:-1],w)+w0
    minRMSE=custom_RMSE(df['target'], prediction, gamma, w)
    params=np.append(w,w0)

    curr_err=10
    err=10e-3
    
    iter_num=0
    while (curr_err>err) & (iter_num<max_iter) :    
        iter_num+=1
        w, indices=get_para_w(w, df, prediction, lam, gamma, terms_num)
        w0=get_para_w0(w0, df, prediction, lam, gamma, indices)
        curr_err=prediction.copy() #old prediction
        prediction=np.dot(df.iloc[:,:-1],w)+w0
        curr_err=np.linalg.norm(curr_err-prediction)#||y(k+1)-y(k)||
        if custom_RMSE(df['target'], prediction, gamma, w)<minRMSE:
            minRMSE=RMSE_arr[-1]
            params=np.append(w,w0)
            best_pred=prediction
    print(iter_num)

    if iter_num==max_iter:
        print('The maximum number of iterations has been reached.')
        
    return params

In [13]:
#set algo params:
lam=2
gamma=2
terms_num=10
max_iter=10e5

# Cross-val

In [8]:
def custom_R2(target, pred):
    return 1-(np.sum((target-pred)**2))/(np.sum((target-np.mean(target))**2))

In [14]:
res=pd.DataFrame()

parts=range(1,6)
idxs=[round(data.shape[0]/5)*(i-1) for i in range(1,6)]
idxs.append(data.shape[0])

for part in parts:
    test_part=data[idxs[part-1]:idxs[part]]
    train_part=data.drop(data.index[idxs[part-1]:idxs[part]])
    #features standartization (xi-mean)/std_err
    train_part.iloc[:,:-1]=normalize(train_part.iloc[:,:-1])
    params=gradient_descend(train_part, lam, gamma, terms_num, max_iter) #model
    test_pred=np.dot(test_part.iloc[:,:-1],params[:-1])+params[-1] #get prediction for test part    
    RMSE=custom_RMSE(test_part['target'], test_pred, gamma, params[:-1])
    r2=custom_R2(test_part['target'], test_pred)
    params=np.insert(params, 0 , [RMSE,r2])
    res[part]=params

1
1
1
1
1


In [15]:
idxs=np.array(['w'+str(i) for i in range(1,res.shape[0]-2)])
idxs=np.append(idxs,'w0')
idxs=np.insert(idxs, 0, ['RMSE', 'r2'])
res.index=idxs

res['mean']=np.mean(res, axis=1)
res['std']=np.std(res.iloc[:,:-1], ddof=1, axis=1)

In [16]:
res

Unnamed: 0,1,2,3,4,5,mean,std
RMSE,40,35,38,36,28,35.4,4.560702
r2,0,0,0,0,0,0.0,0.000000
w1,0,0,0,0,0,0.0,0.000000
w2,0,0,0,0,0,0.0,0.000000
w3,0,0,0,0,0,0.0,0.000000
w4,0,0,0,0,0,0.0,0.000000
w5,0,0,0,0,0,0.0,0.000000
w6,0,0,0,0,0,0.0,0.000000
w7,0,0,0,0,0,0.0,0.000000
w8,0,0,0,0,0,0.0,0.000000


In [12]:
np.sum(res.iloc[2:]['std'])

0.0