In [1]:
import pandas as pd
import numpy as np

In [2]:
#from matplotlib import pyplot as plt

# dataframe preparation

In [3]:
data=pd.read_csv('data/Dataset/Dataset/Training/Features_Variant_1.csv', header=None).drop_duplicates()

In [4]:
print(data.iloc[:,37].unique()) #post promotion has only zero values we can drop this col because it isn't informative

[0]


In [5]:
#get and fit features names
features_names=np.array(['page_likes_num',#1
               'page_checkins',#2
               'page_talking_about',#3
                'page_cat',#4
                'page_statistics',#5-29 #mean, avg etc.
                'comments_num_before_base_time',#30
                'comments_num_in_last_24_hours',#31 #last day
                'comments_num_in_last_48_to_24_hours',#32 #day before last
                'comments_num_in_first_24_hours',#33
                'comments_difference_in_last_two_days', #34 (32-31)
                'base_time', #35
                'character_num_in_post', #36
                'share_num',#37
                'post_promotion', #38 binary
                'h_local', #39 This describes the H hrs, for which we have the target variable/ comments received. 
                'post_published_weekday', #40-46 This represents the day(Sunday...Saturday) on which the post was published. 
                'base_ditetime_weekday', #47-53 This represents the day(Sunday...Saturday) on selected base Date/Time. 
                'target' #54 The no of comments in next H hrs(H is given in Feature no 39).                
               ])

for index in range(5,29):
    features_names=np.insert(features_names, index, features_names[4]+'_'+str(index-4))
    
weekday=('sunday', 'monday','tuesday', 'wednesday', 'thursday', 'friday', 'saturday')    

for index in range(40,47):
    features_names=np.insert(features_names,index, features_names[39]+'_'+ weekday[index-40])
features_names=np.delete(features_names, 39)

for index in range(47,54):
    features_names=np.insert(features_names,index, features_names[46]+'_'+ weekday[index-47])
features_names=np.delete(features_names, 46)

data.columns=features_names

In [6]:
y=data.nunique(axis=0).sort_values()
print(y.head())

post_promotion                      1
post_published_weekday_sunday       2
post_published_weekday_monday       2
post_published_weekday_tuesday      2
post_published_weekday_wednesday    2
dtype: int64


In [7]:
data=data.drop(columns=['post_promotion'])

# data preparation

In [8]:
#'page_cat' to dummy because it is a category
data=pd.concat([data,pd.get_dummies(data['page_cat'], prefix='page_cat')], axis=1)
data=data.drop(columns=['page_cat'])

In [9]:
def normalize(df):
    std=np.std(df, ddof=1).replace(0, 1)
    mean=np.mean(df)
    return (df-mean)/std, mean, std

def normalize_with_params(df, mean, std):
    return (df-mean)/std

# learning

In [10]:
def custom_RMSE(target, pred, gamma, w):
    return np.sqrt(
        np.sum((target-pred)**2)/len(pred)+gamma*np.linalg.norm(w)
    )

def get_para_w(old_w, df, pred, lam, gamma, terms_num, iter_num):#, start_ind):
    indices= np.random.choice(df.shape[1]-1, terms_num, replace=False)
    #indices=np.array(range(start_ind, start_ind+terms_num))%df.shape[0]
    #start_ind=start_ind+terms_num
    old_w=old_w+2*lam*(
          (np.dot((df.loc[indices,'target']-pred[indices]).values,df.iloc[indices,:-1].values))/(len(pred))#*iter_num)
        )
    return old_w, indices#, start_ind

def get_para_w0(old_w0, df, pred, lam, gamma, indices, iter_num):
    return old_w0+2*lam*(
        (np.sum(df.loc[indices,'target']-pred[indices]))/(len(pred))#*iter_num)
        )
def get_prediction(w,w0,x_df):
    return np.dot(x_df, w)+w0

def RMSE_plot(RMSE_arr):
    plt.figure(figsize=(15,7))   
    plt.plot(RMSE_arr)
    plt.xlabel('iteration number')
    plt.ylabel('RMSE value')
    plt.title('algo scores')
    plt.show()
    
def gradient_descend(df, lam, gamma, terms_num, max_iter):
    df=df.reset_index(drop=True)
    w=np.array([0]*(df.shape[1]-1), dtype=float) #one column is the target
    w0=0.0
    prediction=get_prediction(w,w0,df.iloc[:,:-1])
    minRMSE=custom_RMSE(df['target'], prediction, gamma, w)
    #RMSE_arr=np.array([minRMSE], dtype=float)
    params=np.append(w,w0)
    best_params=params
    best_pred=prediction

    curr_err=10
    err=10e-5
    
    iter_num=0
    start_ind=0
    while (curr_err>err) & (iter_num<max_iter) :    
        iter_num+=1
        w, indices=get_para_w(w, df, prediction, lam, gamma, terms_num, iter_num)
        w0=get_para_w0(w0, df, prediction, lam, gamma, indices, iter_num)
        
        #first variant of stopping criterion:
        curr_err=prediction.copy()
        prediction=get_prediction(w,w0,df.iloc[:,:-1])
        curr_err=np.linalg.norm(curr_err-prediction)#||y(k+1)-y(k)||
        
        """
        #another stopping criterion:
        curr_err=params.copy() #old params        
        params=np.append(w,w0)
        curr_err=np.linalg.norm(curr_err-params)#||w(k+1)-w(k)||
        prediction=get_prediction(w,w0,df.iloc[:,:-1])
        """
        params=np.append(w,w0)
        #RMSE_arr=np.append(RMSE_arr,custom_RMSE(df['target'], prediction, gamma, w))
        if custom_RMSE(df['target'], prediction, gamma, w)<minRMSE:
            minRMSE=custom_RMSE(df['target'], prediction, gamma, w)
            best_params=params.copy()
            best_pred=prediction
            
    print('Iterations number is:' +str(iter_num))
    if iter_num==max_iter:
        print('The maximum number of iterations was reached.')
    print('RMSE is:'+str(minRMSE))
    print('R2 is:'+str(custom_R2(df.iloc[:,-1], best_pred)))
    
    #RMSE_plot(RMSE_arr)        
    return best_params

In [11]:
#set algo params:
lam=1e-3
gamma=2.0
terms_num=10
max_iter=1e4

# Cross-val

In [12]:
def custom_R2(target, pred):
    return 1-(np.sum((target-pred)**2))/(np.sum((target-np.mean(target))**2))

In [13]:
res=pd.DataFrame()

parts=range(1,6)
idxs=[round(data.shape[0]/5)*(i-1) for i in range(1,6)]
idxs.append(data.shape[0])

for part in parts:
    #separate dataset
    test_part=data[idxs[part-1]:idxs[part]]
    train_part=data.drop(data.index[idxs[part-1]:idxs[part]])
    #features normalization (xi-mean)/std_err:
    train_part.iloc[:,:-1], mean, std=normalize(train_part.iloc[:,:-1])#fit and transform train
    test_part.iloc[:,:-1]=normalize_with_params(test_part.iloc[:,:-1], mean, std) #transform test
    params=gradient_descend(train_part, lam, gamma, terms_num, max_iter) #model    
    test_pred=get_prediction(params[:-1],params[-1],test_part.iloc[:,:-1])
    RMSE=custom_RMSE(test_part['target'], test_pred, gamma, params[:-1])
    r2=custom_R2(test_part['target'], test_pred)
    params=np.insert(params, 0 , [RMSE,r2])
    res[part]=params

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Iterations number is:1
RMSE is:0.9999847341060957
R2 is:-0.001804612467119382
Iterations number is:10000
The maximum number of iterations was reached.
RMSE is:0.9980356093644175
R2 is:-0.18746654194775791
Iterations number is:10000
The maximum number of iterations was reached.
RMSE is:0.9985549496040738
R2 is:-0.19228291550668297
Iterations number is:6990
RMSE is:0.9991853305540828
R2 is:-0.07970641270116863
Iterations number is:10000
The maximum number of iterations was reached.
RMSE is:0.9988888113738957
R2 is:-inf


  


In [14]:
text_idxs=np.array(['w'+str(i) for i in range(1,res.shape[0]-2)])
text_idxs=np.append(text_idxs,'w0')
text_idxs=np.insert(text_idxs, 0, ['RMSE', 'r2'])
res.index=text_idxs

res['mean']=np.mean(res, axis=1)
res['std']=np.std(res.iloc[:,:-1], ddof=1, axis=1)

In [15]:
res

Unnamed: 0,1,2,3,4,5,mean,std
RMSE,1.158253,0.977361,1.096741,1.005985,0.779647,1.003597,0.144432
r2,-0.005851,0.019595,0.022527,0.015520,-0.066200,-0.002882,0.037106
w1,0.000000,0.000065,0.000065,0.000090,0.000056,0.000055,0.000033
w2,0.000000,0.000157,0.000148,0.000119,0.000139,0.000113,0.000065
w3,0.000000,0.000262,0.000258,0.000222,0.000248,0.000198,0.000112
w4,0.000000,0.000054,0.000045,0.000026,0.000050,0.000035,0.000022
w5,0.000000,0.000481,0.000367,0.000187,-0.000161,0.000175,0.000262
w6,0.000000,0.000351,0.000286,0.000154,0.000378,0.000234,0.000157
w7,0.000000,0.000282,0.000246,0.000140,0.000329,0.000199,0.000131
w8,0.000000,0.000443,0.000341,0.000178,0.000178,0.000228,0.000170


In [16]:
#this part is for comparison
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
#this part is for unnormalized data, next cell contains for normalized data
model=LinearRegression()
scores = cross_val_score(model, data.iloc[:,:-1], data.iloc[:,-1], cv=5, scoring='r2')
print('R2 scores for each fold is:')
print(scores)

R2 scores for each fold is:
[ 0.          0.          0.          0.         -0.00725797]


In [17]:
regres=np.array([])
data=data.reset_index(drop=True)
for part in parts:
    #separate dataset
    test_part=data[idxs[part-1]:idxs[part]]
    train_part=data.drop(data.index[idxs[part-1]:idxs[part]])
    #features normalization (xi-mean)/std_err:
    train_part.iloc[:,:-1], mean, std=normalize(train_part.iloc[:,:-1])#fit and transform train
    test_part.iloc[:,:-1]=normalize_with_params(test_part.iloc[:,:-1], mean, std) #transform test
    model=LinearRegression().fit(train_part.iloc[:,:-1], train_part.iloc[:,-1])    
    test_pred=model.predict(test_part.iloc[:,:-1])    
    r2=custom_R2(test_part['target'], test_pred)
    regres=np.append(regres, r2)
print('R2 for LinReg on 5 folds:')
print(regres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


R2 for LinReg on 5 folds:
[-0.0294288  -0.1127454  -0.04209446 -0.06112038 -0.00179239]


In [18]:
data.head()

Unnamed: 0,page_likes_num,page_checkins,page_talking_about,page_statistics,page_statistics_1,page_statistics_2,page_statistics_3,page_statistics_4,page_statistics_5,page_statistics_6,...,page_cat_89,page_cat_90,page_cat_91,page_cat_92,page_cat_93,page_cat_96,page_cat_100,page_cat_101,page_cat_105,page_cat_106
0,634995,0,463,-0.058081,-0.37518,-0.320153,-0.293864,-0.364892,-0.078782,-0.316374,...,0,0,0,0,0,0,0,0,0,0
1,634995,0,463,-0.058081,-0.37518,-0.320153,-0.293864,-0.364892,-0.078782,-0.316374,...,0,0,0,0,0,0,0,0,0,0
2,634995,0,463,-0.058081,-0.37518,-0.320153,-0.293864,-0.364892,-0.078782,-0.316374,...,0,0,0,0,0,0,0,0,0,0
3,634995,0,463,-0.058081,-0.37518,-0.320153,-0.293864,-0.364892,-0.078782,-0.316374,...,0,0,0,0,0,0,0,0,0,0
4,634995,0,463,-0.058081,-0.37518,-0.320153,-0.293864,-0.364892,-0.078782,-0.316374,...,0,0,0,0,0,0,0,0,0,0
