# Project for exam

## Data import

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the modelling dataset; the feature matrix X and the ' shares' target are both derived from it below.
df = pd.read_csv('../../_dataset/model.csv')

In [3]:
# Keep only the two categorical descriptors; they are one-hot encoded further down.
df_categorical = df.loc[:, ['day', 'product_category']]
df_categorical.head(5)

Unnamed: 0,day,product_category
0,tuesday,sport
1,monday,travel
2,thursday,travel
3,tuesday,travel
4,tuesday,tech


In [4]:
# Every column that is not stored as dtype `object` is treated as numerical
# (at this point the frame still contains the ' shares' target).
df_numerical = df.select_dtypes(exclude='object')
df_numerical.head(5)

Unnamed: 0,age_days,n_tokens_title,n_tokens_review,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,436,9,532,0.462264,1.0,0.594427,7,7,1,0,...,0.136364,1.0,-0.180556,-0.25,-0.125,0.5,0.5,0.0,0.5,2200
1,185,16,1012,0.446247,1.0,0.638112,4,2,3,11,...,0.1,0.9,-0.318981,-1.0,-0.05,0.05,-0.075,0.45,0.075,816
2,210,12,506,0.546559,1.0,0.693291,8,3,2,1,...,0.05,0.5,-0.292708,-0.75,-0.1,0.0,0.0,0.5,0.0,1000
3,723,11,241,0.543933,1.0,0.648649,5,4,1,1,...,0.136364,0.7,0.0,0.0,0.0,0.0,0.0,0.5,0.0,660
4,345,11,983,0.40593,1.0,0.594747,11,7,1,1,...,0.1,1.0,-0.256481,-0.8,-0.05,0.0,0.0,0.5,0.0,1000


In [5]:
# One-hot encode the categorical columns. `drop_first=True` omits the first level of
# each variable, which then serves as the reference category.
dummies = pd.get_dummies(data=df_categorical.astype(str), drop_first=True)

dummies.tail(5)

Unnamed: 0,day_monday,day_saturday,day_sunday,day_thursday,day_tuesday,day_wednesday,product_category_cleaning,product_category_entertainment,product_category_other,product_category_sport,product_category_tech,product_category_travel
27995,False,False,False,False,False,True,False,False,True,False,False,False
27996,False,False,False,False,False,True,False,False,False,False,False,False
27997,False,False,False,False,True,False,False,True,False,False,False,False
27998,False,False,False,False,False,False,False,False,False,False,False,True
27999,False,False,False,False,True,False,False,False,False,True,False,False


In [6]:
# Add log(shares + 1) as an extra column of the numerical frame.
import math  # imported here; the target cell further down also relies on `math`

df_numerical['log_shares'] = df_numerical[' shares'].map(lambda shares: math.log(shares + 1))

In [7]:
# Build the numerical feature matrix by removing the raw target and its log transform.
# The columns are dropped by name (not by position), so the result does not depend on
# where ' shares' and 'log_shares' happen to sit in the frame.
X_numerical = df_numerical.drop(columns=[' shares', 'log_shares'])

In [8]:
# Sanity check before joining: both feature blocks should have the same number of rows.
print(dummies.shape, X_numerical.shape, sep='\n')

dummies.tail(5)

(28000, 12)
(28000, 45)


Unnamed: 0,day_monday,day_saturday,day_sunday,day_thursday,day_tuesday,day_wednesday,product_category_cleaning,product_category_entertainment,product_category_other,product_category_sport,product_category_tech,product_category_travel
27995,False,False,False,False,False,True,False,False,True,False,False,False
27996,False,False,False,False,False,True,False,False,False,False,False,False
27997,False,False,False,False,True,False,False,True,False,False,False,False
27998,False,False,False,False,False,False,False,False,False,False,False,True
27999,False,False,False,False,True,False,False,False,False,True,False,False


In [9]:
# Full design matrix: dummy indicators first, then the numerical features, aligned on the row index.
X = pd.concat([dummies, X_numerical], axis='columns')
X.tail(5)

Unnamed: 0,day_monday,day_saturday,day_sunday,day_thursday,day_tuesday,day_wednesday,product_category_cleaning,product_category_entertainment,product_category_other,product_category_sport,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
27995,False,False,False,False,False,True,False,False,True,False,...,0.509524,0.2,1.0,-0.333333,-0.333333,-0.333333,0.0,0.0,0.5,0.0
27996,False,False,False,False,False,True,False,False,False,False,...,0.342929,0.1,0.5,-0.622222,-1.0,-0.166667,0.454545,0.136364,0.045455,0.136364
27997,False,False,False,False,True,False,False,True,False,False,...,0.422991,0.1,1.0,-0.404306,-1.0,-0.05,0.0,0.0,0.5,0.0
27998,False,False,False,False,False,False,False,False,False,False,...,0.308378,0.1,0.7,-0.288889,-0.9,-0.1,0.0,0.0,0.5,0.0
27999,False,False,False,False,True,False,False,False,False,True,...,0.318747,0.033333,1.0,-0.12877,-0.1875,-0.071429,0.727273,0.068182,0.227273,0.068182


### Separate Train/Test sets


In [10]:
# Regression target: log(shares + 1)
y = df[' shares'].map(lambda shares: math.log(shares + 1))

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

print(X_train.shape, X_test.shape)

(21000, 57) (7000, 57)


In [12]:
#standard scaler
from sklearn.preprocessing import StandardScaler  
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Models

In [14]:
from sklearn.model_selection import GridSearchCV
import math

def gs_regression(model, par):
    """Tune `model` with a 3-fold grid search and print train/test error metrics.

    The search is fitted on the notebook-level `X_train` / `y_train` and scored
    with negative mean absolute error. The fitted search object then predicts
    both `X_train` and `X_test`. Because the targets are on the log(shares + 1)
    scale, the MAE is additionally reported (in parentheses) after mapping
    targets and predictions back with exp(value) - 1.

    Parameters
    ----------
    model : estimator
        Regressor handed to `GridSearchCV`.
    par : dict or list of dict
        Parameter grid handed to `GridSearchCV`.

    Returns
    -------
    None
        All results are printed.
    """
    from sklearn import metrics

    gs = GridSearchCV(model, par, cv=3, scoring='neg_mean_absolute_error')
    gs = gs.fit(X_train, y_train)

    # summarize the results of our GRIDSEARCH
    print('***GRIDSEARCH RESULTS***')
    print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))

    y_pred_train = gs.predict(X_train)
    y_pred_test = gs.predict(X_test)

    # Undo the log(x + 1) transform. np.expm1 is vectorised, so it handles the
    # pandas targets and the NumPy predictions in the same way.
    y_train_exp = np.expm1(y_train)
    y_test_exp = np.expm1(y_test)
    y_pred_train_exp = np.expm1(y_pred_train)
    y_pred_test_exp = np.expm1(y_pred_test)

    # Compute each metric once; RMSE is derived from the MSE values.
    mae_train = metrics.mean_absolute_error(y_train, y_pred_train)
    mae_test = metrics.mean_absolute_error(y_test, y_pred_test)
    mae_train_exp = metrics.mean_absolute_error(y_train_exp, y_pred_train_exp)
    mae_test_exp = metrics.mean_absolute_error(y_test_exp, y_pred_test_exp)
    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    mse_test = metrics.mean_squared_error(y_test, y_pred_test)
    r2_train = metrics.r2_score(y_train, y_pred_train)
    r2_test = metrics.r2_score(y_test, y_pred_test)

    print()
    print("MAE  train %.3f (%f)  test %.3f (%f)" % (mae_train, mae_train_exp, mae_test, mae_test_exp))
    print("MSE  train %.3f              test %.3f" % (mse_train, mse_test))
    print("RMSE train %.3f              test %.3f" % (np.sqrt(mse_train), np.sqrt(mse_test)))
    print("r2   train %.3f              test %.3f" % (r2_train, r2_test))

In [15]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Single-point grid: these values are the best ones found by an earlier, wider
# search, so GridSearchCV only cross-validates this one configuration here.
param_grid = dict(
    learning_rate=[0.05],
    max_depth=[5],
    min_child_weight=[3],
    subsample=[0.7],
    colsample_bytree=[0.7],
    n_estimators=[200],
)

# XGBoost regressor with a squared-error objective
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')

# 3-fold cross-validated search scored on negative MSE, fitted on the training split
grid_search = GridSearchCV(
    estimator=xg_reg,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the selected hyperparameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Evaluate on the held-out split (all metrics are on the log-target scale)
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r_squared)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.7}
Mean Squared Error: 0.7042730284092231
Mean Absolute Error: 0.6118488208658925
R-squared: 0.17807312832215927


In [17]:
# Load the records to generate predictions for; they go through the same feature pipeline below.
df_testset = pd.read_csv('../../_dataset/predictions.csv')

In [18]:
# One indicator column per weekday, named 'day_<value>' like the training dummies
day_tmp = pd.get_dummies(df_testset['day'], prefix='day')

In [19]:
# One indicator column per product category, named 'product_category_<value>'
prod_tmp = pd.get_dummies(df_testset['product_category'], prefix='product_category')

In [20]:
# Work on a copy so the raw prediction frame `df_testset` stays untouched.
X_testset = df_testset.copy()

In [21]:
# Append the product-category and weekday indicator columns to the right of the existing columns.
X_testset = pd.concat([X_testset, prod_tmp, day_tmp], axis='columns')

In [22]:
# The raw categorical columns are now represented by their dummies, so remove them in place.
X_testset.drop(columns=['day', 'product_category'], inplace=True)

In [23]:
# Column labels of the training design matrix X, in training order.
cols = X.columns

In [24]:
# Ensure the prediction features have exactly the training columns, in the training order.
# A category that never occurs in the prediction file produces no dummy column, so a plain
# `X_testset[cols]` would raise KeyError; such an indicator is legitimately all-False.
missing_cols = [c for c in cols if c not in X_testset.columns]
unexpected_missing = [c for c in missing_cols
                      if not c.startswith(('day_', 'product_category_'))]
if unexpected_missing:
    # A missing non-dummy feature cannot be imputed safely, so fail loudly.
    raise KeyError(f"prediction data lacks required feature columns: {unexpected_missing}")
X_testset = X_testset.reindex(columns=cols, fill_value=False)

In [25]:
# Apply the scaler that was already fitted on the training split (it is not refitted here).
X_testset = scaler.transform(X_testset)

In [26]:
# The model was trained on log(shares + 1), so predictions are mapped back to shares
# with exp(pred) - 1 (np.expm1). A plain np.exp would overstate every value by one share.
pd.DataFrame(
    np.expm1(grid_search.predict(X_testset)),
    index=df_testset.index,
    columns=['target'],
).to_csv('Bulgarelli_Fistani_Sandor.csv')