In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics

In [3]:
sns.set_theme(context='notebook', style='whitegrid', palette='Set2')

In [4]:
np.random.seed(7)

In [5]:
data = pd.read_csv('Data_final.csv')

In [6]:
y = data.shares
x = data.drop(labels=['shares'], axis =1)

In [7]:
x_train_validation, x_test, y_train_validation, y_test = model_selection.train_test_split(x, y, test_size=0.3, random_state=7)

In [8]:
x_train, x_validation, y_train, y_validation = model_selection.train_test_split(x_train_validation, y_train_validation, train_size=0.7, random_state=7)

In [9]:
scaler = preprocessing.StandardScaler()

In [10]:
scaler.fit(x_train)

In [11]:
x_train=scaler.transform(x_train)
x_validation = scaler.transform(x_validation)
x_train_validation = scaler.transform(x_train_validation)
x_test=scaler.transform(x_test)

In [12]:
#n_neighbors = list(range(5,50)) 
weights = ['uniform', 'distance']
metric = ['minkowski','euclidean','manhattan']

best_mae = 0
best_params = {'n_neighbors' : 4,
              'weights' : ['uniform'],
              'metric' : ['minkowski']}

for i in range(30,80):
    for w in weights:
        for m in metric:
            model = KNeighborsRegressor(n_neighbors=i, weights = w, metric = m)
            model.fit(x_train, y_train)
            y_val_predict = model.predict(x_validation)
            r2 = metrics.r2_score(y_validation, y_val_predict)
            mae = metrics.mean_absolute_error(y_validation, y_val_predict)
            #print(accuracy)
            if mae > best_mae:
                best_score = r2
                best_mae = mae
                best_params['n_neighbors'] = i
                best_params['weights'] = w
                best_params['metric'] = m
           # print(best_params)
print('Best mean absolute error is ', best_mae)
print('Best r2 is ', best_score)
print('Best param is ', best_params)

Best mean absolute error is  444.3705338457441
Best r2 is  0.057131265375782725
Best param is  {'n_neighbors': 41, 'weights': 'uniform', 'metric': 'manhattan'}


In [13]:
final_model_KNN = KNeighborsRegressor(
                n_neighbors= best_params['n_neighbors'], weights=best_params['weights'], metric=best_params['metric'])

In [14]:
final_model_KNN.fit(x_train_validation, y_train_validation)

In [15]:
y_test_predicted_KNN = final_model_KNN.predict(x_test)

In [16]:
r2KNN = metrics.r2_score(y_test, y_test_predicted_KNN)
maeKNN = metrics.mean_absolute_error(y_test, y_test_predicted_KNN)

In [17]:
print('R2 je ', r2KNN)
print('Mean absolute error je ', maeKNN)

R2 je  0.0934776718985093
Mean absolute error je  424.88404638144743


### SVM

In [18]:
Cs = [0.1, 0.3, 0.5, 1, 3, 5, 10, 15, 20]
kernels = ['linear', 'rbf']

best_mae = 0

best_params = {'C' : 0, 
               'kernel' : ['linear']}

for C in Cs :
    for k in kernels:
        model = SVR(C = C, kernel=k)
        model.fit(x_train, y_train)
        y_val_predict = model.predict(x_validation)
        r2 = metrics.r2_score(y_validation, y_val_predict)
        mae = metrics.mean_absolute_error(y_validation, y_val_predict)
        if mae > best_score:
            best_score = r2
            best_mae = mae
            best_params['C'] = C
            best_params['kernel'] = k
            
print('Best mean absolute error is ', best_score)            
print('Best score is ', best_score)
print('Best param is ', best_params)

Best mean absolute error is  0.039482048697578875
Best score is  0.039482048697578875
Best param is  {'C': 20, 'kernel': 'rbf'}


In [19]:
final_model_SVR = SVR(C=best_params['C'], kernel = best_params['kernel'])

In [20]:
final_model_SVR.fit(x_train_validation, y_train_validation)

In [21]:
y_pred_SVR = final_model_SVR.predict(x_test)

In [22]:
r2SVR= metrics.r2_score(y_test, y_pred_SVR)
maeSVR = metrics.mean_absolute_error(y_test, y_pred_SVR)

In [23]:
print('R2 je ', r2SVR)
print('Mean absolute error je ', maeSVR)

R2 je  0.07440829227011192
Mean absolute error je  407.02834327129744
