# Feature selection

In [None]:
# Core data libraries used throughout the notebook.
import pandas as pd
import numpy as np
import warnings
# Suppresses every Python warning for the rest of the session.
# NOTE(review): this also hides library warnings raised by later cells
# (e.g. ignored parameters or target-shape conversions) — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Colab-only: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the modelling dataset from the mounted Drive folder into a DataFrame.
dataset_path = '/content/drive/My Drive/dataset_without_outliers_grouped'
df = pd.read_csv(dataset_path)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
target_column = 'total week diff'
y = df[target_column]
X = df.drop(columns=[target_column])

# 70/30 hold-out split; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance-based feature filter with a 0.05 threshold, fitted on every row of X.
selection = VarianceThreshold(threshold=0.05)
selection.fit(X)

# Boolean mask over X's columns (True = feature kept); displayed as the cell output.
selection.get_support()

In [None]:
# Keep only the columns that passed the variance filter, selected by position.
kept_positions = selection.get_support(indices=True)
X_reduced = X.iloc[:, kept_positions]

# Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline created with scikit-learn's default settings.
model_lr = LinearRegression()

In [None]:
# Fit the linear baseline on the training split; the fitted estimator is the cell output.
# When cells run in order, X_train here comes from the split of the full matrix X,
# not from the variance-filtered X_reduced.
model_lr.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows with the fitted linear model.
pred = model_lr.predict(X=X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score, explained_variance_score

In [None]:
# Hold-out errors of the linear model: mean absolute error, mean squared error,
# and the root of the latter.
mae, mse = (
    score(y_test, pred)
    for score in (mean_absolute_error, mean_squared_error)
)
rmse = np.sqrt(mse)

In [None]:
# Print the three error metrics, one per line, framed by blank lines.
print(f'\nMAE: {mae}\nMSE: {mse}\nRMSE: {rmse}\n')

# Regression decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

# Seed the tree: without random_state the fitted tree (and therefore the reported
# errors) can differ between runs. The seed matches the one used for the data split.
model_tree = DecisionTreeRegressor(random_state=0)

# fit() returns the estimator itself, so `rgr` and `model_tree` are the same object.
rgr = model_tree.fit(X_train, y_train)

# Predictions for the held-out rows, scored in the next cell.
y_pred_tree = model_tree.predict(X_test)

In [None]:
# Hold-out errors of the regression tree (these overwrite the linear model's values).
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_tree)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
rmse = np.sqrt(mse)

In [None]:
# Print the tree's error metrics, one per line, framed by blank lines.
report_lines = ['', f'MAE: {mae}', f'MSE: {mse}', f'RMSE: {rmse}', '']
print('\n'.join(report_lines))

# XGBoost

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 70/30 hold-out split on the variance-filtered features.
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size = 0.3, random_state=0)

# Early stopping must not monitor the test rows: choosing the number of boosting
# rounds on the same data that is later used for scoring biases the reported MAE.
# A separate early-stopping set is therefore carved out of the training split.
X_fit, X_early, y_fit, y_early = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

dtrain = xgb.DMatrix(X_fit, label=y_fit)
dearly = xgb.DMatrix(X_early, label=y_early)
# Untouched test set; used only for scoring.
dval = xgb.DMatrix(X_test, label=y_test)

# 'n_estimators' was removed: the native xgb.train API ignores it (it emitted
# 'Parameters: { "n_estimators" } are not used.'). The number of trees is
# controlled by num_boost_round below.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'max_depth': 5,
    'learning_rate': 0.2,
}

# Train for up to 1000 rounds, stopping once the monitored MAE has not improved
# for 10 consecutive rounds.
model = xgb.train(params, dtrain, num_boost_round=1000, evals=[(dearly, 'validation')], early_stopping_rounds=10)

# Score on the held-out test set.
y_pred = model.predict(dval)

y_val = dval.get_label()

mae = mean_absolute_error(y_val, y_pred)
print("Mean Absolute Error:", mae)


Parameters: { "n_estimators" } are not used.

[0]	validation-mae:2624.19307
[1]	validation-mae:2335.42290
[2]	validation-mae:2135.68818
[3]	validation-mae:1975.57343
[4]	validation-mae:1876.65101
[5]	validation-mae:1786.56701
[6]	validation-mae:1720.96662
[7]	validation-mae:1671.43441
[8]	validation-mae:1618.03391
[9]	validation-mae:1575.65076
[10]	validation-mae:1548.29610
[11]	validation-mae:1520.43206
[12]	validation-mae:1495.67489
[13]	validation-mae:1474.17856
[14]	validation-mae:1465.38787
[15]	validation-mae:1453.45035
[16]	validation-mae:1440.74925
[17]	validation-mae:1425.08891
[18]	validation-mae:1416.41204
[19]	validation-mae:1402.96304
[20]	validation-mae:1396.43277
[21]	validation-mae:1388.21244
[22]	validation-mae:1386.76309
[23]	validation-mae:1378.91076
[24]	validation-mae:1375.77220
[25]	validation-mae:1374.37890
[26]	validation-mae:1368.65564
[27]	validation-mae:1367.02099
[28]	validation-mae:1362.89099
[29]	validation-mae:1361.02267
[30]	validation-mae:1356.39463
[31

In [None]:
# Index of the boosting round with the best early-stopping score.
# XGBoost reports this as a 0-based round index (the training log starts at [0]).
best_iteration = model.best_iteration

In [None]:
# Retrain with exactly the number of rounds selected by early stopping.
# best_iteration is a 0-based round index, so best_iteration + 1 rounds are needed
# to include the best round itself.
model = xgb.train(params, dtrain, num_boost_round=best_iteration + 1)

Parameters: { "n_estimators" } are not used.



In [None]:
# Score the retrained booster on the rows stored in dval.
y_val = dval.get_label()
y_pred = model.predict(dval)

mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 1211.985


# Classification: decision tree, logistic regression and voting ensemble


In [None]:
# Inspect the Python type of the first target value.
type(y.iloc[0])

numpy.float64

In [None]:
# Display the target series (pandas abbreviates the printed rows).
y

0         3200.0
1         2640.0
2         3000.0
3         1000.0
4         1600.0
           ...  
200050   -3900.0
200051     190.0
200052     500.0
200053     300.0
200054    1192.0
Name: total week diff, Length: 200055, dtype: float64

In [None]:
# Cast the target to floating point before binning.
y = y.astype(float)

In [None]:
# Bucket edges for turning the continuous target into six ordered classes.
bins = [-np.inf, -5000, -1000, 0, 1000, 5000, np.inf]

# One label per interval between consecutive edges, lowest bucket first.
labels = [
    'manje od -5000',
    '-1000 do -5000',
    '0 do -1000',
    '0 do 1000',
    '1000 do 5000',
    'preko 5000',
]

# Assign every target value to its bucket.
categorized_y = pd.cut(y, bins, labels=labels)

In [None]:
# Class frequencies of the binned target, most frequent class first.
categorized_y.value_counts()

0 do 1000         88221
1000 do 5000      56863
preko 5000        23000
-1000 do -5000    11959
0 do -1000        10984
manje od -5000     9028
Name: total week diff, dtype: int64

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier

# Seed the tree so the cross-validated accuracy is repeatable between runs.
model_tree = DecisionTreeClassifier(random_state=0)
model_lr = LogisticRegression(solver='liblinear')

# Hard-voting ensemble of the two base classifiers.
model = VotingClassifier(voting='hard', estimators=[('tree', model_tree), ('lr', model_lr)])

# A dedicated loop variable is used on purpose: iterating with `model` would
# rebind the ensemble's name on every pass.
for clf, label in zip([model_tree, model_lr, model], ['Tree', 'LR', 'Voting']):
    # Mean accuracy over 10 cross-validation folds on the variance-filtered features.
    scores = cross_val_score(clf, X_reduced, categorized_y, cv=10, scoring='accuracy')
    print(f'{label}: {scores.mean()}')

Tree: 0.7296331183124443
LR: 0.5383412779215473
Voting: 0.5257388576978519


# Support vector regression (SVR)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
# Check the shape of the training target before it is reshaped for scaling.
y_train.shape

(140038,)

In [None]:
# Convert the splits to NumPy arrays; the targets are turned into single-column
# 2-D arrays so they can be passed through a scaler.
X_train, X_test = np.array(X_train), np.array(X_test)
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

In [None]:
# Standardize features and target with statistics learned from the TRAINING split only.
# Fitting separate scalers on the test split would put train and test on different
# scales and leak test-set statistics into preprocessing.
sc_X_train = StandardScaler()
sc_y_train = StandardScaler()

# Kept so existing references keep working; they now point at the train-fitted scalers.
sc_X_test = sc_X_train
sc_y_test = sc_y_train

X_train = sc_X_train.fit_transform(X_train)
y_train = sc_y_train.fit_transform(y_train)

# transform (not fit_transform): reuse the training mean and standard deviation.
X_test = sc_X_test.transform(X_test)
y_test = sc_y_test.transform(y_test)

In [None]:
# Support vector regressor with an RBF kernel.
regressor = SVR(kernel = 'rbf')
# The target was reshaped to a column for scaling; flatten it back to 1-D,
# which is the shape the estimator expects for y.
regressor.fit(X_train, np.ravel(y_train))

In [None]:
# Predictions for the test rows. The regressor was fitted on a scaled target,
# so these values are on that scaled target's scale, not in original units.
y_pred = regressor.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Report errors in the target's original units so they are comparable with the
# other models. Predictions are on the scale of sc_y_train (the model was trained
# on targets scaled by it); y_test was scaled by sc_y_test. Each is mapped back
# with the scaler that produced it.
y_test_orig = sc_y_test.inverse_transform(np.asarray(y_test).reshape(-1, 1)).ravel()
y_pred_orig = sc_y_train.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).ravel()

mse = mean_squared_error(y_test_orig, y_pred_orig)
mae = mean_absolute_error(y_test_orig, y_pred_orig)
r2 = r2_score(y_test_orig, y_pred_orig)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)

Mean Squared Error: 0.5713721176226559
Mean Absolute Error: 0.2411497351595937
R^2 Score: 0.42862788237734384


# Neural networks

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# Fresh 70/30 split of the variance-filtered features (seed 42 for this section).
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.3, random_state=42
)

# Learn the standardization from the training rows, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
import tensorflow as tf

# Build the network explicitly. Without this, `model` still refers to the
# scikit-learn VotingClassifier created in the classification section, which has
# no compile() method, so the cell fails on a fresh run.
# NOTE(review): the intended architecture is not present in the notebook; the layer
# sizes below are a simple starting point — replace with the original design if known.
model = Sequential([
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),  # single linear output for regression
])

# Optimize mean absolute error with Adam at learning rate 0.001.
model.compile(loss='mean_absolute_error', optimizer=tf.keras.optimizers.Adam(0.001))
# Stop when the monitored validation metric has not improved for 10 epochs.
# NOTE(review): the test split doubles as validation data here, so it influences
# when training stops — consider a separate validation split.
early_stopping = EarlyStopping(patience=10, verbose=1)
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=50, batch_size=64, callbacks=[early_stopping])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score, explained_variance_score

# Score the network on the held-out rows.
# (A stray bare `58` literal that sat in the middle of this cell was removed;
# it had no effect on the results.)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('''
MAE: {}
MSE: {}
RMSE: {}
R2: {}
'''.format(mae,mse,rmse, r2))

# Random forest


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


forest = RandomForestRegressor(n_jobs=None, random_state=42)
# Candidate hyperparameters explored by the grid search.
parameters = {'n_estimators': [10, 20],'max_features': [1, 2, 4], 'min_samples_leaf': [3, 5, 7, 9],'max_depth': [5,10,15]}
# 'accuracy' is a classification metric and cannot score continuous targets, so
# every candidate received a non-finite score and the "best" result was just the
# first grid point. Use a regression scorer; it is negated because GridSearchCV
# maximizes the score.
grid = GridSearchCV(forest, parameters, cv=5, scoring='neg_mean_absolute_error')
# Tune on the training split only: it is the data the final forest is fitted on,
# and the test rows stay unseen during model selection.
grid.fit(X_train, y_train)
print('Best param: ', grid.best_params_)

In [None]:
# Build the final forest from the grid-search winner instead of hard-coded values,
# so the model always reflects what the search actually selected.
forest = RandomForestRegressor(n_jobs=None, random_state=42, **grid.best_params_)

In [None]:
# Fit the forest on the training split and predict the held-out rows in one chain
# (fit returns the fitted estimator).
y_pred = forest.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score, explained_variance_score

# Hold-out errors of the random forest.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# One metric per line, framed by blank lines.
print('', f'MAE: {mae}', f'MSE: {mse}', f'RMSE: {rmse}', '', sep='\n')