In [21]:
import sys
sys.path.append(r"E:\CMF\CMF_FFS/")

from src.metrics_calculation import metrics
from src.cross_val import timeseriesCVscore
from src.preprocess import preprocessing

import numpy as np
import pandas as pd
from os import listdir
from sklearn.linear_model import Lasso
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [22]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
data_dir = '../data/processed/'

## Searching for feature importance

In [24]:
y_list = []
pred_list = []
coef_list = []
for i in listdir(data_dir):
    data = pd.read_csv(data_dir + i)
    data['Equity Charge'] = [i for i in range(len(data['Equity Charge']))]
    y = data['Revenue']
    data.drop(columns=['Revenue'], inplace=True)
    x_train, x_test, y_train, y_test = train_test_split(data, y, train_size=38, shuffle=False)
    
    model = Lasso()

    model.fit(x_train, y_train)
    coef_list.append(model.coef_)
    predictions = model.predict(x_test)
    y_list.append(y_test)
    pred_list.append(predictions)
    
m1 = metrics(y_list, pred_list, one_model=False)
print('Mean MAPE =', m1[0])
print('Mean WAPE =', m1[1])
print('Mean MSE =', m1[2])

Mean MAPE = 0.48520016269903776
Mean WAPE = 0.4831421455045152
Mean MSE = 9631563.724151878


## Getting top 10 features

In [25]:
features = dict.fromkeys(range(73),0)
for i in coef_list:
    for j in np.where(i>0)[0]:
        features[j] += 1

In [26]:
feature_importance = {k: v for k, v in sorted(features.items(), key=lambda item: item[1])}

In [27]:
ind = list(feature_importance.keys())[-10:]

## Fitting with top 10 features

In [18]:
y_list = []
pred_list = []
for i in listdir(data_dir):
    data = pd.read_csv(data_dir + i)
    data['Equity Charge'] = [i for i in range(len(data['Equity Charge']))]
    y = data['Revenue']
    data.drop(columns=['Revenue'], inplace=True)
    x_train, x_test, y_train, y_test = train_test_split(data, y, train_size=38, shuffle=False)
    
    x_train = x_train.iloc[:, ind]
    x_test = x_test.iloc[:, ind]
    
    model = Lasso()

    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    y_list.append(y_test)
    pred_list.append(predictions)

m2 = metrics(y_list, pred_list, one_model=False)
print('Mean MAPE =', m2[0])
print('Mean WAPE =', m2[1])
print('Mean MSE =', m2[2])

Mean MAPE = 0.1977819346298819
Mean WAPE = 0.19342538901126866
Mean MSE = 68719.20589049453


## Fitting with top 10 features with cross validation

In [19]:
# Appending target and data
ind.append(3)
ind.append(19)

In [20]:
metr = np.array([])
names = []

for i in listdir(data_dir):
    data = pd.read_csv(data_dir + i)
    data = data.iloc[:, ind]
    mape, wape, mse = timeseriesCVscore(data, Lasso())
    metr = np.append(metr, [[mape], [wape], [mse]])
    names.append(i[:-4])
print('Mean MAPE =', metr[::3].sum() / len(metr[::3]) )
print('Mean WAPE =', metr[1::3].sum() / len(metr[1::3]) )
print('Mean MSE =', metr[2::3].sum() / len(metr[2::3]) )


Mean MAPE = 2308.8127816471942
Mean WAPE = 2131.3417438042975
Mean MSE = 1923885438186651.0
