In [1]:
%%capture --no-stdout
import os
from scipy.stats import ks_2samp

from utils import *
from preprocessing import *
from models import *
import warnings
warnings.filterwarnings('ignore')

matplotlib.style.use('ggplot')
%matplotlib inline

In [2]:
%%capture --no-stdout
df = load_frame_for_prediction('reno')
df_time = load_date_frame('reno')

Data loaded
Data loaded


In [3]:
df_time.drop(['WOCATEGORY', 'start', 'finish'], axis=1, inplace=True)

In [4]:
data = df.merge(df_time, on="WORKORDERKEY").set_index("WORKORDERKEY")

In [5]:
data_list = remove_outliers(data)

Outliers represents 28.5% of the dataset
Outliers represents 36.5% of the dataset
Outliers represents 19.6% of the dataset
Outliers represents 19.4% of the dataset


In [6]:
def get_time_outliers(data):
    outliers=set()

    #drop when length_of_time_in_days is very low
    min_= data.length_of_time_in_days.quantile(0.03)
    outliers.update(data[data.length_of_time_in_days<min_].index)
    
    # With 2IQR rule, high drop length_of_time_in_days values 
    max_ = get_iqr_bound(data.length_of_time_in_days)
    outliers.update(data[data.length_of_time_in_days>max_].index)
    
    print(min_, max_)
    print('Outliers represents {}% of the dataset'.format(np.round(100*len(outliers)/len(data),1)))
    return outliers

def remove_time_outliers(d_list):
    res = []
    for d in d_list:
        res.append(d.drop(get_time_outliers(d)))
    return res

In [7]:
clean_list = remove_time_outliers(data_list)

1.0 176.0
Outliers represents 4.1% of the dataset
1.0 26.0
Outliers represents 12.1% of the dataset
4.0 85.0
Outliers represents 6.7% of the dataset
2.0 35.75
Outliers represents 8.5% of the dataset


# Models

### Metric = R^2

In [8]:
features_0, _, _, _ = prep_data_for_modelisation(clean_list[0])

In [9]:
target_0 = features_0['length_of_time_in_days']
features_0.drop("length_of_time_in_days", axis=1, inplace=True)

In [10]:
# Lasso Model
warnings.filterwarnings('ignore')
pg = {'alpha' : np.round(np.logspace(start=0, stop = 5, num = 20), 4)}
gs = cross_validate_model(data=features_0,target=target_0,
                     model='lasso', param_grid=pg, scale=True, scoring=None)

# Ridge Model
pg = {'alpha' : np.round(np.logspace(start=-2, stop = 5, num = 20), 4)}
gs = cross_validate_model(data=features_0,target=target_0,
                     model='ridge', param_grid=pg, scale=True, scoring=None)

############# lasso ################
Best parameters : {'alpha': 1.833}
Scores 
Train: 0.30918322410247956 % (+/- 0.060963910522547946)
Test: -5.2644603793563105 % (+/- 12.506466416425484)

############# ridge ################
Best parameters : {'alpha': 112.8838}
Scores 
Train: 0.25809570084288774 % (+/- 0.03880692566512761)
Test: -5.845080109852946 % (+/- 14.56929599718506)



# Feature importances & selection

I drop features where more than 80% of values are 0.

I then select the 20 best with Lasso (or less if less than 20 features has non null coefficients)

In [11]:
best_feat = features_0[select_lasso_k_best(features_0, target_0)]

In [12]:
# Lasso Model
warnings.filterwarnings('ignore')
pg = {'alpha' : np.round(np.logspace(start=0, stop = 2, num = 20), 4)}
gs = cross_validate_model(data=best_feat,target=target_0,
                     model='lasso', param_grid=pg, scale=True, scoring=None)

# Ridge Model
pg = {'alpha' : np.round(np.logspace(start=-2, stop = 2, num = 20), 4)}
gs = cross_validate_model(data=best_feat,target=target_0,
                     model='ridge', param_grid=pg, scale=True, scoring=None)

############# lasso ################
Best parameters : {'alpha': 1.0}
Scores 
Train: 0.3215407811143033 % (+/- 0.060445737857488635)
Test: -3.938938454848508 % (+/- 9.508608528605043)

############# ridge ################
Best parameters : {'alpha': 14.3845}
Scores 
Train: 0.318668166488461 % (+/- 0.05913109030240433)
Test: -3.5628174750698447 % (+/- 8.770232941109859)



# Results

In [13]:
scores=[]
for data,name in zip(clean_list, ['C1', 'C2', 'B3', 'B4']):
    s = get_time_result(data)
    scores.append(pd.Series(s, name=name))

In [14]:
time_results = pd.concat(scores,axis=1)

In [15]:
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

In [16]:
time_results.style.apply(highlight_max)

Unnamed: 0,C1,C2,B3,B4
Lasso,-5.41308,-0.0960799,0.325065,0.153888
Lasso with feature selection,-4.05638,-0.00212882,0.499175,0.181765
Ridge,-5.84508,-0.0273525,0.235864,0.131524
Ridge with feature selection,-3.59369,0.0143582,0.514698,0.202618


In [17]:
os.makedirs('scores', exist_ok=True)
time_results.to_csv(os.path.join('scores','time_results_wilmington.csv'))

In [18]:
# Lasso Model
warnings.filterwarnings('ignore')
pg = {'alpha' : np.round(np.logspace(start=0, stop = 5, num = 20), 4)}
gs = cross_validate_model(data=features[col],target=target,
                     model='lasso', param_grid=pg, scale=True, scoring=None)

# Ridge Model
pg = {'alpha' : np.round(np.logspace(start=-2, stop = 5, num = 20), 4)}
gs = cross_validate_model(data=features[col],target=target,
                     model='ridge', param_grid=pg, scale=True, scoring=None)

NameError: name 'features' is not defined

In [None]:
col=['TOTALLABORESTIMATEDCOST']