In [1]:
import pandas as pd
import numpy as np
import datetime
import sklearn
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from scipy.special import boxcox1p
from scipy.special import inv_boxcox1p
import dask.dataframe as dd

In [2]:
def find_nearest(array, value):
    idx = (np.abs(array - value)).argmin()
    return array[idx]

def data_by_bins(y_test, y_preds, bins=20):   
    y_test = pd.Series(y_test)
    y_preds = pd.Series(y_preds)  
    
    intervals=list()
    intervals.append(-np.inf)   
    
    print("Computing intervals")
    for i in tqdm(range(1, bins)):
        intervals.append(y_preds.quantile(i / bins))        
    intervals.append(np.inf)
    
    interval_data=list()
    print("Collecting intervals data")    
    for i in tqdm(range(len(intervals)-1)):       
        y_preds_interval_index = y_preds[(y_preds >= intervals[i]) & (y_preds <= intervals[i+1])].index 
        interval_data.append(np.array(y_test[y_preds_interval_index]))  
        
    return interval_data,intervals

In [3]:
test_intervals=pd.read_excel("data/Датасеты/test_intervals.xlsx", header=0)
test_intervals.start=pd.to_datetime(test_intervals.start)
test_intervals.finish=pd.to_datetime(test_intervals.finish)

In [4]:
y_fact_test=pd.DataFrame()
y_pred_test=pd.DataFrame()
for i in tqdm(range(4,10)):
    y_p=pd.read_csv(f"pred_data/y_test_exg_{i}.csv")
    y_fact_test=pd.concat([y_fact_test,y_p],axis=0)
    del y_p
    y_t=pd.read_csv(f"pred_data/y_preds_exg_{i}.csv")
    y_pred_test=pd.concat([y_pred_test,y_t],axis=0)
    del y_t
    
y_pred_test["0"]=boxcox1p(inv_boxcox1p(y_pred_test["0"], 0.7)/60, 0.7)
y_fact_test.y=boxcox1p(inv_boxcox1p(y_fact_test.y, 0.7)/60, 0.7)

  0%|          | 0/6 [00:00<?, ?it/s]

In [5]:
interval_data, intervals=data_by_bins(y_fact_test.y, y_pred_test["0"], bins=20) 

Computing intervals


  0%|          | 0/19 [00:00<?, ?it/s]

Collecting intervals data


  0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
def make_intervals():
    w=list()
    for i in range(test_intervals.shape[0]):
        w.append(test_intervals.finish.iloc[i].timestamp()-test_intervals.iloc[i].start.timestamp())
    return test_intervals.start.tolist(),boxcox1p(np.array(w)/60, 0.7)

In [7]:
def find_tm(M3_vector, M3_counts, msg):
    names=M3_vector[M3_vector==2].index
    if len(names)==0:
        tm=M3_counts.nlargest(1).index[0]
    if len(names)==1:
        tm=names[0]
    if len(names)>1:
        u=msg.НАЗВАНИЕ_ТЕХ_МЕСТА.tolist()     
        tm=np.random.choice(u)
    return tm

In [8]:
def find_prob(model_prediction, horizon, intervals, interval_data):  
    for i in range(len(intervals)):
        if model_prediction < intervals[i]:              
            break       
    return (interval_data[i-1] <= horizon).sum() / interval_data[i-1].shape[0]

def calculate_prob_by_interval(interv, horizons, y_preds, M3_preds, M3_counts, msg, interval_data, intervals, exg_number):
    probs=list()
    tms=list()
    print("Computing probabilities exg № ", exg_number)   
    for i in tqdm(range(len(interv))):
        t=interv[i]-d_t
        nearest_t=find_nearest(y_preds.date, t)
        nearest_t_M3=find_nearest(M3_preds.DT, t)  
        prediction=y_preds[f"reg_preds_exg_{exg_number}"][y_preds.date==nearest_t].iloc[0]
        M3_vector=M3_preds[M3_preds.DT==nearest_t_M3].iloc[0]
        tm=find_tm(M3_vector, M3_counts, msg)
        #prediction=y_preds[f"reg_preds_exg_{exg_number}"][w.index]
        prediction_to_minutes=boxcox1p(inv_boxcox1p(prediction, 0.7)/60, 0.7)

        p=find_prob(prediction_to_minutes, horizons[i], intervals, interval_data) 
        #print(p,prediction_to_minutes)  
        probs.append(p)
        tms.append(tm)
    return probs, tms

In [9]:
def calculating_predictions(thresholder, y_preds, M3_preds, M3_counts, msg, exg_number):   
    x_test_intervals, horizons = make_intervals()  
    p_M1, tms = calculate_prob_by_interval(x_test_intervals, horizons, y_preds, M3_preds, M3_counts, msg, interval_data, intervals, exg_number)
    res_predictions=(np.array(p_M1)>thresholder).astype(int)    
    return res_predictions, p_M1, tms

In [10]:
results = []
thresholder=[0.0048,0.005,0.0038,0.0053,0.0053,0.0048]

delta=(test_intervals.finish-test_intervals.start).mean().seconds
delta_minutes=delta/60
d_t=datetime.timedelta(seconds=delta_minutes/2)

messages=pd.read_excel("data/Датасеты/messages.xlsx", header=0)

for p in [4, 5, 6, 7, 8, 9]:
    msg=messages[messages.ИМЯ_МАШИНЫ== f"ЭКСГАУСТЕР А/М №{p}"]
    
    y_preds = pd.read_csv(f"submitions/reg_preds_exg_{p}.csv")
    y_preds.date=pd.to_datetime(y_preds.date)    
    
    y_train = dd.read_parquet('data/Датасеты/y_train.parquet', engine="pyarrow") 
    y_cols=list()
    for i in range(len(y_train.columns)):
        if f'Y_ЭКСГАУСТЕР А/М №{p}' in y_train.columns[i]:
            y_cols.append(y_train.columns[i])        
    M3_counts=y_train[y_cols].sum().compute()
    
    M3_preds = pd.read_csv(f"submitions/submit{p}_M3.csv")
    M3_preds.DT=pd.to_datetime(M3_preds.DT)
    prob_results, probs, tms=calculating_predictions(thresholder[p-4], y_preds, M3_preds, M3_counts, msg, p)

    data = pd.read_excel("data/Датасеты/test_intervals.xlsx", header=0)
    data.drop(["Unnamed: 0"], axis=1, inplace=True)
    data["machine"] = f"ЭКСГАУСТЕР А/М №{p}"
    data["pred_label"]=prob_results
    data["pred_probabilities"]=probs    
    data["tm"]=tms
    for j in range(data.shape[0]):
        if data[f"pred_label"].iloc[j] == 0:
            data[f"tm"].iloc[j]=np.nan
    data.to_csv(f"submitions/results/submition_exg_{p}.csv")

Computing probabilities exg №  4


  0%|          | 0/189 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"tm"].iloc[j]=np.nan


Computing probabilities exg №  5


  0%|          | 0/189 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"tm"].iloc[j]=np.nan


Computing probabilities exg №  6


  0%|          | 0/189 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"tm"].iloc[j]=np.nan


Computing probabilities exg №  7


  0%|          | 0/189 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"tm"].iloc[j]=np.nan


Computing probabilities exg №  8


  0%|          | 0/189 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"tm"].iloc[j]=np.nan


Computing probabilities exg №  9


  0%|          | 0/189 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"tm"].iloc[j]=np.nan


In [None]:
base_sumbit = pd.read_excel("submitions/submission_1.xlsx")
#base_sumbit.drop(["Unnamed: 0", "machine", "tm"], axis=1, inplace=True)

#results.append(data)
#results = pd.concat(results)
# results["start"] = pd.to_datetime(results.start)
# results["finish"] = pd.to_datetime(results.finish)
# results = base_sumbit.merge(results[["start", "finish", "machine"]], how="left", left_on=["start", "finish"], right_on=["start", "finish"])
# results["tm"] = None
# results.to_excel("submission_1.xlsx")

def find_tm_v2(M3_vector, M3_counts):
    names=M3_vector[M3_vector==2].index
    if len(names)==0:
        tm=M3_counts.nlargest(1).index[0]
    if len(names)==1:
        tm=names[0]
    if len(names)>1:
        for i in range(len(M3_counts)):
            if M3_counts.nlargest(M3_counts.shape[0]).index[i] in names:
                break
        tm=M3_counts.nlargest(M3_counts.shape[0]).index[i]
    return tm