In [1]:
import pandas as pd
import numpy as np
import sktime
import seaborn as snsc
import matplotlib.pyplot as plt
from convertcsv.import_preprocess_v4 import readcsvs
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sktime.transformations.panel.rocket import Rocket
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from sktime.classification.kernel_based import RocketClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.impute import KNNImputer
from convertcsv.get_all_metrics_with_tags import get_all_metrics_with_tags
from visualization import graphs
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sktime.datatypes import convert_to
from sktime.datatypes import MTYPE_REGISTER
from collections import Counter
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.classification.compose import ClassifierPipeline


In [2]:
# Experiment plan: set up several different pipelines and compare them.
# Things to vary, in order of significance/importance:
#   - NaN threshold

# Location of the generated metric CSVs (absolute, machine-specific path).
csv_dir = r"F:\Master\Kubernetes\sockshop\microservices-demo\query\automated\generated_csvs_4"

# Gather the metric files together with `y`, which is later used as the label vector in the splits.
file_list, y = get_all_metrics_with_tags(csv_dir)

# Read the CSVs with the NaN-threshold reduction switched off and the
# unique-column / monotonic-increasing removals switched on (per the keyword names).
initial_readings = readcsvs(
    file_list,
    reduce_NaNs_treshold=False,
    remove_unique_cols=True,
    remove_monotonic_increasing=True,
)


In [3]:
# Fill the gaps with a 1-nearest-neighbour imputer; prior testing pointed to KNN imputation as the best option.
imputer = KNNImputer(n_neighbors=1)
complete_value_set = imputer.fit_transform(initial_readings)

# Wrap the imputer output back into a frame carrying the original row index and column labels.
# NOTE(review): assumes the imputer keeps every input column - confirm no all-NaN columns reach this point.
imputed_df = pd.DataFrame(
    data=complete_value_set,
    columns=initial_readings.columns,
    index=initial_readings.index,
)

In [4]:
# Split the imputed (not yet trimmed) data into training and test X / y.
# The frame is converted to sktime's "df-list" mtype first so that whole instances,
# not individual rows, are assigned to either side of the split.
# A fixed seed keeps the partition identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    convert_to(imputed_df, to_type="df-list"), y, random_state=42
)
#X_train = convert_to(X_train, to_type="pd-multiindex")


Padding/reduction

In [5]:
#Find the length of each individual time series
def trimming(df:pd.DataFrame, y, min_percent=90):
    """Keep only the time series whose length equals the most common length.

    The first index level of ``df`` identifies an instance (one time series)
    and the second level its time points. ``y`` is assumed to hold one label
    per instance, positionally aligned with the order in which the instances
    first appear in ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Panel data indexed by a (instance, time point) MultiIndex.
    y : array-like
        One label per instance.
    min_percent : float, default 90
        Minimum share (in percent) of instances that must already have the
        most common length. With the default at most 10% of the instances
        are discarded.

    Returns
    -------
    tuple of (pd.DataFrame, np.ndarray)
        The surviving rows of ``df`` with the instance level renumbered
        0..k-1 in order of appearance, and ``y`` with the labels of the
        discarded instances removed.

    Raises
    ------
    ValueError
        If ``df`` contains no instances, or if fewer than ``min_percent``
        percent of the instances share the most common length (the exception
        argument is then the observed percentage).

    Notes
    -----
    Design rationale: padding the shorter series (e.g. with the mean at the
    front and back) would distort the general shape of the data, so series of
    a deviating length are simply purged. This costs some data but is cheap to
    do; whether purging helped can be checked later by comparing against other
    approaches. A stricter preprocessing step would additionally check that
    the most common length is also the maximum length.
    """
    # Rows per instance, ordered by first appearance so position i lines up with y[i].
    series_lengths = df.groupby(level=0, sort=False).size()
    if series_lengths.empty:
        raise ValueError("df contains no time series to trim")

    # Counter.most_common keeps the first-seen length when counts tie.
    most_common_length, most_common_count = Counter(series_lengths.tolist()).most_common(1)[0]
    percentage = (most_common_count / len(series_lengths)) * 100
    if percentage < min_percent:
        # Raise (not return) so the caller's tuple unpacking never sees an exception object.
        raise ValueError(percentage)

    has_common_length = (series_lengths == most_common_length).to_numpy()
    # Labels select rows of df; positions select entries of y. The two only
    # coincide when the instance labels happen to be exactly 0..n-1.
    kept_instances = series_lengths.index[has_common_length]
    dropped_positions = np.flatnonzero(~has_common_length)

    filtered_df = df[df.index.get_level_values(0).isin(kept_instances)]

    # Renumber the surviving instances 0..k-1 in order of appearance.
    new_instance_ids, _ = pd.factorize(filtered_df.index.get_level_values(0))
    new_index = pd.MultiIndex.from_arrays(
        [new_instance_ids, filtered_df.index.get_level_values(1)],
        names=filtered_df.index.names[:2],
    )
    trimmed = filtered_df.copy()
    trimmed.index = new_index

    return trimmed, np.delete(np.asarray(y), dropped_positions)

# Purge the series of deviating length from the imputed data, keeping the labels aligned.
(trimmed_df, trimmed_y) = trimming(imputed_df, y)


In [6]:
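# Idea (not implemented yet): remove low-information columns.
#   for each column in the dataframe:
#       if column.chaos < threshold:
#           remove the column
#
# Alternative: get the average value (after normalization) of each column;
# if the averages are very similar, remove those columns.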

In [7]:


# Convert the trimmed panel to sktime's "df-list" mtype once and reuse it for the split
# (previously the same conversion was computed twice and `converted` went unused).
converted = convert_to(trimmed_df, to_type="df-list")

# Fixed seed so that Restart & Run All reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(converted, trimmed_y, random_state=42)

# MultiIndex versions of both splits for code that needs a single DataFrame.
X_train_mi = convert_to(X_train, to_type="pd-multiindex")
X_test_mi = convert_to(X_test, to_type='pd-multiindex')

In [8]:
# from sktime.registry import all_tags
# from sktime.registry import all_estimators
# all_estimators("classifier", filter_tags={"capability:multivariate":True})

In [9]:

# Standardise the training features.
# The scaler is fitted on the training split only: the result is wrapped with
# X_train_mi's index below, so fitting on all of trimmed_df would yield a row-count
# mismatch, and it would also let test rows influence the scaling statistics.
testscale = StandardScaler()
scaled_values = testscale.fit_transform(X_train_mi)
scaled = pd.DataFrame(data=scaled_values, index=X_train_mi.index, columns=X_train_mi.columns)


In [11]:
class Try_Classifiers:
    """Run classifiers on the stored data, retrying with several data formats.

    Goal: run the various classifiers. Normalization/standardization is done
    outside of this class, so only the classifiers themselves are needed here.
    Because different estimators accept different panel-data representations,
    every call is attempted with each converter in ``self.formats`` until one
    succeeds.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training data; its columns and index are remembered so that bare
        arrays can be wrapped back into a frame.
    y_train, X_test, y_test : optional
        Training labels, test data and test labels. Stored as ``None`` when
        not supplied.
    """

    def __init__(self, X_train:pd.DataFrame, y_train=None, X_test=None, y_test=None):
        # Converters are tried in this order by _try_function_with_formats.
        self.formats = [
            self.format_dflist,
            self.format_multiindex,
            self.format_multiindex_manual,
            self.format_multiindex_dflist,
        ]
        self.X_train_cols = X_train.columns
        self.X_train_ind = X_train.index
        self.X_train = X_train
        # Always define the optional attributes so later access cannot hit AttributeError.
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # Last exception seen when every format failed (None otherwise).
        self.last_error = None

    def _try_function_with_formats(self, class_function, data):
        """Call ``class_function`` on ``data`` converted by each format in turn.

        Returns ``(result, True)`` for the first format that works and
        ``(None, False)`` if all of them raise. In the failure case the last
        exception is kept in ``self.last_error`` so it is not lost silently.
        """
        last_error = None
        for fmt in self.formats:
            try:
                result = class_function(fmt(data))
            except Exception as e:
                # Best effort: remember why this format failed and try the next one.
                last_error = e
                continue
            self.last_error = None
            return result, True
        self.last_error = last_error
        return None, False

    def format_dflist(self, input):
        """Convert ``input`` to the sktime "df-list" mtype."""
        return convert_to(input, to_type="df-list")

    def format_multiindex(self, input):
        """Convert ``input`` to the sktime "pd-multiindex" mtype."""
        return convert_to(input, to_type="pd-multiindex")

    def format_multiindex_manual(self, input):
        """Wrap ``input`` in a DataFrame using the training columns and index.

        NOTE(review): the *training* index is applied regardless of which data
        is passed in - presumably intended for arrays derived from X_train;
        confirm before relying on it for test data.
        """
        return pd.DataFrame(input, columns=self.X_train_cols, index=self.X_train_ind)

    def format_multiindex_dflist(self, input):
        """Wrap ``input`` like ``format_multiindex_manual``, then convert to "df-list"."""
        return convert_to(pd.DataFrame(input, columns=self.X_train_cols, index=self.X_train_ind), to_type="df-list")

    # Backward-compatible alias for the original (misspelled) method name.
    format_multiindex_dlflist = format_multiindex_dflist

    def run_fit_predict_single(self, class_to_use):
        """Fit ``class_to_use`` on the training data and predict the test data.

        Both steps are passed to ``_try_function_with_formats`` as callables so
        that each data format is actually attempted.

        Returns
        -------
        The predictions produced by ``class_to_use.predict`` for ``self.X_test``.

        Raises
        ------
        Exception
            If fitting or predicting fails for every data format; the
            underlying error is chained as the cause.
        """
        _, fitted = self._try_function_with_formats(
            lambda data: class_to_use.fit(data, self.y_train), self.X_train
        )
        if not fitted:
            raise Exception("Something went wrong") from self.last_error

        prediction, predicted = self._try_function_with_formats(class_to_use.predict, self.X_test)
        if not predicted:
            raise Exception("Something went wrong") from self.last_error

        return prediction


