In [None]:
import pandas as pd
import pyarrow as pa
import numpy as np
import gc
import pyarrow.parquet as pq
import missingno as msn

import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import math

from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

from plotly.subplots import make_subplots

In [None]:
# Open a handle to the features Parquet file; a later cell reads a row group from it.
X_train = pq.ParquetFile("X_train.parquet")

In [None]:
# Open a handle to the targets Parquet file; the next cell reads a row group from it.
y_train = pq.ParquetFile("y_train.parquet")

In [None]:
# Read row group 0 of the targets file. `y_train` is rebound from the file handle to
# the table that was read, so this cell cannot be re-run without re-opening the file.
# NOTE(review): only row group 0 is loaded; if the file holds more row groups they
# are not read here — confirm this is intended.
y_train = y_train.read_row_group(0)

In [None]:
# Convert the targets table to pandas one column at a time: the 'DT' column is kept
# as converted, every other column is stored as a categorical.
end_frame = [
    y_train[name].to_pandas()
    if name == 'DT'
    else y_train[name].to_pandas().astype("category")
    for name in y_train.column_names
]

# Release the source table before assembling the frame, then drop the column list
# and ask the garbage collector to reclaim the memory.
del y_train
y_train = pd.concat(end_frame, axis=1)
del end_frame
gc.collect()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the targets frame.
y_train.info()

In [None]:
# Read row group 0 of the features file into a table.
# NOTE(review): only row group 0 is loaded; if the file holds more row groups they
# are not read here — confirm this is intended.
table = X_train.read_row_group(0)

In [None]:
# Convert the features table to pandas one column at a time: the 'DT' column is kept
# as converted, every other column is cast to float32.
end_frame = [
    table[name].to_pandas()
    if name == 'DT'
    else table[name].to_pandas().astype("float32")
    for name in table.column_names
]

# Release the source table before assembling the frame, then drop the column list
# and ask the garbage collector to reclaim the memory.
del table
df = pd.concat(end_frame, axis=1)
del end_frame
gc.collect()

In [None]:
# Preview the first rows of the features frame.
df.head()

In [None]:
# missingno bar chart over every feature column (per-column data completeness).
fig = msn.bar(df)

In [None]:
# Feature columns whose name contains the exhauster 4 marker.
ex4_columns = [name for name in df.columns if "ЭКСГАУСТЕР 4" in name]
ex4_columns

In [None]:
# Show all target column names as a plain list.
list(y_train.columns)

In [None]:
# Target columns whose name contains the exhauster 4 marker.
ex4y_columns = [name for name in y_train.columns if "Y_ЭКСГАУСТЕР А/М №4" in name]
ex4y_columns

In [None]:
# Count the target columns per 19-character name prefix, in order of first appearance.
prefixes = [name[:19] for name in y_train.columns.values]
machinery = dict.fromkeys(prefixes, 0)
for prefix in prefixes:
    machinery[prefix] += 1
machinery

In [None]:
# Count the feature columns per 12-character name prefix, in order of first appearance.
prefixes = [name[:12] for name in df.columns.values]
machinery = dict.fromkeys(prefixes, 0)
for prefix in prefixes:
    machinery[prefix] += 1
machinery

In [None]:
# Put the 'DT' timestamp column first in both selections. The membership check keeps
# the cell idempotent: running it again no longer inserts a duplicate 'DT' entry.
for selected_columns in (ex4_columns, ex4y_columns):
    if 'DT' not in selected_columns:
        selected_columns.insert(0, 'DT')

In [None]:
# missingno bar chart restricted to the selected exhauster 4 columns.
msn.bar(df[ex4_columns])

In [None]:
# Display the selected exhauster 4 columns of the features frame.
df[ex4_columns]

In [None]:
# Label columns to chart. 'DT' is the timestamp column that was prepended to
# ex4y_columns; counting its values would produce one bar per distinct timestamp
# instead of a class distribution, so it is left out here.
label_columns = [column for column in ex4y_columns if column != 'DT']

# Near-square grid. max(1, ...) keeps the arithmetic valid (no division by zero,
# at least one subplot) when no label column is selected.
num_columns = len(label_columns)
num_rows = max(1, math.ceil(math.sqrt(num_columns)))
num_cols = max(1, math.ceil(num_columns / num_rows))

# Create subplots with calculated rows and columns
fig = make_subplots(rows=num_rows, cols=num_cols)
fig.update_layout(title_font=dict(size=5))

# Iterate over each label column and add a bar plot to its own subplot
for i, column in enumerate(label_columns):
    # Occurrences of each class label; value_counts() returns them sorted by
    # count in descending order by default
    class_counts = y_train[column].value_counts()
    sorted_classes = class_counts.index.tolist()
    sorted_counts = class_counts.values.tolist()

    # Subplot position (1-based), filling the grid row by row
    row_pos = (i // num_cols) + 1
    col_pos = (i % num_cols) + 1

    # Add bar plot to the current subplot
    fig.add_trace(go.Bar(x=sorted_classes, y=sorted_counts), row=row_pos, col=col_pos)

    # The column name is shown as the x-axis title of its subplot
    fig.update_xaxes(title_text=column, row=row_pos, col=col_pos, title_font=dict(size=10))

# Update layout and display the figure
fig.update_layout(height=400 * num_rows, width=400 * num_cols, showlegend=False)
fig.show()


In [None]:
# Hourly mean of the exhauster 4 features.
# set_index('DT') returns a new frame (no assignment to the index of a slice of df)
# and moves 'DT' out of the data columns, so the timestamp is never aggregated or
# plotted as if it were a sensor feature in the cells that iterate over temp.columns.
temp = df[ex4_columns].set_index('DT')
# dropna=False is passed to the Grouper as in the original analysis.
temp = temp.groupby(pd.Grouper(freq='1h', dropna=False)).mean()

In [None]:
# Box plots of every column of the hourly frame, laid out on a grid four subplots wide.
num_cols = 4
num_features = len(temp.columns)
full_rows, leftover = divmod(num_features, num_cols)
num_rows = full_rows + (leftover > 0)  # one extra row for a partially filled last row

fig = make_subplots(rows=num_rows, cols=num_cols)

for position, feature in enumerate(temp.columns):
    # Fill the grid row by row; plotly grid positions are 1-based
    grid_row, grid_col = divmod(position, num_cols)
    fig.add_trace(
        go.Box(y=temp[feature], name=feature),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Size the figure by the grid dimensions and render it
fig.update_layout(height=400 * num_rows, width=600 * num_cols, showlegend=False)
fig.show()

In [None]:
# Histograms (normalised to probability density) of every column of the hourly frame,
# laid out on a grid four subplots wide with the column name as the subplot title.
num_cols = 4
num_features = len(temp.columns)
full_rows, leftover = divmod(num_features, num_cols)
num_rows = full_rows + (leftover > 0)  # one extra row for a partially filled last row

fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=temp.columns)

for position, feature in enumerate(temp.columns):
    # Fill the grid row by row; plotly grid positions are 1-based
    grid_row, grid_col = divmod(position, num_cols)
    fig.add_trace(
        go.Histogram(x=temp[feature], name=feature, histnorm='probability density'),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Size the figure by the grid dimensions and render it
fig.update_layout(height=400 * num_rows, width=600 * num_cols, showlegend=False)
fig.show()


In [None]:
# Hourly targets: index the selected label columns by 'DT' and keep the last
# value observed in each one-hour bin. 'DT' ends up as the index only.
y_temp = (
    y_train[ex4y_columns]
    .set_index('DT')
    .groupby(pd.Grouper(freq='1h', dropna=False))
    .last()
)

In [None]:
# Keep only the hours strictly before the cutoff, for both features and targets.
cutoff_time = pd.to_datetime("2019-06-16 13:20:00")
temp2 = temp.loc[temp.index < cutoff_time]
y_temp2 = y_temp.loc[y_temp.index < cutoff_time]

In [None]:
# One stacked subplot per column of the truncated hourly features (temp2).
# The row count and the subplot titles are both derived from temp2 itself, so every
# title matches the trace drawn under it and the grid always has room for all columns
# (previously the titles came from df.columns and the row count was fixed at 16).
series_columns = list(temp2.columns)
num_series = max(1, len(series_columns))  # make_subplots needs at least one row
fig = make_subplots(rows=num_series, cols=1, subplot_titles=series_columns)

for i, column in enumerate(series_columns):
    fig.add_trace(go.Scatter(x=temp2.index, y=temp2[column], name=column), row=i+1, col=1)

# Update layout: "Time" labels the bottom subplot, "Value" the middle one
fig.update_layout(height=4000, width=1200, title_text="Plot")
fig.update_xaxes(title_text="Time", row=num_series, col=1)
fig.update_yaxes(title_text="Value", row=(num_series + 1) // 2, col=1)

In [None]:
# One stacked subplot per column of the truncated hourly targets (y_temp2).
# The row count follows the number of target columns instead of a fixed 23, so the
# cell keeps working when the selection of target columns changes.
target_columns = list(y_temp2.columns)
num_targets = max(1, len(target_columns))  # make_subplots needs at least one row
fig = make_subplots(rows=num_targets, cols=1, subplot_titles=target_columns)

for i, column in enumerate(target_columns):
    fig.add_trace(go.Scatter(x=y_temp2.index, y=y_temp2[column], name=column), row=i+1, col=1)

# Update layout: "Time" labels the bottom subplot, "Value" the middle one
fig.update_layout(height=4000, width=1200, title_text="Plot")
fig.update_xaxes(title_text="Time", row=num_targets, col=1)
fig.update_yaxes(title_text="Value", row=(num_targets + 1) // 2, col=1)

In [None]:
# Preview the first rows of the features frame again before building the tsfresh input.
df.head()

In [None]:
# Hourly median of the exhauster 4 features, tagged with a constant series id.
# - assign() builds a new frame, so no column is written into a slice of df
#   (which raised SettingWithCopyWarning).
# - set_index('DT') moves 'DT' out of the data columns; the later reset_index()
#   can then restore it without colliding with a second 'DT' column.
xt_train = (
    df[ex4_columns]
    .assign(id_c=0)  # single id: the whole frame is treated as one series
    .set_index('DT')
    .groupby(pd.Grouper(freq='1h'))
    .median()
)

In [None]:
# Turn the hourly 'DT' index back into a column, then drop hours with missing values.
# The reset happens only while the index is still named 'DT', so running this cell a
# second time does not add a stray 'index' column to the frame.
if xt_train.index.name == 'DT':
    xt_train = xt_train.reset_index()
xt_train = xt_train.dropna()

In [None]:
# Rows strictly before 2020-01-16. Note that this rebinds `temp`, which earlier cells
# use for the hourly-mean frame.
feature_cutoff = pd.to_datetime('2020-01-16')
temp = xt_train.loc[xt_train['DT'] < feature_cutoff]

In [None]:
# Run tsfresh feature extraction on the subset: rows are ordered by 'DT' and grouped
# into series by 'id_c'.
# NOTE(review): 'id_c' is set to a constant earlier in the notebook, so this presumably
# yields features for a single series — confirm that is the intent.
extracted_features = extract_features(temp, column_sort='DT', column_id='id_c')

In [None]:
# Load the messages workbook and preview its first rows.
excel_table = pd.read_excel("messages.xlsx")
excel_table.head()

In [None]:
# NOTE(review): drop() is given an empty collection of labels and its result is not
# assigned, so this appears to remove nothing and only display the table — looks like
# leftover scratch code; confirm and delete.
excel_table.drop({})

In [None]:
# Remove the "ТЕКСТ_ГРУППЫ_КОДОВ" column. Rebinding the name (instead of inplace=True)
# together with errors="ignore" keeps the cell re-runnable: a second run no longer
# raises KeyError because the column is already gone.
excel_table = excel_table.drop(columns=["ТЕКСТ_ГРУППЫ_КОДОВ"], errors="ignore")

In [None]:
# Display the messages whose machine name is exhauster 4 (the result is not stored).
excel_table[excel_table['ИМЯ_МАШИНЫ'] == 'ЭКСГАУСТЕР А/М №4']

In [None]:
# Timestamp column plus the six exhauster 4 vibration columns used for the plot below.
vibrations = ['DT', 'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 1',
       'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 2',
       'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 3',
       'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 3. ПРОДОЛЬНАЯ.',
       'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 4',
       'ЭКСГАУСТЕР 4. ВИБРАЦИЯ НА ОПОРЕ 4. ПРОДОЛЬНАЯ.']
# Display the selected columns.
df[vibrations]

In [None]:
# Long format: one row per (timestamp, vibration column) pair, as seaborn's hue expects.
df_detector_long = df[vibrations].melt(id_vars='DT', var_name='Reading_Source', value_name='Readings')

# Only the messages recorded for the machine whose readings are plotted; shading the
# failure intervals of every machine over the exhauster 4 vibrations would be misleading.
machine_messages = excel_table[excel_table['ИМЯ_МАШИНЫ'] == 'ЭКСГАУСТЕР А/М №4']

fig, ax = plt.subplots()

# Plotting the readings
sns.lineplot(data=df_detector_long, x='DT', y='Readings', hue='Reading_Source', ax=ax, palette='tab20')

# Shade every failure interval: red for 'M1' messages (critical failures),
# orange for all other message kinds (non-critical failures)
for message_kind, failure_start, failure_end in zip(
    machine_messages['ВИД_СООБЩЕНИЯ'],
    machine_messages['ДАТА_НАЧАЛА_НЕИСПРАВНОСТИ'],
    machine_messages['ДАТА_УСТРАНЕНИЯ_НЕИСПРАВНОСТИ'],
):
    color = 'red' if message_kind == 'M1' else 'orange'
    ax.axvspan(failure_start, failure_end, facecolor=color, alpha=0.5)

# Label the figure so it can be read on its own
ax.set(title='ЭКСГАУСТЕР А/М №4', xlabel='DT', ylabel='Readings')

plt.show()
