In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Directory holding the competition CSVs; change it here to point the notebook
# at another copy of the data.
DATA_DIR = '../input/tabular-playground-series-apr-2022'

# Load the three CSV files that the later cells work with.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
train_labels = pd.read_csv(f'{DATA_DIR}/train_labels.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Print a preview of the train/test frames, then id ranges and the label values.
splits = {'train': train, 'train_labels': train_labels, 'test': test}

print('Train', train.head(), sep='\n')
# The trailing empty string emits the blank separator line after the preview.
print('Test', test.head(), '', sep='\n')

print('Dataset Info')
# sequence = a unique id for each sequence
shape_summary = ', '.join(f'{name} = {frame.shape}' for name, frame in splits.items())
print(f'Shape: {shape_summary}')
for name, frame in splits.items():
    sequence_ids = frame['sequence']
    print(f'Sequence numbering in {name}: from {sequence_ids.min()} to {sequence_ids.max()}')
steps = train['step']
print(f'Step numbering: from {steps.min()} to {steps.max()}')
print()
# subject = a unique id for the subject in the experiment
for name in ('train', 'test'):
    subject_ids = splits[name]['subject']
    print(f'Subject numbering in {name}: from {subject_ids.min()} to {subject_ids.max()}')
print(f'States: {np.unique(train_labels.state)}')

In [None]:
# Draw one 100-bin histogram per sensor column (sensor_00 .. sensor_12),
# laid out on a 4x4 grid of subplots.
figure = plt.figure(figsize=(16, 8))
sensor_names = [f"sensor_{idx:02d}" for idx in range(13)]
for position, sensor_name in enumerate(sensor_names, start=1):
    axis = figure.add_subplot(4, 4, position)
    # bins = the number of bins that your data will be divided into
    axis.hist(train[sensor_name], bins=100)
    axis.set_title(f"{sensor_name} histogram")

figure.tight_layout(h_pad=1.0, w_pad=0.5)
# y > 1 places the figure title above the top row of subplots.
figure.suptitle('Sensor Histograms Before Outlier Removal', y=1.02)
plt.show()

In [None]:
# List the distinct values that occur in the sensor_08 column.
print("Sensor_08: {}".format(np.unique(train['sensor_08'])))

In [None]:
# Re-draw the per-sensor histograms with the binned range limited to the
# 2nd-98th percentile of each sensor. Values outside `range` are ignored by
# plt.hist, so the lowest 2% AND the highest 2% (about 4% in total) are left
# out of the plot; `train` itself is not modified.
figure = plt.figure(figsize=(16, 8))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    # A single quantile call returns both bounds (a Series holding the two values).
    lower, upper = train[sensor_name].quantile([0.02, 0.98])
    plt.subplot(4, 4, sensor+1)
    plt.hist(train[sensor_name], bins=100, range=(lower, upper))
    plt.title(f"{sensor_name} histogram")

figure.tight_layout(h_pad=1.0, w_pad=0.5)
plt.suptitle('Sensor Histograms After Outlier Removal', y=1.02)
plt.show()

In [None]:
# Pairwise correlation between every sensor column of the training data.
# (seaborn and matplotlib are imported in the notebook's first cell.)
sensors = [col for col in train.columns if 'sensor_' in col]
plt.figure(figsize=(15, 7))

# sns.heatmap returns the matplotlib Axes it drew on, so the title can be set on it.
hm = sns.heatmap(train[sensors].corr(),
                 cmap="Blues",
                 annot=True,
                 fmt='.1f',
                 linewidths=0.05)

hm.set_title('Correlation Heatmap for Train Dataset',
             fontsize=15,
             fontweight='bold')
# Render explicitly so the cell output is the figure, not the Text repr of the title.
plt.show()

In [None]:
# Correlation heatmap restricted to a hand-picked subset of sensor columns.
corr_col = ["sensor_00","sensor_01","sensor_03","sensor_06","sensor_07","sensor_09","sensor_11"]
plt.figure(figsize=(15, 7))

# sns.heatmap returns the matplotlib Axes it drew on, so the title can be set on it.
hm = sns.heatmap(train[corr_col].corr(),
                 cmap="Blues",
                 annot=True,
                 fmt='.1f',
                 linewidths=0.05)

hm.set_title('Correlation Heatmap for highly correlated columns',
             fontsize=15,
             fontweight='bold')
# Render explicitly so the cell output is the figure, not the Text repr of the title.
plt.show()

In [None]:
# Reuse the labels already loaded into `train_labels` instead of reading the
# same CSV from disk a second time. A copy is taken so that later changes to
# `labels` cannot affect `train_labels`.
labels = train_labels.copy()

In [None]:
# https://www.kaggle.com/code/dmitryuarov/sensors-deep-analysis-0-98/notebook

import plotly as py
import plotly.graph_objs as go
import plotly.express as px

def color(x):
    """Map a numeric value to one of three color codes.

    Returns 0 when ``x <= 0.25``, 0.5 when ``x >= 0.75`` and 1 otherwise
    (values strictly between the two thresholds, or values such as NaN for
    which both comparisons are false).
    """
    if x <= 0.25:
        return 0
    if x >= 0.75:
        return 0.5
    return 1

# Per-subject summary: mean of `state` and the number of labelled sequences.
# `train` can hold many rows per sequence (it also has a `step` column), so it
# is reduced to unique (sequence, subject) pairs BEFORE merging instead of
# de-duplicating the much larger merged frame afterwards.
subject_by_sequence = train[['sequence', 'subject']].drop_duplicates()
sub_stat = (
    labels
    .merge(subject_by_sequence, on='sequence', how='left')
    .drop_duplicates()
    .groupby('subject')
    .agg({'state': ['mean', 'count']})
    .reset_index()
)
# Flatten the two-level column index: ('subject', '') -> 'subject_',
# ('state', 'mean') -> 'state_mean', ('state', 'count') -> 'state_count'.
sub_stat.columns = sub_stat.columns.map('_'.join)

# Hover text for each marker (HTML; the empty <extra> tag hides plotly's
# secondary hover box).
sub_stat['text'] = (
    'Subject - <b>' + sub_stat["subject_"].astype('str')
    + '</b> <br>State - <b>' + sub_stat["state_mean"].round(2).astype('str')
    + '</b> <br>Count - <b>' + sub_stat["state_count"].astype('str')
    + '</b> <extra></extra>'
)

# One marker per subject: x = subject id, y = mean state, size ~ sequence count.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=sub_stat['subject_'],
    y=sub_stat['state_mean'],
    mode='markers',
    marker=dict(
        size=sub_stat['state_count']*0.3,
        color=sub_stat['state_mean'].map(color),
        colorscale=[[0, '#c21b1b'], [0.5, '#21a5de'], [1, '#ffdc2b']],
        # Pin the color domain to [0, 1]. Without this plotly derives the
        # domain from the data, so if one of the three codes returned by
        # color() is absent the remaining codes land on the wrong colors.
        cmin=0,
        cmax=1,
        line=dict(width=0.1, color='black')
    ),
    hovertemplate=sub_stat['text']
))

fig.update_layout(width=1150, height=600, plot_bgcolor='white', title='Subject states',
                  title_font_size=27, title_x=0.5, title_y=0.9,
                  font_family="Calibri", font_color="black")

fig.update_yaxes(title_text='Mean state', showline=True, linecolor='#f5f2f2',
                 showgrid=True, gridwidth=1, gridcolor='#f5f2f2',
                 linewidth=2, tickfont_size=12, tickvals=[0.0, 0.25, 0.50, 0.75, 1.0])

fig.update_xaxes(title_text='Subject', showline=True, linecolor='#f5f2f2')
fig.show()

In [None]:
# Show the first five rows of the per-subject summary table.
sub_stat.head(n=5)