In [None]:
import pandas as pd
import numpy as np
import glob
import pyarrow as pa
import pyarrow.parquet as pq
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: show at most 50 columns and render
# floats with two decimal places.
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the parquet file into a polars DataFrame; the path is relative to the
# notebook's working directory.
df_all = pl.read_parquet('./data/2022_data_selected.parquet')

In [None]:
# Preview the first rows of the loaded frame.
df_all.head()

In [None]:
# Number of rows per distinct serial_number value.
df_all['serial_number'].value_counts()

In [None]:
# Number of rows per distinct model value; kept in a variable because a later
# cell plots it.
df_all_modelcounts = df_all['model'].value_counts()
df_all_modelcounts

In [None]:
# Display the frame using its rich notebook representation.
df_all

In [None]:
# Number of rows per distinct value of the failure column.
df_all['failure'].value_counts()

In [None]:
# Null count per column, transposed so each source column becomes one row.
df_all.null_count().transpose()

In [None]:
def get_nan_count_percent(df, divisor=None):
    """Count null values per column, as an absolute number and as a fraction.

    Counting is done with ``DataFrame.null_count``, i.e. it counts missing
    (null) entries of the polars frame.

    Arguments:
        df {polars.DataFrame} -- dataframe whose per-column null counts to compute

    Keyword Arguments:
        divisor {int/float} -- the "total" amount used for the fraction.
                               If the count for a column is n, its "percent"
                               value is n/divisor.
                               If not provided, the number of rows of ``df``
                               is used (default: {None})

    Returns:
        ret_df {polars.DataFrame} -- one row per column of ``df`` with:
                                     "column"   -- name of the column in ``df``
                                     "column_0" -- number of nulls in that column
                                     "percent"  -- column_0 / divisor (a fraction,
                                                   not multiplied by 100)
    """
    # if total count is not provided, use the number of rows
    if divisor is None:
        divisor = len(df)

    # include_header=True keeps the original column names as a "column" field;
    # a bare transpose() would drop them and leave the rows unidentifiable,
    # especially once the caller sorts the result.
    null_counts = df.null_count().transpose(include_header=True, header_name="column")

    # add the fraction column with an expression instead of wrapping a
    # pre-computed Series in pl.lit, and use with_columns (with_column is
    # deprecated in polars).
    return null_counts.with_columns((pl.col("column_0") / divisor).alias("percent"))

In [None]:
# Null count and fraction for every column of df_all, sorted ascending by the
# 'percent' column.
get_nan_count_percent(df_all).sort(by='percent')

In [None]:
# Keep only the rows whose model is exactly "ST4000DM000".
df_Seagate = df_all.filter(pl.col("model") == "ST4000DM000")

In [None]:
# Number of rows per distinct failure value within the filtered model subset.
df_Seagate['failure'].value_counts()

In [None]:
# Convert the filtered polars frame to pandas (the correlation and several
# seaborn cells below work on this pandas copy) and preview it.
df = df_Seagate.to_pandas()
df.head()

In [None]:
# Inspect the pandas dtypes produced by the polars -> pandas conversion.
df.dtypes

In [None]:
# Spearman rank correlation between the numeric columns only.
# df also carries non-numeric columns (e.g. serial_number, model), and
# DataFrame.corr raises on those in pandas >= 2.0 instead of silently dropping
# them, so the numeric/bool columns are selected explicitly first. This yields
# the same matrix older pandas versions produced by implicit dropping.
corr = df.select_dtypes(include=["number", "bool"]).corr(method="spearman")

In [None]:
# Render the correlation matrix as a heatmap.
sns.heatmap(corr)

In [None]:
# List the column names from position 4 onward.
# NOTE(review): the aggregation cell below slices from position 5, not 4 --
# confirm which offset is intended.
df_Seagate.columns[4:]

In [None]:
# Build a lazy query: group by 'failure' and take the mean of every column from
# position 5 onward. Nothing is computed until .collect() is called.
grouping = df_Seagate.lazy().groupby("failure").agg(pl.mean(df_Seagate.columns[5:]))

In [None]:
# Summary statistics for the filtered model subset.
df_Seagate.describe()

In [None]:
# Execute the lazy `grouping` query and materialize the result.
df_grouped = grouping.collect()

In [None]:
# Convert the aggregated polars frame to pandas.
# Not idempotent: re-running this cell without re-running the previous one
# fails, because a pandas DataFrame has no to_pandas method.
df_grouped = df_grouped.to_pandas()

In [None]:
# Round the per-group means to two decimals.
df_grouped = df_grouped.round(2)

In [None]:
# Display the rounded per-group means.
df_grouped

In [None]:
# One bar plot per aggregated column, comparing the mean value between the
# 'failure' groups, laid out in a two-column grid.

# The figure size must be configured BEFORE the figure is created; setting it
# afterwards (as before) only affects figures created later, so on a fresh
# kernel this grid was rendered at the default size.
sns.set(rc={'figure.figsize':(30,60)})

# Column 0 of df_grouped is the group key ('failure'); the next (up to) 32
# columns are the aggregated means that get one subplot each.
feature_cols = df_grouped.columns[1:33]
n_rows = max(1, (len(feature_cols) + 1) // 2)
fig, ax = plt.subplots(n_rows, 2, squeeze=False)

# ax.flat walks the grid row by row (left, right, next row, ...), which is the
# same placement the previous odd/even + row-counter bookkeeping produced.
for axis, col in zip(ax.flat, feature_cols):
    graph1 = sns.barplot(data=df_grouped, x='failure', y=col, ax=axis)

plt.margins(x=0.0005)

plt.show()


In [None]:
# Rows of the pandas frame whose failure flag equals 1.
df_failed = df.loc[df['failure'] == 1]

In [None]:
# Histogram of the failure column of the pandas frame.
sns.histplot(data=df, x='failure')

In [None]:
# All rows belonging to the single serial number "Z305B8PX".
df_filtered = df.loc[df['serial_number'] == "Z305B8PX"]

In [None]:
# Plot capacity_bytes against date for the single selected serial number.
# NOTE(review): df was converted to pandas before the date column is parsed in
# a later cell, so 'date' is presumably still a string here -- confirm.
sns.lineplot(data=df_filtered, x="date", y='capacity_bytes')

In [None]:
# Histogram of rows per serial number for the filtered model subset.
# The figure size is set BEFORE plotting; setting it after the plot call only
# affects figures created later.
sns.set(rc={'figure.figsize':(11.7,12.27)})
# Hand seaborn a pandas frame, like the other seaborn cells in this notebook;
# older seaborn releases cannot read a polars DataFrame directly.
sns.histplot(data=df_Seagate.to_pandas(), x='serial_number')
plt.yticks();

In [None]:
# Null count per column for the filtered model subset.
df_Seagate.null_count()

In [None]:
# Parse the 'date' strings into proper pl.Date values.
# strict=False belongs to strptime (unparseable values become null instead of
# raising). Passed to with_columns, as before, it was interpreted as a named
# expression and added a literal column called "strict" to the frame.
# The dtype guard makes the cell safe to re-run: once the column is already a
# Date, the .str namespace no longer applies.
if df_all['date'].dtype == pl.Utf8:
    df_all = df_all.with_columns(
        pl.col('date').str.strptime(pl.Date, fmt='%Y-%m-%d', strict=False)
    )

In [None]:
# Keep only the rows whose failure flag equals 1, across all models.
df_failure = df_all.filter(pl.col("failure") == 1)

In [None]:
# Display the failure rows.
df_failure

In [None]:
# Number of failure rows per model, sorted from most to fewest.
df_failure_sorted = df_failure['model'].value_counts().sort(by='counts', descending=True)

In [None]:
# Convert the per-model failure counts to pandas for plotting.
# Not idempotent: calling to_pandas again on the converted frame fails, because
# a pandas DataFrame has no to_pandas method.
df_failure_sorted =df_failure_sorted.to_pandas()

In [None]:
# Ensure the per-model failure counts are a pandas frame for plotting.
# The previous cell already converts this frame, so an unconditional second
# to_pandas() call raised AttributeError on Run All; convert only while the
# object is still a polars DataFrame.
if isinstance(df_failure_sorted, pl.DataFrame):
    df_failure_sorted = df_failure_sorted.to_pandas()


In [None]:
# Horizontal bar chart of failure rows per model.
# The figure size is set BEFORE plotting; setting it after the plot call only
# affects figures created later, which made the size depend on cell run order.
sns.set(rc={'figure.figsize':(11.7,12.27)})
sns.barplot(data=df_failure_sorted, x="counts" ,y='model',color="blue")
plt.yticks();

In [None]:
# Horizontal bar chart of total rows per model.
# The figure size is set BEFORE plotting; setting it after the plot call only
# affects figures created later, which made the size depend on cell run order.
sns.set(rc={'figure.figsize':(11.7,12.27)})
sns.barplot(data=df_all_modelcounts.to_pandas(), x="counts" ,y='model',color="blue")
plt.yticks();


In [None]:
# All rows belonging to the single serial number "ZJV3BYAY".
df_failed_example = df_all.filter(pl.col("serial_number") == "ZJV3BYAY")

In [None]:
# Order that serial number's rows by date (ascending) for time-series plots.
df_sorted = df_failed_example.sort(by='date')

In [None]:
# Display the date-ordered rows.
df_sorted

In [None]:
# One line plot per column (positions 5..36 of the frame) over time for the
# selected serial number, stacked vertically.

# Hand seaborn a pandas frame, like the other seaborn cells in this notebook;
# older seaborn releases cannot read a polars DataFrame directly.
df_sorted_pd = df_sorted.to_pandas()

# Slice the column names once instead of indexing with a hard-coded i+5 offset;
# slicing also tolerates a frame with fewer than 37 columns.
feature_cols = df_sorted_pd.columns[5:37]

fig, ax = plt.subplots(max(1, len(feature_cols)), figsize=(20,40), squeeze=False)
for axis, col in zip(ax.flat, feature_cols):
    graph1 = sns.lineplot(data=df_sorted_pd, x='date', y=col, ax=axis)
    