In [None]:
import polars as pl
import pandas as pd
import preprocessing.feature_engineering as fe
import matplotlib.pyplot as plt
import seaborn as sns
import mplcyberpunk
import re

# Apply the "cyberpunk" matplotlib style (presumably registered by the mplcyberpunk import above — confirm).
plt.style.use("cyberpunk")
# NOTE(review): applied after "cyberpunk", so any rcParams that both styles define
# end up with the dark_background values — confirm this layering is intended.
plt.style.use('dark_background')

In [None]:
# Load ./data/2022_data_selected.parquet into a polars DataFrame; every later cell derives from it.
df_selected = pl.read_parquet("./data/2022_data_selected.parquet")

In [None]:
# Keep only the rows whose failure flag equals 1.
df_failure = df_selected.filter(pl.col("failure") == 1)

In [None]:
df_failure

In [None]:
# Number of failure rows per drive model.
df_all_modelcounts = df_failure.get_column("model").value_counts()
# Display the models with the most failure rows first (shown only, not stored).
df_all_modelcounts.sort("counts", descending=True)

In [None]:
# Failure rows that belong to the model ST4000DM000.
df_ST4000DM000 = df_failure.filter((pl.col("model") == "ST4000DM000"))

In [None]:
# Single-column frame holding each distinct serial number found in the
# ST4000DM000 failure rows.
failed_st4000 = df_ST4000DM000.select(
    pl.col("serial_number").unique().alias("serial_number")
)

In [None]:
# All rows recorded for the drive with serial number ZJV3BYAY.
df_ZJV3BYAY = df_selected.filter(pl.col("serial_number")=="ZJV3BYAY")

In [None]:
# Order that drive's rows by date.
df_sorted1 = df_ZJV3BYAY.sort(by='date')

SMART statistics for the hard drive with serial number ZJV3BYAY

In [None]:
# 32 stacked panels: one line plot over date for each of the 32 columns that
# start at positional index 5 of df_sorted1.
fig, ax = plt.subplots(nrows=32, figsize=(20, 40))
for i, panel in enumerate(ax):
    graph1 = sns.lineplot(data=df_sorted1, x='date', y=df_sorted1.columns[5 + i], ax=panel)

# Write the whole grid to disk.
plt.savefig("./images/ZJV3BYAY.png")

In [None]:
# All rows recorded for drive S3010LL5, then the same rows ordered by date.
df_S3010LL5 = df_selected.filter(pl.col("serial_number") == "S3010LL5")
df_sorted2 = df_S3010LL5.sort("date")



SMART statistics for the hard drive with serial number S3010LL5


In [None]:
# 32 stacked panels: one line plot over date for each of the 32 columns that
# start at positional index 5 of df_sorted2.
fig, ax = plt.subplots(nrows=32, figsize=(20, 40))
for i, panel in enumerate(ax):
    graph1 = sns.lineplot(data=df_sorted2, x='date', y=df_sorted2.columns[5 + i], ax=panel)

# Write the whole grid to disk.
plt.savefig("./images/S3010LL5.png")

In [None]:
# Scratch inspection: take the first element obtained by iterating failed_st4000
# and slice all of it — presumably the serial_number column; confirm.
list(failed_st4000)[0][:]

In [None]:
# Plain Python list of the failed ST4000DM000 serial numbers.
failed_st4000_list = failed_st4000.get_column("serial_number").to_list()

In [None]:
# ST4000DM000 drives that never failed: all rows of that model whose serial
# number is not in failed_st4000_list.
df_nofailure = (
    df_selected
    .filter(pl.col("model") == "ST4000DM000")
    .filter(~pl.col("serial_number").is_in(failed_st4000_list))
)

In [None]:
df_nofailure

In [None]:
# All rows recorded for drive S300YQM3, then the same rows ordered by date.
df_S300YQM3 = df_selected.filter(pl.col("serial_number") == "S300YQM3")
df_sorted3 = df_S300YQM3.sort("date")

# SMART statistics for the hard drive with serial number S300YQM3:
# 32 stacked panels, one line plot over date for each of the 32 columns that
# start at positional index 5 of df_sorted3.
fig, ax = plt.subplots(nrows=32, figsize=(20, 40))
for i, panel in enumerate(ax):
    graph1 = sns.lineplot(data=df_sorted3, x='date', y=df_sorted3.columns[5 + i], ax=panel)

# Write the whole grid to disk.
plt.savefig("./images/S300YQM3.png")

In [None]:
# All rows of the model ST4000DM000, regardless of the failure flag.
df_st4000 = df_selected.filter((pl.col("model") == "ST4000DM000"))

In [None]:
# Build a lookup from the ST4000DM000 rows with the project helper
# fe.create_faildate_dict — presumably drive -> failure date; confirm in
# preprocessing.feature_engineering.
st400_dict = fe.create_faildate_dict(df_st4000)

In [None]:
# Apply fe.create_faildate using the lookup built in the previous cell.
# NOTE(review): df_st4000 is both the input and the output, so running this cell
# a second time feeds the already-processed frame back into the helper.
df_st4000 = fe.create_faildate(df_st4000, st400_dict)

In [None]:
# Apply fe.create_target_classification — presumably adds the classification
# target; confirm in preprocessing.feature_engineering.
# NOTE(review): overwrites its own input, so the cell is not safe to re-run in isolation.
df_st4000 = fe.create_target_classification(df_st4000)



In [None]:
# Persist the processed ST4000DM000 frame to parquet.
df_st4000.write_parquet("./data/2022_data_ST4000DM000.parquet")

### Number of hard drives and failures per model

In [None]:
# One row per distinct (serial_number, model) combination.
df_unique_model = df_selected.select(["serial_number", "model"]).unique()

In [None]:
# Count the serial numbers per model. The count column keeps the name
# "serial_number"; later cells sort and plot by that name.
df_unique_grouped = df_unique_model.groupby("model").agg(pl.col("serial_number").count())

In [None]:
df_unique_grouped

In [None]:
# Models ordered by drive count (largest first), converted to pandas for seaborn.
df_unique_sorted = (
    df_unique_grouped
    .sort("serial_number", descending=True)
    .to_pandas()
)

In [None]:
df_unique_sorted

In [None]:
# Bar chart of the number of distinct drives per model.
# The figure size must be set before the figure is created; set afterwards it
# only affects later plots. It is written to rcParams instead of going through
# sns.set because sns.set also resets the seaborn theme, which would discard the
# matplotlib styles selected in the first cell. Later cells still inherit the size.
plt.rcParams["figure.figsize"] = (11.7, 12.27)
fig, ax = plt.subplots()
sns.barplot(data=df_unique_sorted, x="serial_number", y="model", color="blue", ax=ax)
ax.set_title("number of hard drives per model")
ax.set_xlabel("number of hard drives");

In [None]:
# Rows whose failure flag equals 1.
# NOTE(review): identical to the df_failure cell near the top of the notebook; recomputed here.
df_failure = df_selected.filter(pl.col("failure") == 1)

In [None]:
# Failure rows per model, largest count first, converted to pandas for seaborn.
df_failure_sorted = (
    df_failure.get_column("model")
    .value_counts()
    .sort("counts", descending=True)
    .to_pandas()
)

In [None]:
# Bar chart of the number of failure rows per model.
# The figure size must be set before the figure is created; set afterwards it
# only affects later plots. It is written to rcParams instead of going through
# sns.set because sns.set also resets the seaborn theme, which would discard the
# matplotlib styles selected in the first cell. Later cells still inherit the size.
plt.rcParams["figure.figsize"] = (11.7, 12.27)
fig, ax = plt.subplots()
sns.barplot(data=df_failure_sorted, x="counts", y="model", color="blue", ax=ax)
# A figure should be readable on its own, so label it like the drives-per-model chart.
ax.set_title("number of failures per model")
ax.set_xlabel("number of failures");

### Failure rate per model

In [None]:
# First attempt at combining the failure counts and the drive counts side by side (axis=1).
# NOTE(review): the next cell reassigns df_fail_per_model with a join, so this
# result is never used. keys="model" also passes a plain string where a sequence
# of keys is expected — presumably unintended; confirm and consider dropping this cell.
df_fail_per_model = pd.concat([df_failure_sorted, df_unique_sorted], keys="model", axis=1)

In [None]:
# Attach the failure counts to the per-model drive counts, matching on "model".
# Every model from df_unique_sorted is kept; models without failure rows get NaN in "counts".
failures_by_model = df_failure_sorted.set_index("model")
df_fail_per_model = df_unique_sorted.join(failures_by_model, on="model")

In [None]:
df_fail_per_model.head(10)

In [None]:
# Failures as a percentage of the drives of each model: (100 / drives) * failures.
drives_per_model = df_fail_per_model["serial_number"]
failures_per_model = df_fail_per_model["counts"]
df_fail_per_model["percentage"] = (100 / drives_per_model) * failures_per_model

In [None]:
# Drop every row that contains a missing value — presumably the models that had
# no failure rows and therefore NaN in "counts"/"percentage" after the join; confirm.
df_fail_per_model = df_fail_per_model.dropna()

In [None]:
# Highest failure percentage first.
df_fail_per_model = df_fail_per_model.sort_values("percentage", ascending=False)

In [None]:
df_fail_per_model.head(2)

Create a column with the manufacturer

In [None]:
# Manufacturer name -> patterns matched against the start of a model name.
manu_dict = {
    "Toshiba": ["TOSHIBA"],
    "Seagate": ["ST", "Seagate"],
    "HGST": ["HGST"],
    "Western Digital": ["WDC"],
    "Hitachi": ["Hitachi"],
    "Micron": ["MTFDD"],
    "Crucial": ["CT"],
}

In [None]:
# Stand-alone copies of the per-manufacturer patterns (same values as in manu_dict).
# They are not referenced by any other cell visible in this notebook.
toshiba_ex, hgst_ex, wd_ex = "TOSHIBA", "HGST", "WDC"
hitachi_ex, micron_ex, crucial_ex = "Hitachi", "MTFDD", "CT"
seagate_ex = ["ST", "Seagate"]

In [None]:
# Print each manufacturer next to its list of patterns.
for key in manu_dict:
    value = manu_dict[key]
    print(key, value)

In [None]:
def re_function(str_var, manufacturers=None):
    """Return the manufacturer whose pattern matches the start of a model name.

    Parameters
    ----------
    str_var : str
        Drive model name, e.g. ``"ST4000DM000"``.
    manufacturers : dict[str, list[str]], optional
        Mapping of manufacturer name to regular-expression patterns. Each
        pattern is tried with ``re.match`` and therefore has to match at the
        beginning of ``str_var``. Defaults to the notebook-level ``manu_dict``.

    Returns
    -------
    str or None
        The first manufacturer (in mapping order) with a matching pattern, or
        ``None`` when no pattern matches or ``str_var`` is not a string.
    """
    # Missing values (None / NaN) would make re.match raise a TypeError.
    if not isinstance(str_var, str):
        return None
    if manufacturers is None:
        manufacturers = manu_dict
    for key, values in manufacturers.items():
        if any(re.match(value, str_var) for value in values):
            return key
    return None


In [None]:
# Tag every model with the manufacturer returned by re_function.
df_fail_per_model["manufacturer"] = df_fail_per_model["model"].apply(re_function)

In [None]:
df_fail_per_model

In [None]:
# Failure percentage per model, coloured by manufacturer. Bar labels are taken
# from the "serial_number" column, which holds the drive count per model.
drive_counts = df_fail_per_model["serial_number"]
ax = sns.barplot(
    data=df_fail_per_model,
    x="percentage",
    y="model",
    hue="manufacturer",
    dodge=False,
    palette="tab20",
)
for bars in ax.containers:
    ax.bar_label(container=bars, labels=drive_counts);
mplcyberpunk.add_glow_effects()


In [None]:
# Preview a light sequential colormap. light_palette needs a valid colour; the
# empty string used before is not one, so the cell raised and stopped a
# "Run All". The cyan below is the first entry of the notebook's neon colour list.
sns.light_palette("#08F7FE", as_cmap=True)

In [None]:
# Neon colours passed as the palette of the filtered failure-rate chart.
colors = [
    "#08F7FE",  # teal/cyan
    "#FF69B4",  # pink
    "#FFFF00",  # yellow
    "#00ff41",  # matrix green
    "#584efd",  # blue-violet
]

In [None]:
# Same chart restricted to models with at least 100 drives; bar labels again
# come from the "serial_number" (drive count) column of the filtered frame.
df_min100 = df_fail_per_model[df_fail_per_model["serial_number"] >= 100]
ax = sns.barplot(
    data=df_min100,
    x="percentage",
    y="model",
    hue="manufacturer",
    dodge=False,
    palette=colors,
)
for bars in ax.containers:
    ax.bar_label(container=bars, labels=df_min100["serial_number"]);
