In [0]:
# %pip install umap matplotlib
# %restart_python 

In [0]:
from math import ceil, floor, pi

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sum as _sum
from pyspark.sql.types import *
from seaborn import color_palette
from sklearn.preprocessing import StandardScaler

In [0]:
## Load the malware-detection table and, in the same step, replace the dots in the
## "id.*" column names with underscores so that later steps do not hit errors
## caused by dotted feature names.
df = (
    spark.sql('''
               select * from default.malware_detection
               ''')
    .withColumnsRenamed(
        {
            "id.orig_h": "id_orig_h",
            "id.orig_p": "id_orig_p",
            "id.resp_h": "id_resp_h",
            "id.resp_p": "id_resp_p",
        }
    )
)

In [0]:
## Show the 5 most frequent values of every column to spot nulls or placeholder values.
## The counts are ordered explicitly: without the ordering, show(5) prints five
## arbitrary groups rather than the most common ones.
## The loop variable is deliberately not called `col`, which would shadow the
## `col` function imported from pyspark.sql.functions.
for column_name in df.columns:
    (
        df.groupBy(column_name)
        .count()
        .orderBy(F.desc("count"))
        .show(5)
    )
    print("-" * 100)


In [0]:
# Some columns appear to hold no relevant values (their nulls show up as the
# literal strings "NULL" or "-"), so they are removed from the DataFrame.
cols = [
    "uid",
    "id_orig_h",
    "id_orig_p",
    "id_resp_h",
    "id_resp_p",
    "local_orig",
    "local_resp",
    "service",
    "history",
    "tunnel_parents",
]

df = df.drop(*cols)

# df.display()

In [0]:
# One output column per input column, holding the number of rows in which that
# column is null: when() yields a non-null value only for null rows, and count()
# counts the non-null results.
null_counts = df.select(
    *(
        F.count(F.when(F.col(name).isNull(), name)).alias(name)
        for name in df.columns
    )
)

null_counts.show()

In [0]:
class AnalisisDatosExploratorio:
    """Exploratory data analysis helper built around a pandas DataFrame.

    The instance stores one DataFrame (exposed through the ``df`` property) and
    offers methods that filter, standardize, summarize and plot it.

    ``centroide``, ``recodificar``, ``bar_plot``, ``bar_plot_detail`` and
    ``radar_plot`` do not use instance state; they are static methods and can be
    called either on the class or on an instance.
    """

    def __init__(self, df=None, path=None, num=None):
        """Wrap an existing DataFrame or load one from a CSV file.

        Args:
            df: DataFrame to analyze. When given, ``path`` and ``num`` are ignored.
            path: CSV location, only used when ``df`` is None.
            num: CSV layout selector (1 or 2), only used when ``df`` is None.

        Raises:
            ValueError: if ``df`` is None and ``num`` is neither 1 nor 2.
        """
        if df is not None:
            self.__df = df
        else:
            self.__df = self.__cargarDatos(path, num)

    @property
    def df(self):
        """The DataFrame currently held by this instance."""
        return self.__df

    @df.setter
    def df(self, p_df):
        self.__df = p_df

    def analisisNumerico(self):
        """Keep only the numeric columns of the stored DataFrame."""
        self.__df = self.__df.select_dtypes(include=["number"])

    def analisisCompleto(self):
        """Replace the stored DataFrame with its ``pd.get_dummies`` encoding."""
        self.__df = pd.get_dummies(self.__df)

    def __cargarDatos(self, path, num):
        """Read a CSV file and integer-encode its object-typed columns.

        Args:
            path: CSV location passed to ``pd.read_csv``.
            num: 1 for a comma-separated file whose first column is the index,
                2 for a semicolon-separated file without an index column.

        Returns:
            The loaded DataFrame, with every ``object`` column replaced by its
            categorical codes.

        Raises:
            ValueError: if ``num`` is neither 1 nor 2.
        """
        if num == 1:
            df = pd.read_csv(path, sep=",", decimal=".", index_col=0)
        elif num == 2:
            df = pd.read_csv(path, sep=";", decimal=".")
        else:
            raise ValueError("Invalid value for 'num' parameter.")

        for nombre in df.select_dtypes(exclude=["number"]).columns:
            if df[nombre].dtype == "object":
                df[nombre] = pd.Categorical(df[nombre])
                df[nombre] = df[nombre].cat.codes

        return df

    def analisis(self):
        """Standardize the numeric columns, then print summaries and draw the plots.

        Note that this replaces the stored DataFrame: non-numeric columns are
        dropped and the remaining ones are scaled with ``StandardScaler``.
        """
        self.analisisNumerico()  # Exclude non-numeric columns
        self.__df = pd.DataFrame(
            StandardScaler().fit_transform(self.__df),
            columns=self.__df.columns,
            index=self.__df.index,
        )
        print("Dimensiones:", self.__df.shape)
        print(self.__df.head())
        print(self.__df.describe())
        self.__resumenEstadistico()
        self.__graficosBoxplot()
        self.__funcionDensidad()
        self.__histograma()
        self.__correlaciones()
        self.__graficoDeCorrelacion()

    def __resumenEstadistico(self):
        """Print the summary statistics of the stored DataFrame.

        These values used to be computed and silently discarded inside
        ``analisis``; printing them makes them visible when the method runs.
        """
        resumen = {
            "Resumen sin filas con nulos:": self.__df.dropna().describe(),
            "Media:": self.__df.mean(numeric_only=True),
            "Mediana:": self.__df.median(numeric_only=True),
            "Desviación estándar (ddof=0):": self.__df.std(numeric_only=True, ddof=0),
            "Máximo:": self.__df.max(numeric_only=True),
            "Mínimo:": self.__df.min(numeric_only=True),
            "Cuantiles:": self.__df.quantile(np.array([0, .33, .50, .75, 1]),
                                             numeric_only=True),
        }
        for titulo, valores in resumen.items():
            print(titulo)
            print(valores)

    def __graficosBoxplot(self):
        """Draw one boxplot per column of the stored DataFrame."""
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8), dpi=200)
        self.__df.boxplot(return_type='axes', ax=ax)
        plt.show()

    def __funcionDensidad(self):
        """Draw the density curve of every column on a single axes."""
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8), dpi=200)
        self.__df.plot(kind='density', ax=ax)
        plt.show()

    def __histograma(self):
        """Draw the histogram of every column on a single axes."""
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), dpi=200)
        self.__df.plot(kind='hist', ax=ax)
        plt.show()

    def __correlaciones(self):
        """Print the correlation matrix of the stored DataFrame."""
        corr = self.__df.corr(numeric_only=True)
        print(corr)

    def __graficoDeCorrelacion(self):
        """Draw the correlation matrix as an annotated heatmap on a [-1, 1] scale."""
        fig, ax = plt.subplots(figsize=(12, 8), dpi=150)
        paleta = sns.diverging_palette(220, 10, as_cmap=True).reversed()
        corr = self.__df.corr(numeric_only=True)
        sns.heatmap(corr, vmin=-1, vmax=1, cmap=paleta, square=True, annot=True, ax=ax)
        plt.show()

    @staticmethod
    def centroide(num_cluster, datos, clusters):
        """Return the column means of the rows assigned to one cluster.

        Args:
            num_cluster: cluster identifier to select.
            datos: data indexed with the boolean mask ``clusters == num_cluster``.
            clusters: cluster assignment, compared element-wise with ``num_cluster``.

        Returns:
            A one-row DataFrame holding the mean of every column.
        """
        ind = clusters == num_cluster
        return pd.DataFrame(datos[ind].mean()).T

    @staticmethod
    def recodificar(col, nuevo_codigo):
        """Return a recoded copy of ``col``.

        Args:
            col: values to recode; copied into a new ``pd.Series``.
            nuevo_codigo: mapping ``old value -> new value``.

        Returns:
            A new Series; the input is not modified.
        """
        col_cod = pd.Series(col, copy=True)
        # A single dict-based replace applies every substitution against the
        # original values. Replacing key by key would cascade: with
        # {1: 2, 2: 3} a 1 would first become 2 and then 3.
        return col_cod.replace(nuevo_codigo)

    @staticmethod
    def bar_plot(centros, labels, scale=False, cluster=None, var=None):
        """Draw one horizontal bar chart per cluster center, side by side.

        Args:
            centros: 2-D array-like of cluster centers (one row per cluster,
                one column per variable).
            labels: variable names, one per column of ``centros``.
            scale: when True, divide every column by its maximum.
            cluster: optional iterable of cluster indices to draw; all clusters
                are drawn when None.
            var: optional collection of variable names to keep.
        """
        # Work on a float copy: scaling an integer array in place would truncate.
        centros = np.array(centros, dtype=float)
        labels = np.asarray(labels)
        if scale:
            maximos = centros.max(axis=0)
            # A column whose maximum is 0 is left unscaled instead of dividing by zero.
            centros = centros / np.where(maximos != 0, maximos, 1)
        colores = color_palette()

        if var is not None:
            seleccion = np.array([etiqueta in var for etiqueta in labels], dtype=bool)
            # Each kept variable keeps the color of its original position.
            colores = [colores[i % len(colores)] for i in np.flatnonzero(seleccion)]
            centros = centros[:, seleccion]
            labels = labels[seleccion]

        # Shared x-range for all panels, always including 0.
        minimo = min(floor(centros.min()), 0)
        maximo = ceil(centros.max())

        indices = list(range(centros.shape[0])) if cluster is None else list(cluster)
        # One grid of axes; squeeze=False keeps `axes` 2-D even for a single panel.
        fig, axes = plt.subplots(1, max(len(indices), 1), figsize=(15, 8), dpi=200,
                                 squeeze=False)
        for pos, (ax, i) in enumerate(zip(axes[0], indices)):
            ax.barh(range(len(labels)), centros[i].tolist(), 1 / 1.5, color=colores)
            ax.set_xlim(minimo, maximo)
            ax.set_title('Cluster ' + str(i))
            # Variable names are shown only on the leftmost panel.
            if pos == 0:
                ax.set_yticks(range(len(labels)))
                ax.set_yticklabels(labels)
            else:
                ax.set_yticks([])

    @staticmethod
    def bar_plot_detail(centros, columns_names=[], columns_to_plot=[], figsize=(10, 7), dpi=150):
        """Draw one bar chart per variable, comparing its value across clusters.

        Args:
            centros: 2-D array of cluster centers (one row per cluster).
            columns_names: variable names, one per column of ``centros``.
            columns_to_plot: names or positional indices of the variables to
                draw; every variable is drawn when empty.
            figsize: figure size passed to matplotlib.
            dpi: figure resolution passed to matplotlib.

        The list defaults are kept for interface compatibility; they are never
        mutated.
        """
        numClusters = centros.shape[0]
        labels = ["Cluster " + str(i) for i in range(numClusters)]
        centros = pd.DataFrame(centros, columns=columns_names, index=labels)

        if len(columns_to_plot) > 0:
            if isinstance(columns_to_plot[0], str):
                columns = list(columns_to_plot)
            else:
                columns = [columns_names[i] for i in columns_to_plot]
        else:
            columns = list(columns_names)

        rows, cols = ceil(len(columns) / 2), 2
        # A single new figure, so that figsize and dpi are actually honored.
        fig = plt.figure(figsize=figsize, dpi=dpi)
        fig.subplots_adjust(hspace=1, wspace=0.5)
        for posicion, columna in enumerate(columns):
            # divmod maps the running position to (row, column) in the 2-wide grid.
            ax = plt.subplot2grid((rows, cols), divmod(posicion, cols),
                                  colspan=1, rowspan=1, fig=fig)
            sns.barplot(y=labels, x=columna, data=centros, ax=ax)

    @staticmethod
    def radar_plot(centros, labels):
        """Draw a radar chart with one polygon per cluster center.

        Every variable is rescaled across clusters to 0-100; a variable that is
        constant across clusters is drawn at 50.

        Args:
            centros: 2-D array-like of cluster centers (one row per cluster,
                one column per variable).
            labels: variable names, one per column of ``centros``.
        """
        centros = np.asarray(centros, dtype=float)

        def escalar(valores):
            rango = valores.max() - valores.min()
            if rango == 0:
                # Constant variable: fixed mid value (n / n would give NaN for zeros).
                return np.full(valores.shape, 50.0)
            return (valores - valores.min()) / rango * 100

        # Resulting shape: (variables, clusters).
        centros = np.array([escalar(valores) for valores in centros.T])

        angulos = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
        angulos += angulos[:1]  # repeat the first angle to close the polygon

        # Create the polar axes directly instead of stacking it on a cartesian one.
        fig, ax = plt.subplots(1, 1, figsize=(15, 8), dpi=200,
                               subplot_kw={"polar": True})
        ax.set_theta_offset(pi / 2)
        ax.set_theta_direction(-1)
        ax.set_xticks(angulos[:-1])
        ax.set_xticklabels(labels)
        ax.set_rlabel_position(0)
        ax.set_yticks([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
        ax.set_yticklabels(["10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "100%"],
                           color="grey", size=8)
        ax.set_ylim(-10, 100)
        for i in range(centros.shape[1]):
            valores = centros[:, i].tolist()
            valores += valores[:1]
            ax.plot(angulos, valores, linewidth=1, linestyle='solid',
                    label='Cluster ' + str(i))
            ax.fill(angulos, valores, alpha=0.3)
        ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    def __str__(self):
        return f'AnalisisDatosExploratorio: {self.__df}'


In [0]:
# Convert the Spark DataFrame to pandas, then instantiate the analysis and run it.
# `df_pandas` was previously used without being defined in this notebook, which
# raises NameError on a fresh kernel.
# NOTE: toPandas() collects every row onto the driver; sample or limit `df`
# first if the table does not fit in driver memory.
df_pandas = df.toPandas()

eda_df = AnalisisDatosExploratorio(df=df_pandas)

eda_df.analisis()