In [2]:
import warnings

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects so long-running applies
# (used later in this notebook) render a progress bar.
tqdm.pandas()
# NOTE(review): this silences *every* warning for the whole session, which also
# hides pandas' SettingWithCopyWarning / FutureWarning — consider narrowing the
# filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

In [3]:
# Load the raw password dataset (parquet file, read through pyarrow) and
# preview the first rows.
# NOTE(review): the path is relative to the notebook's working directory.
df = pd.read_parquet("./data/passwords.gzip", engine="pyarrow")
df.head()

Unnamed: 0,password,strength
0,123456,0.172331
1,12345,0.128996
2,123456789,0.316992
3,password,0.249543
4,iloveyou,0.249543


In [4]:
# Print column dtypes, row count and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14344094 entries, 0 to 14344096
Data columns (total 2 columns):
 #   Column    Dtype  
---  ------    -----  
 0   password  object 
 1   strength  float64
dtypes: float64(1), object(1)
memory usage: 328.3+ MB


In [5]:
# Characters a password may contain: ASCII letters (either case), digits and a
# fixed set of symbols. Built once at definition time so the per-row check does
# not rebuild the set on every call.
_VALID_PASSWORD_CHARS = frozenset(
    "qwertyuiopasdfghjklzxcvbnm"
    "QWERTYUIOPASDFGHJKLZXCVBNM"
    "1234567890"
    "!@#$%^&*"
)


def isValidPassword(text: str) -> int:
    """Check whether a password satisfies the length and alphabet rules.

    A password is valid when it is a string of 4 to 64 characters (inclusive)
    made up exclusively of ASCII letters (case-insensitive), digits and the
    symbols ``!@#$%^&*``.

    Args:
    ---
        text (str): The password to be validated. Any non-string value
        (e.g. ``None`` or ``NaN`` coming from a DataFrame cell) is treated
        as invalid instead of raising.

    Returns:
    ---
        int: 1 if the password is valid, 0 otherwise.
    """
    # Missing / non-text cells cannot be valid passwords; `len()` would raise
    # on floats such as NaN, so reject them up front.
    if not isinstance(text, str):
        return 0

    if not 4 <= len(text) <= 64:
        return 0

    # Compare the raw characters against an alphabet that already contains both
    # cases. Lower-casing first would let non-ASCII characters whose lowercase
    # form is ASCII (e.g. U+212A KELVIN SIGN -> "k") slip through.
    return 1 if _VALID_PASSWORD_CHARS.issuperset(text) else 0

In [6]:
# Flag each password as valid (1) or invalid (0); tqdm reports progress.
df["IsValid"] = df["password"].progress_apply(isValidPassword)

100%|██████████| 14344094/14344094 [00:42<00:00, 339507.14it/s]


In [7]:
# Boolean mask of the rows whose password passed validation.
mark = df["IsValid"].eq(1)

# Keep only the valid rows and display the result.
clean_df = df.loc[mark]
clean_df

Unnamed: 0,password,strength,IsValid
0,123456,0.172331,1
1,12345,0.128996,1
2,123456789,0.316992,1
3,password,0.249543,1
4,iloveyou,0.249543,1
...,...,...,...
14343809,!!!!!!!!!!!!!!!!!!,0.000000,1
14343810,!!!!!!!!!!!!!!!!,0.000000,1
14343811,!!!!!!!!!!!!,0.000000,1
14343812,!!!!!!!!!,0.000000,1


In [8]:
# Print column dtypes, row count and memory usage of the filtered frame.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13644816 entries, 0 to 14343813
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   password  object 
 1   strength  float64
 2   IsValid   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 416.4+ MB


In [9]:
def describe_categorical(
    df: pd.DataFrame, columns: list[str], n: int = 25, include_missing: bool = True
) -> pd.DataFrame:
    """Generates a statistical summary of categorical variables in a DataFrame.

    For every requested column the most frequent values are collected together
    with their absolute count and their share of the total number of rows in
    ``df`` (the denominator is always ``df.shape[0]``, whether or not missing
    values are included).

    Args:
        df (pd.DataFrame): The DataFrame containing the categorical variables.

        columns (list[str]): A list of column names to include in the summary.
        If None, all columns of dtype ``category`` in the DataFrame are used.
        An empty list yields an empty summary.

        n (int, optional): The number of top values to include per column.
        Defaults to 25. ``None`` keeps every value.

        include_missing (bool, optional): Specifies whether to include missing
        values in the summary. Defaults to True.

    Returns:
        pd.DataFrame: A frame whose MultiIndex levels are ``column``, ``value``,
        ``count`` and ``percentage`` (it has no regular columns), sorted by
        that index.
    """
    index_levels = ["column", "value", "count", "percentage"]

    if columns is None:
        columns = df.select_dtypes(include=["category"]).columns.tolist()

    total = df.shape[0]

    # Collect one frame per column and concatenate once at the end. Growing a
    # frame with repeated `pd.concat` calls seeded from an empty frame is
    # quadratic and can upcast dtypes (e.g. integer values shown as floats).
    per_column = []
    for col in columns:
        values, counts = _get_categorical_stats(df, col, n, include_missing)
        stats_col = pd.DataFrame({"column": col, "value": values, "count": counts})
        stats_col["percentage"] = stats_col["count"] / total * 100
        per_column.append(stats_col)

    if per_column:
        stats = pd.concat(per_column, ignore_index=True)
    else:
        # No columns requested: build the expected schema explicitly so that
        # `set_index` below does not fail on missing labels.
        stats = pd.DataFrame(columns=index_levels)

    return stats.set_index(index_levels).sort_index()


def _get_categorical_stats(
    df: pd.DataFrame, col: str, n: int, include_missing: bool
) -> tuple[list, list[int]]:
    """Retrieves the distinct values of a column and how often each occurs.

    Args:
        df (pd.DataFrame): The DataFrame containing the categorical variable.
        col (str): The name of the categorical column.
        n (int): The number of top values to retrieve; ``None`` keeps all.
        include_missing (bool): Specifies whether missing values are counted
        as a value of their own.

    Returns:
        tuple[list, list[int]]: Two parallel lists: the values (of whatever
        type the column holds) and their corresponding counts, in the order
        produced by ``Series.value_counts``.
    """
    # Count once and derive both lists from the same result; each
    # `value_counts` call is a full pass over the column.
    frequencies = df[col].value_counts(dropna=not include_missing)
    values = frequencies.index.tolist()
    counts = frequencies.values.tolist()

    if n is not None:
        values = values[:n]
        counts = counts[:n]
    return values, counts

In [10]:
# Count and share of valid vs. invalid passwords in the full (unfiltered) frame.
describe_categorical(df, columns=["IsValid"])

column,value,count,percentage
IsValid,0.0,699278,4.875024
IsValid,1.0,13644816,95.124976


In [11]:
def plot_graph_categorical(data: pd.Series) -> None:
    """Shows a bar chart and a pie chart of a categorical variable side by side.

    The distinct values of ``data`` are counted and ordered by descending
    count; the left panel shows the counts as bars, the right panel the same
    counts as percentage slices. The figure is displayed, nothing is returned.

    Args:
        data (pd.Series): The categorical data to be plotted. Its ``name`` is
        used in the titles and the x-axis label.
    """
    var_name = data.name

    # Distinct values with their frequencies, most frequent first.
    labels, freqs = np.unique(data, return_counts=True)
    order = np.argsort(-freqs)
    labels, freqs = labels[order], freqs[order]

    bar_trace = go.Bar(x=labels, y=freqs, name="Count")
    pie_trace = go.Pie(
        labels=labels,
        values=freqs,
        textinfo="percent",
        hoverinfo="label+percent",
    )

    fig = sp.make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(f"Count of {var_name}", f"Distribution of {var_name}"),
        specs=[[{"type": "bar"}, {"type": "pie"}]],
    )

    # Left panel: bar chart with labelled axes.
    fig.add_trace(bar_trace, row=1, col=1)
    fig.update_xaxes(title_text=var_name, row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)

    # Right panel: pie chart.
    fig.add_trace(pie_trace, row=1, col=2)

    # Shared styling applied to both traces: white text and white outlines.
    fig.update_traces(
        textfont={"color": "#ffffff"},
        marker={"line": {"color": "#ffffff", "width": 1}},
    )

    fig.update_layout(
        title_text=f"Univariate Analysis of the {var_name} Variable",
        height=500,
        width=1000,
        showlegend=False,
    )

    fig.show()

In [12]:
# Visualise the valid/invalid split of the full (unfiltered) frame.
plot_graph_categorical(df["IsValid"])

In [13]:
# `clean_df` is a filtered selection of `df`, so mutating it in place triggers
# pandas' SettingWithCopy semantics. Reassign instead, and ignore a missing
# column so the cell can be re-run without raising KeyError.
clean_df = clean_df.drop(columns=["IsValid"], errors="ignore")

In [14]:
# Persist the cleaned frame as a gzip-compressed parquet file via pyarrow.
# NOTE(review): `clean_df` keeps the filtered, non-contiguous index of `df`;
# presumably it is written along with the data — confirm that is wanted, or
# pass `index=False`.
clean_df.to_parquet("./data/clean_passwords.gzip", compression="gzip", engine="pyarrow")