In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
from collections import Counter
from wordcloud import WordCloud
from nltk import ngrams

In [3]:
# Load the sample dataset from the relative ./data directory into `df`,
# then display its first rows as a quick sanity check of the columns.
df = pd.read_csv("./data/transform_sample_data.csv")
df.head()

Unnamed: 0,password,strength,len,alphaUC,alphaLC,number,symbol,midChar,repChar,uniqueChar,consecAlphaUC,consecAlphaLC,consecNumber,consecSymbol,seqAlpha,seqNumber,seqKeyboard
0,csillik,0.180594,7,0,7,0,0,0,2,5,0,1,0,0,0,0,0
1,huniihuu,0.177778,8,0,8,0,0,0,4,4,0,2,0,0,0,0,0
2,chaipy,0.172331,6,0,6,0,0,0,0,6,0,0,0,0,0,0,0
3,876876b,0.155556,7,0,1,6,0,5,3,4,0,0,0,0,0,1,0
4,miiwhy,0.154795,6,0,6,0,0,0,1,5,0,1,0,0,0,0,0


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   password       10000 non-null  object 
 1   strength       10000 non-null  float64
 2   len            10000 non-null  int64  
 3   alphaUC        10000 non-null  int64  
 4   alphaLC        10000 non-null  int64  
 5   number         10000 non-null  int64  
 6   symbol         10000 non-null  int64  
 7   midChar        10000 non-null  int64  
 8   repChar        10000 non-null  int64  
 9   uniqueChar     10000 non-null  int64  
 10  consecAlphaUC  10000 non-null  int64  
 11  consecAlphaLC  10000 non-null  int64  
 12  consecNumber   10000 non-null  int64  
 13  consecSymbol   10000 non-null  int64  
 14  seqAlpha       10000 non-null  int64  
 15  seqNumber      10000 non-null  int64  
 16  seqKeyboard    10000 non-null  int64  
dtypes: float64(1), int64(15), object(1)
memory usage: 1

In [5]:
def describe_numerical(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise every numerical column of ``df`` in a single table.

    The result starts from ``DataFrame.describe()`` on the numerical columns
    and is extended with additional rows: variance, skewness, kurtosis, range
    (max - min), the interquartile range (IQR, stored under the row label
    ``"IOR"``), the ``1.5 * IQR`` lower/upper bounds, and the number of values
    that fall outside those bounds.

    Args:
        df (pd.DataFrame): Input frame; only columns selected by
            ``select_dtypes('number')`` are analysed.

    Returns:
        pd.DataFrame: One column per numerical column of ``df`` and one row
        per statistic: the ``describe()`` rows followed by ``var``, ``skew``,
        ``kurtosis``, ``range``, ``IOR``, ``lower_bound``, ``upper_bound``
        and ``outliers``.
    """
    numeric = df.select_dtypes('number')
    summary = numeric.describe()

    # Each moment is a DataFrame method of the same name; append them in order.
    for moment in ("var", "skew", "kurtosis"):
        summary.loc[moment] = getattr(numeric, moment)().tolist()

    summary.loc["range"] = summary.loc["max"] - summary.loc["min"]

    # Quartiles come straight from describe(); the bounds sit 1.5 * IQR
    # beyond the first and third quartile.
    first_quartile = summary.loc["25%"]
    third_quartile = summary.loc["75%"]
    iqr = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * iqr
    upper_fence = third_quartile + 1.5 * iqr

    summary.loc["IOR"] = iqr
    summary.loc["lower_bound"] = lower_fence
    summary.loc["upper_bound"] = upper_fence

    # A value is an outlier when it lies strictly outside the bounds.
    too_low = numeric < lower_fence
    too_high = numeric > upper_fence
    summary.loc["outliers"] = (too_low | too_high).sum().tolist()
    return summary

In [6]:
# Transpose the summary so each numerical column becomes a row and each
# statistic a column, which is easier to scan for a frame with many columns.
describe_numerical(df).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurtosis,range,IOR,lower_bound,upper_bound,outliers
strength,10000.0,0.484839,0.256372,0.044444,0.229774,0.471037,0.694474,0.99948,0.065726,0.124583,-1.347718,0.955035,0.4647,-0.467275,1.391523,0.0
len,10000.0,12.9274,5.953207,4.0,8.0,12.0,16.0,60.0,35.440673,1.032867,2.196673,56.0,8.0,-4.0,28.0,115.0
alphaUC,10000.0,0.5597,2.435617,0.0,0.0,0.0,0.0,32.0,5.932229,6.233278,46.990559,32.0,0.0,0.0,0.0,976.0
alphaLC,10000.0,9.1299,6.684152,0.0,5.0,8.0,13.0,58.0,44.677894,0.743571,0.903054,58.0,8.0,-7.0,25.0,151.0
number,10000.0,3.1773,3.954571,0.0,0.0,2.0,5.0,48.0,15.638629,2.153771,8.766552,48.0,5.0,-7.5,12.5,229.0
symbol,10000.0,0.0605,0.594033,0.0,0.0,0.0,0.0,36.0,0.352875,34.53216,1742.06661,36.0,0.0,0.0,0.0,330.0
midChar,10000.0,2.5454,3.563034,0.0,0.0,1.0,4.0,47.0,12.695208,2.670036,13.01452,47.0,4.0,-6.0,10.0,292.0
repChar,10000.0,3.5439,3.540131,0.0,1.0,3.0,5.0,53.0,12.532526,2.69774,15.859663,53.0,4.0,-5.0,11.0,339.0
uniqueChar,10000.0,9.3835,3.534526,2.0,6.0,9.0,12.0,36.0,12.492877,0.376441,-0.128453,34.0,6.0,-3.0,21.0,9.0
consecAlphaUC,10000.0,0.0174,0.164621,0.0,0.0,0.0,0.0,7.0,0.0271,15.556556,415.20949,7.0,0.0,0.0,0.0,143.0
