Skip to content

Commit

Permalink
MAINT: Improve with mutmut
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Oct 19, 2019
1 parent 47b9587 commit 154b8dd
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 72 deletions.
172 changes: 101 additions & 71 deletions edapy/csv/describe.py
Expand Up @@ -3,93 +3,33 @@

# Core Library
import logging
from typing import Any, Dict, List, Tuple

# Third party
import pandas as pd

def describe_pandas_df(df, dtype=None):

def describe_pandas_df(
df: pd.DataFrame, dtype: Dict[str, Any] = None
) -> Dict[str, Any]:
"""
Show basic information about a pandas dataframe.
Parameters
----------
df : Pandas Dataframe object
dtype : dict
df : pd.DataFrame
dtype : Dict[str, Any]
Maps column names to types
Returns
-------
column_types : dict
column_types : Dict[str, Any]
Maps column names to type names
"""
if dtype is None:
dtype = {}
print("Number of datapoints: {datapoints}".format(datapoints=len(df)))
column_info = {"int": [], "float": [], "category": [], "other": [], "time": []}
float_types = ["float64"]
integer_types = ["int64", "uint8"]
time_types = ["datetime64[ns]"]
other_types = ["object", "category"]
column_info_meta = {}
for column_name in df:
column_info_meta[column_name] = {}
counter_obj = df[column_name].value_counts()
value_list = counter_obj.keys().tolist()
value_count = len(value_list)
is_suspicious_cat = (
value_count <= 50
and str(df[column_name].dtype) != "category"
and column_name not in dtype
)
if is_suspicious_cat:
logging.warning(
"Column '{}' has only {} different values ({}). "
"You might want to make it a 'category'".format(
column_name, value_count, value_list
)
)
if len(value_list) > 0:
top_count_val = counter_obj.tolist()[0]
else:
top_count_val = None
column_info_meta[column_name]["top_count_val"] = top_count_val
column_info_meta[column_name]["value_list"] = value_list
column_info_meta[column_name]["value_count"] = value_count
is_int_type = (
df[column_name].dtype in integer_types
or column_name in dtype
and dtype[column_name] in integer_types
)
is_float_type = (
df[column_name].dtype in float_types
or column_name in dtype
and dtype[column_name] in float_types
)
is_cat_type = (
str(df[column_name].dtype) == "category"
or column_name in dtype
and dtype[column_name] == "category"
)
is_time_type = str(df[column_name].dtype) in time_types
is_other_type = (
str(df[column_name].dtype) in other_types
or column_name in dtype
and dtype[column_name] in other_types
)
if is_int_type:
column_info["int"].append(column_name)
elif is_float_type:
column_info["float"].append(column_name)
elif is_cat_type:
column_info["category"].append(column_name)
elif is_other_type:
column_info["other"].append(column_name)
elif is_time_type:
column_info["time"].append(column_name)
else:
print(
"!!! describe_pandas_df does not know type '{}'".format(
df[column_name].dtype
)
)
column_info, column_info_meta = _generate_column_info(df, dtype)

column_name_len = max(len(column_name) for column_name in df)

Expand Down Expand Up @@ -194,3 +134,93 @@ def describe_pandas_df(df, dtype=None):
column_type = "str"
column_types[column_name] = column_type
return column_types


def _generate_column_info(
    df: pd.DataFrame, dtype: Dict[str, Any]
) -> Tuple[Dict[str, List], Dict[str, Any]]:
    """
    Sort the columns of a dataframe into type buckets and collect statistics.

    Parameters
    ----------
    df : pd.DataFrame
    dtype : Dict[str, Any]
        Maps column names to a declared type. A declared type is considered
        in addition to the dtype pandas reports for the column.

    Returns
    -------
    column_info, column_info_meta : Tuple
        ``column_info`` maps each bucket name ("int", "float", "category",
        "other", "time") to the list of column names assigned to it.
        ``column_info_meta`` maps each column name to a dict with the keys
        "top_count_val", "value_list" and "value_count".
    """
    integer_types = ["int64", "uint8"]
    float_types = ["float64"]
    time_types = ["datetime64[ns]"]
    other_types = ["object", "category"]

    column_info: Dict[str, List] = {
        bucket: [] for bucket in ("int", "float", "category", "other", "time")
    }
    column_info_meta: Dict[str, Any] = {}

    for column_name in df:
        series = df[column_name]
        frequencies = series.value_counts()
        value_list = frequencies.keys().tolist()
        value_count = len(value_list)

        pandas_dtype = series.dtype
        dtype_name = str(pandas_dtype)
        is_declared = column_name in dtype

        # Few distinct values in a column that is neither categorical nor
        # explicitly typed by the caller: hint that it could be a category.
        if value_count <= 50 and dtype_name != "category" and not is_declared:
            message = (
                "Column '{}' has only {} different values ({}). "
                "You might want to make it a 'category'"
            ).format(column_name, value_count, value_list)
            logging.warning(message)

        column_info_meta[column_name] = {
            # First entry of value_counts(); None for a column without values.
            "top_count_val": frequencies.tolist()[0] if value_list else None,
            "value_list": value_list,
            "value_count": value_count,
        }

        # A column matches a bucket either through the dtype pandas reports
        # or through the type declared for it in ``dtype``.
        matches_int = pandas_dtype in integer_types or (
            is_declared and dtype[column_name] in integer_types
        )
        matches_float = pandas_dtype in float_types or (
            is_declared and dtype[column_name] in float_types
        )
        matches_category = dtype_name == "category" or (
            is_declared and dtype[column_name] == "category"
        )
        matches_time = dtype_name in time_types
        matches_other = dtype_name in other_types or (
            is_declared and dtype[column_name] in other_types
        )

        # The first matching bucket wins, in this order of precedence.
        precedence = (
            ("int", matches_int),
            ("float", matches_float),
            ("category", matches_category),
            ("other", matches_other),
            ("time", matches_time),
        )
        bucket = next((name for name, matched in precedence if matched), None)
        if bucket is None:
            print(
                "!!! describe_pandas_df does not know type '{}'".format(
                    pandas_dtype
                )
            )
        else:
            column_info[bucket].append(column_name)

    return column_info, column_info_meta
31 changes: 30 additions & 1 deletion tests/test_csv_describe.py
Expand Up @@ -8,7 +8,7 @@
import edapy.csv


def test_make_path_absolute():
def test_describe_pandas_df():
df = pd.DataFrame(
{
"a": [1, 2, 3],
Expand All @@ -20,3 +20,32 @@ def test_make_path_absolute():
out = edapy.csv.describe_pandas_df(df, dtype=None)
exp = {"a": "int", "b": "float", "c": "str", "d": "time"}
assert out == exp


def test_generate_column_info():
    """Check the type buckets and per-column statistics for a small frame."""
    first_days = [datetime(2018, 1, day) for day in (1, 2, 3)]
    frame = pd.DataFrame(
        {
            "a": [1, 2, 3],
            "b": [1.0, 2.0, 3.0],
            "c": ["a", "b", "c"],
            "d": first_days,
        }
    )

    info, meta = edapy.csv.describe._generate_column_info(frame, dtype={})

    # One column per bucket, except "category", which stays empty.
    assert info == {
        "int": ["a"],
        "float": ["b"],
        "other": ["c"],
        "time": ["d"],
        "category": [],
    }

    # Every value is unique, so the most frequent value occurs exactly once.
    for name in ("a", "b", "c"):
        assert meta[name]["top_count_val"] == 1
        assert meta[name]["value_count"] == 3

    # The order of value_list is not asserted, only its content.
    assert sorted(meta["a"]["value_list"]) == [1, 2, 3]

0 comments on commit 154b8dd

Please sign in to comment.