-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_investigation.py
47 lines (36 loc) · 1.56 KB
/
data_investigation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import traceback
import pandas as pd
import numpy as np
def print_columns(houses_df: pd.DataFrame):
    """Print an overview of ``houses_df``.

    Writes the frame's column index to stdout, then a 90-dash separator
    surrounded by blank lines, and finally delegates to ``__value_counts``
    in verbose mode so each column's value distribution is printed too.

    Args:
        houses_df: The data frame to describe.
    """
    separator = '-' * 90
    print(f'houses_df.columns: {houses_df.columns}')
    print('\n' + separator + '\n')
    __value_counts(houses_df, True)
def get_features(houses_df: pd.DataFrame, exclude_threshold=0.8):
    """Return ``houses_df`` without the columns flagged for exclusion.

    The columns to remove are chosen by ``__get_excluded_features``, which
    returns a mapping keyed by column name.

    Args:
        houses_df: The source data frame; it is not modified.
        exclude_threshold: Threshold forwarded to
            ``__get_excluded_features``.

    Returns:
        A new ``DataFrame`` with the flagged columns dropped.
    """
    exclude_columns = __get_excluded_features(houses_df, exclude_threshold)
    # The helper always flags 'Id', even when the frame has no such column.
    # Pass the names as an explicit list and ignore missing labels so that
    # frames without an 'Id' column do not raise KeyError.
    return houses_df.drop(columns=list(exclude_columns), errors='ignore')
def __get_excluded_features(houses_df: pd.DataFrame, exclude_threshold):
    """Find columns dominated by a single value.

    A column is flagged when its most frequent value (missing values
    counted as a value) makes up more than ``exclude_threshold`` of all
    rows. Object-dtype columns get a second chance to be flagged: the
    frequency of their top value is also measured against the non-null
    rows only, as reported by ``Series.describe()``.

    Args:
        houses_df: The data frame whose columns are inspected.
        exclude_threshold: Share (compared with ``>``) above which a column
            is flagged.

    Returns:
        A dict mapping each flagged column name to the share that exceeded
        the threshold. The key ``'Id'`` is always present with the value
        ``'removed'`` unless the ``Id`` column itself is flagged, in which
        case its share replaces that marker.
    """
    exclude_columns = {'Id': 'removed'}
    for col in houses_df.columns:
        column = houses_df[col]
        shares = column.value_counts(normalize=True, dropna=False)
        if shares.empty:
            # A frame with zero rows has no dominant value to measure.
            continue

        top_share = shares.values[0]
        if top_share > exclude_threshold:
            exclude_columns[col] = top_share
            continue

        # Builtin ``object`` replaces the ``np.object`` alias, which newer
        # NumPy releases no longer provide.
        if column.dtype != object:
            continue

        # describe() is only needed for object columns, so compute it here
        # rather than for every column.
        summary = column.describe()
        count = summary['count']
        if not count:
            # No non-null values: avoid dividing by zero.
            continue
        try:
            non_null_share = summary['freq'] / count
            if non_null_share > exclude_threshold:
                exclude_columns[col] = non_null_share
        except KeyError:
            # describe() produced no 'freq' entry for this column.
            pass
        except Exception:
            # Best effort: report the problem and keep scanning the
            # remaining columns.
            traceback.print_exc()
    return exclude_columns
def __value_counts(houses_df: pd.DataFrame, verbose=False):
    """Print the normalized value distribution of every column.

    For each column the output contains a header with the column name, the
    number of distinct values (missing values count as a value), and the
    normalized value counts themselves.

    Args:
        houses_df: The data frame to summarize.
        verbose: When false, nothing is printed and the function returns
            immediately.

    Returns:
        None. The function only writes to stdout.
    """
    if not verbose:
        # Nothing is returned or stored, so there is no reason to compute
        # the counts when they will not be printed.
        return
    for col in houses_df.columns:
        col_value_counts = houses_df[col].value_counts(normalize=True, dropna=False)
        print(f'\n\nColumn: {col}:')
        print(f'Total of {len(col_value_counts)} unique values:')
        print(col_value_counts)