# SLU06 - Dealing with Data Problems

In [1]:
import os
import pandas as pd

## Tidy Data

In [2]:
# Load the example table whose column headers hold values rather than variable names.
# NOTE(review): sep=' ' implies the file is space-delimited despite its .csv extension — confirm.
df_messy = pd.read_csv(
    filepath_or_buffer=os.path.join('data', 'column_headers_are_values.csv'),
    sep=' ',
)
df_messy

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k
0,Agnostic,27,34,60,81,76,137
1,Atheist,12,27,37,52,35,70
2,Buddhist,27,21,30,34,33,58
3,Catholic,418,617,732,670,638,1116
4,Don’t know/refused,15,14,15,11,10,35
5,Evangelical Prot,575,869,1064,982,881,1486
6,Hindu,1,9,7,9,11,34
7,Historically Black Prot,228,244,236,238,197,223
8,Jehovah’s Witness,20,27,24,24,21,30
9,Jewish,19,19,25,25,30,95


### Handling variable values in the column names

In [3]:
# Unpivot the table: 'religion' stays as the identifier, every other column
# header becomes a value in 'income', and its cell value lands in 'freq'.
df_tidy = df_messy.melt(
    id_vars=['religion'],
    value_vars=[col for col in df_messy.columns if col != 'religion'],
    var_name='income',
    value_name='freq',
)
df_tidy.head()

Unnamed: 0,religion,income,freq
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15


## Data Entry Problems

In [4]:
# Load the dataset used in the rest of the notebook; the first CSV column
# becomes the row index. The path is built with os.path.join, as for the
# previous file, rather than hard-coding a '/' separator.
data = pd.read_csv(os.path.join('data', 'data_with_problems.csv'), index_col=0)
data.head()

Unnamed: 0,age,height,gender
CFLOXRHMDR,88.0,163.0,female
FXLJSNLSOG,29.0,158.0,female
FWDIVJKGOI,42.0,159.0,female
YWEBKQWHRE,25.0,179.0,male
YPUQAPSOYJ,32.0,169.0,male


### Counting unique values in a column

In [5]:
# Number of distinct non-missing labels in the gender column.
data['gender'].nunique()

7

### Value counts per unique value in a column

In [6]:
# Frequency of each gender label; dropna=False keeps the NaN bucket in the tally.
data['gender'].value_counts(dropna=False)

female       109
male          66
NaN            9
MALE           9
m              3
F              2
   female      1
f              1
Name: gender, dtype: int64

### Fix data entry problems using string methods with .str

In [7]:
# Lower-case every label so that variants differing only by case collapse together.
data['gender'] = data['gender'].str.lower()
data['gender'].value_counts()

female       109
male          75
m              3
f              3
   female      1
Name: gender, dtype: int64

In [8]:
# Trim leading and trailing whitespace from each label.
data['gender'] = data['gender'].str.strip()
data['gender'].value_counts()

female    110
male       75
m           3
f           3
Name: gender, dtype: int64

In [9]:
# Expand the one-letter codes with anchored patterns (^...$), so only values
# that are exactly 'm' or 'f' are rewritten.
# regex=True is passed explicitly: relying on the implicit default emitted a
# FutureWarning, and from pandas 2.0 on Series.str.replace treats the pattern
# as a literal string by default, which would make these calls match nothing.
# The result is only displayed here; it is not assigned back to `data`.
(
    data.gender
    .str.replace(r'^m$', 'male', regex=True)
    .str.replace(r'^f$', 'female', regex=True)
    .value_counts()
)

  """Entry point for launching an IPython kernel.


female    113
male       78
Name: gender, dtype: int64

### Fix data entry problems using the replace method

In [10]:
# Whole-value replacement: cells equal to 'm' or 'f' are swapped for the full
# word; all other values are left as they are.
data['gender'] = data['gender'].replace(to_replace={'m': 'male', 'f': 'female'})
data['gender'].value_counts()

female    113
male       78
Name: gender, dtype: int64

### Detecting duplicated values

In [11]:
# Boolean mask flagging each row that repeats an earlier row; the first
# occurrence of a repeated row is left unflagged (the default behaviour).
duplicated_mask = data.duplicated()

print('Number of duplicates:', duplicated_mask.sum())

Number of duplicates: 6


### Dropping duplicated values

In [12]:
# Report the frame's shape on either side of the de-duplication step.
print(f"Shape before dropping duplicates: {data.shape}")
# Keep the first occurrence of each repeated row and discard the later copies.
data = data.drop_duplicates(keep='first')
print(f"Shape after dropping duplicates: {data.shape}")

Shape before dropping duplicates: (200, 3)
Shape after dropping duplicates: (194, 3)


## Missing Values

### Detecting missing values

In [13]:
# Element-wise missing-value mask (True where a cell is missing), last rows only.
data.isna().tail()

Unnamed: 0,age,height,gender
AGFHBQDTEG,False,False,False
HYTVHSPPVG,False,False,False
DSBFYTZEQN,False,False,False
VYAQBLJKXJ,True,False,False
BLAKTCGBMO,False,False,False


### Dropping missing values

In [14]:
# Report the shape on either side of dropping incomplete rows.
print(f"Shape before dropping missing values: {data.shape}")
# Discard every row with at least one missing cell; the result goes to a
# separate name, so `data` itself is not reassigned here.
data_no_missing_values = data.dropna(axis='index', how='any')
print(f"Shape after dropping missing values: {data_no_missing_values.shape}")

Shape before dropping missing values: (194, 3)
Shape after dropping missing values: (173, 3)


### Imputing missing values with mean/median or a new category

In [15]:
# Per-column imputation: age and height receive their column median, while
# missing gender entries are assigned the explicit category 'unknown'.
data = data.fillna(value={
    'age': data['age'].median(),
    'height': data['height'].median(),
    'gender': 'unknown',
})