In [1]:
import pandas as pd
from polluters import ConsistentRepresentationPolluter

# Pollution

## Consistent Representation on the Contraceptive Method Choice dataset

In [2]:
# Column labels handed to read_csv; they are applied positionally.
column_names = [
    "Wife's age",
    "Wife's education",
    "Husband's education",
    "Number of children",
    "Wife's religion",
    "Wife's now working?",
    "Husband's occupation",
    "Standard-of-living index",
    "Media exposure",
    "Contraceptive method used",
]
# header=None: the first line of the file is read as data, not as a header.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data",
    names=column_names,
    header=None,
)
# Independent copy that the following cells recode and pollute.
clean_df = df.copy()

In [3]:
# Show the freshly loaded frame as the cell output.
df

Unnamed: 0,Wife's age,Wife's education,Husband's education,Number of children,Wife's religion,Wife's now working?,Husband's occupation,Standard-of-living index,Media exposure,Contraceptive method used
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1
...,...,...,...,...,...,...,...,...,...,...
1468,33,4,4,2,1,0,2,4,0,3
1469,33,4,4,3,1,1,1,4,0,3
1470,39,3,3,8,1,0,1,4,0,3
1471,33,3,3,4,1,0,2,2,0,3


In [4]:
# Distinct codes present in the occupation column.
set(clean_df["Husband's occupation"])

{1, 2, 3, 4}

In [5]:
# Recode the numeric media-exposure values into readable labels.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection: clean_df['Media exposure'] is a
# temporary object under pandas Copy-on-Write, so the in-place form leaves
# clean_df untouched (pandas 2.2 already emits a FutureWarning for it).
# Re-running this cell is harmless because the labels no longer match 0 or 1.
clean_df['Media exposure'] = clean_df['Media exposure'].replace({0: 'Good', 1: 'Not good'})

In [6]:
# Distinct labels present in the media-exposure column after recoding.
set(clean_df["Media exposure"])

{'Good', 'Not good'}

In [7]:
# Number of rows carrying each media-exposure label.
clean_df["Media exposure"].value_counts()

Good        1364
Not good     109
Name: Media exposure, dtype: int64

In [8]:
# Number of rows carrying each occupation code.
clean_df["Husband's occupation"].value_counts()

3    585
1    436
2    425
4     27
Name: Husband's occupation, dtype: int64

In [9]:
# The two columns that the polluter configuration below refers to, before pollution.
clean_df[["Husband's occupation", "Media exposure"]]

Unnamed: 0,Husband's occupation,Media exposure
0,2,Good
1,3,Good
2,3,Good
3,3,Good
4,3,Good
...,...,...
1468,2,Good
1469,1,Good
1470,1,Good
1471,2,Good


In [19]:
# (rows, columns) of the loaded frame.
df.shape

(1473, 10)

In [20]:
# Per column: original value -> number of representations requested for it.
representations_per_value = {
    "Husband's occupation": {1: 2, 2: 5, 3: 3, 4: 2},
    "Media exposure": {'Good': 5, 'Not good': 2},
}
polluter = ConsistentRepresentationPolluter(
    random_seed=43,
    percentage_polluted_rows=0.5,
    num_pollutable_columns=7,
    number_of_representations=representations_per_value,
)
# Calling the polluter yields the polluted frame and a quality result,
# which the next cell unpacks.
polluted_df, quality = polluter(clean_df)

In [21]:
# Unpack the two-element quality result and print each part on its own line.
quality_overall, quality_pollutables = quality
print(quality_overall, quality_pollutables, sep="\n")

0.8905483895300595
0.8592765008243624


In [22]:
# Inspect the value -> representations mapping stored on the polluter.
polluter.new_presentations

{"Husband's occupation": {1: [5, 1],
  2: [6, 7, 8, 9, 2],
  3: [10, 11, 3],
  4: [12, 4]},
 'Media exposure': {'Good': ['Good-1', 'Good-2', 'Good-3', 'Good-4', 'Good'],
  'Not good': ['Not good-1', 'Not good']}}

In [23]:
# The same two columns as shown for clean_df, now taken from the polluted frame.
polluted_df[["Husband's occupation", "Media exposure"]]

Unnamed: 0,Husband's occupation,Media exposure
0,2,Good
1,3,Good-4
2,3,Good
3,11,Good
4,3,Good-2
...,...,...
1468,6,Good
1469,5,Good
1470,1,Good-2
1471,9,Good


In [24]:
# Clean and polluted occupation values side by side, aligned on the row index.
occupation_pair = [
    clean_df["Husband's occupation"],
    polluted_df["Husband's occupation"],
]
pd.concat(occupation_pair, axis=1)

Unnamed: 0,Husband's occupation,Husband's occupation.1
0,2,2
1,3,3
2,3,3
3,3,11
4,3,3
...,...,...
1468,2,6
1469,1,5
1470,1,1
1471,2,9


In [25]:
# Clean and polluted media-exposure values side by side, aligned on the row index.
media_pair = [
    clean_df["Media exposure"],
    polluted_df["Media exposure"],
]
pd.concat(media_pair, axis=1)

Unnamed: 0,Media exposure,Media exposure.1
0,Good,Good
1,Good,Good-4
2,Good,Good
3,Good,Good
4,Good,Good-2
...,...,...
1468,Good,Good
1469,Good,Good
1470,Good,Good-2
1471,Good,Good


In [26]:
# Count the rows per representation once per column; the original recomputed
# value_counts() for every single key.
occupation_counts = polluted_df["Husband's occupation"].value_counts()
media_counts = polluted_df['Media exposure'].value_counts()

# One printed line per original value: the value itself followed by the
# representations listed for it in polluter.new_presentations.
report_groups = [
    (occupation_counts, [1, 5]),
    (occupation_counts, [2, 6, 7, 8, 9]),
    (occupation_counts, [3, 10, 11]),
    (occupation_counts, [4, 12]),
    (media_counts, ['Good', 'Good-1', 'Good-2', 'Good-3', 'Good-4']),
    (media_counts, ['Not good', 'Not good-1']),
]
for counts, representations in report_groups:
    # .get(..., 0) reports a representation that never occurs as 0 instead of
    # raising KeyError; int() keeps numpy scalar reprs out of the printed dict.
    print({rep: int(counts.get(rep, 0)) for rep in representations})


{1: 209, 5: 227}
{2: 218, 6: 46, 7: 59, 8: 49, 9: 53}
{3: 294, 10: 143, 11: 148}
{4: 16, 12: 11}
{'Good': 684, 'Good-1': 157, 'Good-2': 172, 'Good-3': 171, 'Good-4': 180}
{'Not good': 53, 'Not good-1': 56}
