In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
# Load the demographics CSV from the sibling Python_Activities folder.
# Forward slashes are used on purpose: in a normal (non-raw) string literal the
# backslash form contains the invalid escape sequences "\P" and "\d", and
# forward slashes resolve on Windows as well as on POSIX systems.
file_path = "../Python_Activities/demographics.csv"
main_data = pd.read_csv(file_path)
print(main_data.head(5))

  first_name last_name blood_type marriage_status  income     device
0      Abdul     Colon         A+         married  145000  AndroidOS
1      Abdul    Pierce         B+         married   85000      MacOS
2    Desirae    Pierce         B+         MARRIED  130000        iOS
3    Shannon    Gibson         A+         married  175000      MacOS
4    Desirae    Little         B+      unmarried   130000      MacOS


In [None]:
# Identifying and Removing Bogus Blood Types
# Any blood_type value that is not one of the eight labels below counts as bogus.
valid_blood_type_set = {'A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-'}
unique_main_data_blood_type_set = set(main_data['blood_type'].unique())
bogus_blood_types = unique_main_data_blood_type_set - valid_blood_type_set
print(bogus_blood_types)

# Boolean mask: True on every row whose blood type is in the bogus set.
bogus_records_index = main_data['blood_type'].isin(bogus_blood_types)

# Keep only the rows outside the mask, as an independent copy of main_data.
without_bogus_records = main_data.loc[~bogus_records_index].copy()
without_bogus_records['blood_type'].unique()

{'C+', 'D-'}


array(['A+', 'B+', 'A-', 'AB-', 'AB+', 'B-', 'O-', 'O+'], dtype=object)

In [None]:
# Handling Inconsistent Marriage Status Categories
# List the distinct raw labels so case and whitespace variants become visible.
main_data.loc[:, 'marriage_status'].unique()

array(['married', 'MARRIED', ' married', 'unmarried ', 'divorced',
       'unmarried', 'UNMARRIED', 'separated'], dtype=object)

In [50]:
# Work on a copy so main_data keeps the raw labels, then lower-case and trim
# surrounding whitespace in one chained pass.
inconsistent_data = main_data.copy()
inconsistent_data['marriage_status'] = (
    inconsistent_data['marriage_status']
    .str.lower()
    .str.strip()
)
inconsistent_data['marriage_status'].unique()

array(['married', 'unmarried', 'divorced', 'separated'], dtype=object)

In [26]:
# Grouping Income into Meaningful Bins
# Report the observed income range before choosing bin edges.
income_column = main_data['income']
print(f"Max income: {income_column.max()}, Min income: {income_column.min()}")


Max income: 190000, Min income: 40000


In [40]:
# One label per interval between consecutive bin edges; the last edge is
# np.inf, so the top band has no upper limit.
income_labels = ['40k-75k', '75k-100k', '100k-125k', '125k-150k', '150k+']
income_bins = [40000, 75000, 100000, 125000, 150000, np.inf]

# Bucket incomes on a copy of main_data; include_lowest=True makes the first
# interval include its left edge (40000).
remapping_data = main_data.copy()
remapping_data['income_groups'] = pd.cut(
    remapping_data['income'], income_bins, labels=income_labels, include_lowest=True
)
remapping_data.head()


Unnamed: 0,first_name,last_name,blood_type,marriage_status,income,device,income_groups
0,Abdul,Colon,A+,married,145000,AndroidOS,125k-150k
1,Abdul,Pierce,B+,married,85000,MacOS,75k-100k
2,Desirae,Pierce,B+,MARRIED,130000,iOS,125k-150k
3,Shannon,Gibson,A+,married,175000,MacOS,150k+
4,Desirae,Little,B+,unmarried,130000,MacOS,125k-150k


In [44]:
# Encoding Categorical Data
# Label Encoding: give every distinct blood type an integer code.
le = LabelEncoder()
le.fit(without_bogus_records['blood_type'])
without_bogus_records['blood_type_encoded'] = le.transform(without_bogus_records['blood_type'])

# One row per (label, code) pair, to inspect the mapping that was learned.
without_bogus_records.loc[:, ['blood_type', 'blood_type_encoded']].drop_duplicates()


Unnamed: 0,blood_type,blood_type_encoded
0,A+,0
1,B+,4
5,A-,1
7,AB-,3
9,AB+,2
10,B-,5
12,O-,7
21,O+,6


In [None]:
#One-hot Encoding(with pandas)
# Keep the result under its own name instead of overwriting inconsistent_data:
# get_dummies drops the 'marriage_status' column, and both a re-run of this
# cell and the scikit-learn one-hot cell still need that column to be present
# on inconsistent_data (otherwise they fail with a KeyError).
inconsistent_data_dummies = pd.get_dummies(inconsistent_data, columns=['marriage_status'])
inconsistent_data_dummies.head()

Unnamed: 0,first_name,last_name,blood_type,income,device,marriage_status_divorced,marriage_status_married,marriage_status_separated,marriage_status_unmarried
0,Abdul,Colon,A+,145000,AndroidOS,False,True,False,False
1,Abdul,Pierce,B+,85000,MacOS,False,True,False,False
2,Desirae,Pierce,B+,130000,iOS,False,True,False,False
3,Shannon,Gibson,A+,175000,MacOS,False,True,False,False
4,Desirae,Little,B+,130000,MacOS,False,False,False,True


In [57]:
#One-hot Encoding(with Scikit)
categorical_columns = ['marriage_status']
encoder = OneHotEncoder(sparse_output=False)

# NOTE: despite its name, one_hot_encoder holds the encoded array returned by
# fit_transform, not an encoder object.
one_hot_encoder = encoder.fit_transform(inconsistent_data[categorical_columns])

# Reuse the source frame's index: pd.concat(axis=1) aligns rows by index label,
# so a default 0..n-1 index here would misalign (and introduce NaN rows)
# whenever inconsistent_data has been filtered or re-indexed.
one_hot_inconsistent_data = pd.DataFrame(
    one_hot_encoder,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=inconsistent_data.index,
)
inconsistent_data_encoded = pd.concat([inconsistent_data, one_hot_inconsistent_data], axis=1)
# The original categorical column is redundant once its one-hot columns exist.
inconsistent_data_encoded = inconsistent_data_encoded.drop(categorical_columns, axis=1)
inconsistent_data_encoded.head()

Unnamed: 0,first_name,last_name,blood_type,income,device,marriage_status_divorced,marriage_status_married,marriage_status_separated,marriage_status_unmarried
0,Abdul,Colon,A+,145000,AndroidOS,0.0,1.0,0.0,0.0
1,Abdul,Pierce,B+,85000,MacOS,0.0,1.0,0.0,0.0
2,Desirae,Pierce,B+,130000,iOS,0.0,1.0,0.0,0.0
3,Shannon,Gibson,A+,175000,MacOS,0.0,1.0,0.0,0.0
4,Desirae,Little,B+,130000,MacOS,0.0,0.0,0.0,1.0


In [58]:
#Ordinal Encoding
# TODO (carried over from the original "TO DO AGAIN" note): revisit this section.
# Rank of each income band, from lowest (1) to highest (5).
custom_map = {
    '40k-75k': 1,
    '75k-100k': 2,
    '100k-125k': 3,
    '125k-150k': 4,
    '150k+': 5
}

# income_groups comes from pd.cut and is therefore categorical; mapping a
# categorical column directly yields another categorical rather than numbers.
# Casting to object first makes the encoded column plain integers (it becomes
# float with NaN if any row has no income group).
remapping_data['income_groups_encoded'] = (
    remapping_data['income_groups'].astype(object).map(custom_map)
)

remapping_data[['income', 'income_groups', 'income_groups_encoded']].head()

Unnamed: 0,income,income_groups,income_groups_encoded
0,145000,125k-150k,4
1,85000,75k-100k,2
2,130000,125k-150k,4
3,175000,150k+,5
4,130000,125k-150k,4
