In [1]:
import pandas as pd
import ydata_profiling

In [2]:
data = pd.read_csv("data/census.csv")

#### Checking the dataframe (census data)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlgt           32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


#### Checking the data

In [4]:
data.head(15)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


#### Census dataframe overview

In [5]:
data.describe()

Unnamed: 0,age,fnlgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


#### Generate the Census data set report

In [6]:
profile = ydata_profiling.ProfileReport(data)
# Storing report in HMTL
profile.to_file(output_file='screenshots/eda_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Data cleaning (removing all spaces).

In [7]:
col_names = [col.strip() for col in data.columns]
data.columns = col_names

In [8]:
data.columns

Index(['age', 'workclass', 'fnlgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

#### Removing all spaces in categorical features.

In [9]:
categorical_features = [
                'workclass', 
                'education', 
                'marital-status', 
                'occupation', 
                'relationship', 
                'race', 
                'sex', 
                'native-country'
                ]
for cat_feature in categorical_features:
    data[cat_feature] = data[cat_feature].str.strip()
    print(f"{cat_feature}:\n {data[cat_feature].unique()}")
    

workclass:
 ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
education:
 ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
marital-status:
 ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
occupation:
 ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
relationship:
 ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
race:
 ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
sex:
 ['Male' 'Female']
native-country:
 ['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'En

In [10]:
list(data['education'])

['Bachelors',
 'Bachelors',
 'HS-grad',
 '11th',
 'Bachelors',
 'Masters',
 '9th',
 'HS-grad',
 'Masters',
 'Bachelors',
 'Some-college',
 'Bachelors',
 'Bachelors',
 'Assoc-acdm',
 'Assoc-voc',
 '7th-8th',
 'HS-grad',
 'HS-grad',
 '11th',
 'Masters',
 'Doctorate',
 'HS-grad',
 '9th',
 '11th',
 'HS-grad',
 'Bachelors',
 'HS-grad',
 'Some-college',
 'HS-grad',
 'HS-grad',
 'Assoc-acdm',
 'Some-college',
 'Bachelors',
 'Some-college',
 'Some-college',
 '11th',
 'Some-college',
 'HS-grad',
 'Some-college',
 'Assoc-acdm',
 '9th',
 'Bachelors',
 'Bachelors',
 'HS-grad',
 'HS-grad',
 'Bachelors',
 'HS-grad',
 'Masters',
 'Assoc-voc',
 'Assoc-voc',
 'Some-college',
 'HS-grad',
 'Prof-school',
 'Bachelors',
 'HS-grad',
 'Some-college',
 '5th-6th',
 'Assoc-voc',
 'HS-grad',
 'HS-grad',
 'Bachelors',
 '7th-8th',
 'HS-grad',
 'Doctorate',
 'Some-college',
 'HS-grad',
 'Some-college',
 'HS-grad',
 'Some-college',
 'Some-college',
 'Some-college',
 'Bachelors',
 'Bachelors',
 'Some-college',
 'Some

In [12]:
data.to_csv('data/cleaned_census.csv', index=False)