## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import math
import os

### Read data into DFs

In [None]:
# Column names for the census files, in file order; passed to read_csv via names=headers.
headers = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'pred',
]

In [None]:
# Load the training split. Column names come from `headers` (names=headers), and
# index_col=False keeps the first column as data instead of using it as the index.
original_train_df = pd.read_csv("census-income.data.csv", low_memory = False, names = headers, index_col = False)

In [None]:
# Load the test split with the same column names and options as the training split.
original_test_df = pd.read_csv("census-income.test.csv", low_memory = False, names = headers, index_col = False)

In [None]:
# Count missing values per column in the raw training data.
original_train_df.isna().sum()

In [None]:
# isnull() is an alias of isna(), so this reports the same per-column counts as the cell above.
original_train_df.isnull().sum()

In [None]:
# Work on a copy so the frame as loaded stays available unchanged.
train_df = original_train_df.copy()

In [None]:
# Same for the test split.
test_df = original_test_df.copy()

In [None]:
# Preview the first 25 rows of the training copy.
train_df[:25]

## Cleanup:

# To-dos:
* Check for duplicates
* Find missing values
    * So far, "?" is the only missing-value marker I've found
* Deal with missing values
* Test for imbalanced data
* Deal with imbalanced data
* Convert categorical values into a Category datatype

### Remove whitespace

In [None]:
# Show an example value (first row's 'occupation') before whitespace is stripped

train_df.iloc[0]['occupation']

In [None]:
def data_to_str(df):
    """Strip leading and trailing whitespace from every text column of *df*, in place.

    A column counts as text when pandas reports it as object dtype or as a
    string dtype, so frames whose text columns use pandas' dedicated string
    dtype are cleaned too instead of being silently skipped. All other
    columns are left untouched.

    Args:
        df: DataFrame to clean. Its text columns are overwritten.

    Returns:
        None. The frame is modified in place.
    """
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype

    # Collect the names first, then rewrite, so the frame is not modified
    # while its columns are being inspected.
    text_columns = [
        col for col in df.columns
        if is_object(df[col]) or is_string(df[col])
    ]

    for col in text_columns:
        df[col] = df[col].str.strip()

In [None]:
# Strip whitespace from train_df's text columns in place.
# NOTE(review): only train_df is passed here; confirm test_df gets the same cleanup elsewhere.
data_to_str(train_df)

In [None]:
# Re-display the same example value to check that the whitespace is gone

train_df.iloc[0]['occupation']

### Explicitly declare int32 datatype

In [None]:
# The integers are stored in int64 - not wrong, but unnecessarily large for these values.
# Show one value (first row's 'age') before the conversion.

train_df.iloc[0]['age']

In [None]:
def data_to_int32(df):
    """Downcast the integer columns of *df* to ``int32``, in place.

    Only integer-typed columns are converted. Float and boolean columns are
    left alone, because casting them to int32 would drop fractional parts or
    turn flags into numbers. Integer columns are also skipped when they hold
    missing values or values outside the int32 range, since those cannot be
    represented faithfully in int32.

    Args:
        df: DataFrame to convert. Qualifying columns are overwritten.

    Returns:
        None. The frame is modified in place.
    """
    int32_range = np.iinfo(np.int32)

    for col in df.columns:
        series = df[col]

        if not pd.api.types.is_integer_dtype(series):
            continue

        # Nullable integer columns may carry NA, which int32 cannot hold.
        if series.isna().any():
            continue

        # Leave the column as-is rather than risk overflow on the cast.
        if len(series) and (
            int(series.min()) < int32_range.min
            or int(series.max()) > int32_range.max
        ):
            continue

        df[col] = series.astype('int32')

In [None]:
# Convert train_df's numeric columns to int32 in place.
# NOTE(review): only train_df is converted here; confirm test_df is handled elsewhere.
data_to_int32(train_df)

In [None]:
# Re-display the same example value after the conversion.
train_df.iloc[0]['age']

In [None]:
# Check the resulting dtype of every column.
train_df.dtypes

In [None]:
# Turn the "?" placeholder into a real missing value so isna() and
# value_counts(dropna=False) can see it. Only cells equal to exactly "?" match,
# which presumably relies on the whitespace stripping done earlier - confirm.
train_df.replace("?", pd.NA, inplace=True)

In [None]:
# Show every row that has at least one missing value.
train_df[train_df.isna().any(axis=1)]

In [None]:
# Frequency of each native-country value, with missing values counted as their own entry.
train_df['native-country'].value_counts(dropna=False)

In [None]:
# Columns plotted as value-count bar charts.
bar_charts = [
    'sex',
    'race',
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'education',
    'education-num',
    'native-country',
]

In [None]:
# Grid layout: fixed number of columns, as many rows as the column list needs
# (at least one, so an empty list still produces a valid grid).
cols = 3
rows = max(1, math.ceil(len(bar_charts) / cols))

# squeeze=False always yields a 2-D array of axes, even for a 1x1 grid,
# so the flatten() below works for any grid size.
fig, axes = plt.subplots(rows, cols, figsize=(10, 4 * rows), squeeze=False)

axes = axes.flatten()   # flatten array of axes for easy indexing

# One bar chart per column; missing values get their own bar (dropna=False)
for i, column in enumerate(bar_charts):
    train_df[column].value_counts(dropna=False).plot(kind='bar', ax=axes[i])
    axes[i].set_title(f"value_counts of {column}")
    axes[i].set_xlabel(column)
    axes[i].set_ylabel("Count")

# Hide any unused subplots. Start from the number of charts drawn rather than
# the loop variable, which does not exist when the list is empty.
for j in range(len(bar_charts), len(axes)):
    axes[j].set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Columns plotted as histograms.
hist_charts = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [None]:
# Grid layout: fixed number of columns, as many rows as the column list needs
# (at least one, so an empty list still produces a valid grid).
cols = 2
rows = max(1, math.ceil(len(hist_charts) / cols))

# squeeze=False always yields a 2-D array of axes, even for a 1x1 grid,
# so the flatten() below works for any grid size.
fig, axes = plt.subplots(rows, cols, figsize=(10, 4 * rows), squeeze=False)

axes = axes.flatten()   # flatten array of axes for easy indexing

# One histogram per column
for i, column in enumerate(hist_charts):
    train_df[column].plot(kind='hist', ax=axes[i])
    axes[i].set_title(f"Histogram of {column}")  # these are histograms, not value counts
    axes[i].set_xlabel(column)
    axes[i].set_ylabel("Count")

# Hide any unused subplots. Start from the number of charts drawn rather than
# the loop variable, which does not exist when the list is empty.
for j in range(len(hist_charts), len(axes)):
    axes[j].set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Display the frame without the 'pred' column. The result is not assigned,
# so train_df itself still contains 'pred' afterwards.
train_df.drop('pred', axis=1)

In [None]:
# Disabled correlation-heatmap cell: the code sits inside a bare string literal, so it does not run.
# NOTE(review): before re-enabling, confirm that (a) .corr() handles the non-numeric columns
# still present after dropping 'pred', and (b) plt.figure(figsize=...) is meant to size the
# heatmap - as written it is called after plt.subplots() and sns.heatmap().
'''
corr_matrix = train_df.drop('pred', axis=1).corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
fig, ax = plt.subplots()
heatmap = sns.heatmap(corr_matrix, cmap='RdBu', linewidths=1, fmt=".1%", annot_kws={"color": "darkgrey", "size": 12}, mask=mask)
plt.figure(figsize=(15,15))
ax.set_title('Correlation Matrix')
plt.show()
'''

In [None]:
# Row counts for each (race, sex) combination.
train_df.value_counts(['race','sex'])

In [None]:
# Row counts per occupation.
# NOTE(review): dropna is not passed here, unlike the bar charts; check whether
# rows with a missing occupation should be included in this count.
train_df.value_counts(['occupation'])