## Table of Contents:

### 1. Importing libraries
### 2. Loading Data
### 3. Exploratory Data Analysis 
### 4. Feature Engineering
### 5. Model build
### 6. Model Evaluation

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
col_names = ['Age','WorkClass','Fnlwgt','Education','Education_num','Marital_Status','Occupation',
                 'Relationship','Race','Sex','Capital_gain','Capital_loss','hours_per_week','native_country','Income_grp']

adult = pd.read_csv('/users/johnstonkirimo/projects/weekly_analysis/Week5/data/adult.data', names=col_names, header=None)

In [3]:
adult.head()

Unnamed: 0,Age,WorkClass,Fnlwgt,Education,Education_num,Marital_Status,Occupation,Relationship,Race,Sex,Capital_gain,Capital_loss,hours_per_week,native_country,Income_grp
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#rows and columns

adult.shape

(32561, 15)

In [5]:
#check for missing values

adult.isna().sum()  

Age               0
WorkClass         0
Fnlwgt            0
Education         0
Education_num     0
Marital_Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital_gain      0
Capital_loss      0
hours_per_week    0
native_country    0
Income_grp        0
dtype: int64

In [6]:
#high level statistical summary

adult.describe()

Unnamed: 0,Age,Fnlwgt,Education_num,Capital_gain,Capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
#Get column datatypes

adult.dtypes

Age                int64
WorkClass         object
Fnlwgt             int64
Education         object
Education_num      int64
Marital_Status    object
Occupation        object
Relationship      object
Race              object
Sex               object
Capital_gain       int64
Capital_loss       int64
hours_per_week     int64
native_country    object
Income_grp        object
dtype: object

In [8]:
#check memory usage

adult.memory_usage(deep=True)

Index                 128
Age                260488
WorkClass         2144611
Fnlwgt             260488
Education         2163148
Education_num      260488
Marital_Status    2357874
Occupation        2285844
Relationship      2185486
Race              2068893
Sex               2040324
Capital_gain       260488
Capital_loss       260488
hours_per_week     260488
native_country    2288838
Income_grp        2043502
dtype: int64

In [12]:
#Get average age by sex

adult.groupby('Sex')['Age'].mean()

Sex
 Female    36.858230
 Male      39.433547
Name: Age, dtype: float64

In [13]:
#Average years of education by sex

adult.groupby('Sex')['Education_num'].mean()

Sex
 Female    10.035744
 Male      10.102891
Name: Education_num, dtype: float64

In [14]:
#Avg years of education by sex and race

ed_years = adult.groupby(['Sex','Race']).Education_num.mean().reset_index(name='school_yrs')
ed_years

Unnamed: 0,Sex,Race,school_yrs
0,Female,Amer-Indian-Eskimo,9.697479
1,Female,Asian-Pac-Islander,10.390173
2,Female,Black,9.549839
3,Female,Other,8.899083
4,Female,White,10.12798
5,Male,Amer-Indian-Eskimo,9.072917
6,Male,Asian-Pac-Islander,11.24531
7,Male,Black,9.423199
8,Male,Other,8.802469
9,Male,White,10.138521


In [15]:
ed_years_race = pd.pivot_table(ed_years,index=['Race'],values=['school_yrs'], columns=['Sex'])
print(ed_years_race)

                    school_yrs           
Sex                     Female       Male
Race                                     
 Amer-Indian-Eskimo   9.697479   9.072917
 Asian-Pac-Islander  10.390173  11.245310
 Black                9.549839   9.423199
 Other                8.899083   8.802469
 White               10.127980  10.138521


In [21]:
#Get % distribution of peoples' native country 

adult.native_country.value_counts(normalize=True)

 United-States                 0.895857
 Mexico                        0.019748
 ?                             0.017905
 Philippines                   0.006081
 Germany                       0.004207
 Canada                        0.003716
 Puerto-Rico                   0.003501
 El-Salvador                   0.003255
 India                         0.003071
 Cuba                          0.002918
 England                       0.002764
 Jamaica                       0.002488
 South                         0.002457
 China                         0.002303
 Italy                         0.002242
 Dominican-Republic            0.002150
 Vietnam                       0.002058
 Guatemala                     0.001966
 Japan                         0.001904
 Poland                        0.001843
 Columbia                      0.001812
 Taiwan                        0.001566
 Haiti                         0.001351
 Iran                          0.001321
 Portugal                      0.001136


### Next Steps:
  - Complete Script:
    - Continue with exploratory data analysis
    - Feature engineering
    - model build
    - model testing/evaluation