# Data Exploration and Cleanup
### Project 1 Group 4

##### Matthew Lombardo, Brian Klovert, Thusneem Mohamed

In [2]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [3]:
# Load the raw dataset from the project's data folder (path is relative to the notebook).
data = pd.read_csv("data/adult.csv")
# Preview only the first rows; rendering the full frame bloats the notebook output.
data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [4]:
# Number of rows in the raw dataset, before any cleanup.
data.shape[0]

32561

In [5]:
# Frequency of each occupation label, most common first.
data.loc[:, "occupation"].value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

In [6]:
# Frequency of each work-class label, most common first.
data.loc[:, "workclass"].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [7]:
# Rows whose occupation is the "?" placeholder are treated as missing and removed.
# Selecting with a boolean mask works on the values themselves, so it does not
# depend on `data` having the default 0..n-1 index (looking labels up as
# positions via data.index[...] is only correct for that default index).
missing_occupation = data["occupation"] == "?"
null_rows = data.loc[missing_occupation]

# Keep the remaining rows and renumber them from 0 so later cells see a clean index.
data_NoNa = data.loc[~missing_occupation].reset_index(drop=True)
len(data_NoNa)

30718

In [8]:
# Bucket ages into decade groups. pd.cut intervals include their right edge by
# default, so the edges below give (0, 19] -> "<20", (19, 29] -> "20s", and so on,
# up to (79, 100] -> "80s+".
labels = ["<20", "20s", "30s", "40s", "50s", "60s", "70s", "80s+"]
bins = [0, 19, 29, 39, 49, 59, 69, 79, 100]
data_NoNa["Age Group"] = pd.cut(x=data_NoNa["age"], bins=bins, labels=labels)

In [9]:
# Collapse fine-grained category labels into broader groups.
# Every schooling level listed here is relabelled "<HS"; both associate
# degree variants become "Associate".
below_high_school = ["10th", "11th", "12th", "1st-4th", "5th-6th",
                     "7th-8th", "9th", "Preschool"]
education_groups = {
    **dict.fromkeys(below_high_school, "<HS"),
    **dict.fromkeys(["Assoc-acdm", "Assoc-voc"], "Associate"),
}

# All three "Married-*" variants are merged into a single "Married" label.
marital_groups = dict.fromkeys(
    ["Married-AF-spouse", "Married-civ-spouse", "Married-spouse-absent"],
    "Married",
)

data_NoNa["education"] = data_NoNa["education"].replace(education_groups)
data_NoNa["marital.status"] = data_NoNa["marital.status"].replace(marital_groups)

In [10]:
# Keep only the columns carried forward into the cleaned export, in display order.
columns_to_keep = [
    "sex",
    "race",
    "age",
    "Age Group",
    "education",
    "workclass",
    "occupation",
    "marital.status",
    "relationship",
    "income",
]
data_reduced = data_NoNa[columns_to_keep]
data_reduced.head()

Unnamed: 0,sex,race,age,Age Group,education,workclass,occupation,marital.status,relationship,income
0,Female,White,82,80s+,HS-grad,Private,Exec-managerial,Widowed,Not-in-family,<=50K
1,Female,White,54,50s,<HS,Private,Machine-op-inspct,Divorced,Unmarried,<=50K
2,Female,White,41,40s,Some-college,Private,Prof-specialty,Separated,Own-child,<=50K
3,Female,White,34,30s,HS-grad,Private,Other-service,Divorced,Unmarried,<=50K
4,Male,White,38,30s,<HS,Private,Adm-clerical,Separated,Unmarried,<=50K


In [11]:
# Title-cased, space-separated headers for the exported table.
# "Age Group" is absent from the mapping, so it keeps its existing name.
display_names = {
    "sex": "Sex",
    "race": "Race",
    "age": "Age",
    "education": "Education",
    "workclass": "Work Class",
    "occupation": "Occupation",
    "marital.status": "Marital Status",
    "relationship": "Relationship",
    "income": "Income",
}
data_final = data_reduced.rename(display_names, axis="columns")
data_final.head()

Unnamed: 0,Sex,Race,Age,Age Group,Education,Work Class,Occupation,Marital Status,Relationship,Income
0,Female,White,82,80s+,HS-grad,Private,Exec-managerial,Widowed,Not-in-family,<=50K
1,Female,White,54,50s,<HS,Private,Machine-op-inspct,Divorced,Unmarried,<=50K
2,Female,White,41,40s,Some-college,Private,Prof-specialty,Separated,Own-child,<=50K
3,Female,White,34,30s,HS-grad,Private,Other-service,Divorced,Unmarried,<=50K
4,Male,White,38,30s,<HS,Private,Adm-clerical,Separated,Unmarried,<=50K


In [12]:
#data_final

In [13]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
data_final.to_csv(path_or_buf="data/cleaned_project_data.csv", index=False)