# Workshop 1 - Data Preprocessing

### The objective of the workshop is to learn how to manipulate data, and how to apply common data pre-processing techniques in order to improve model performance without changing the algorithm used.

#### We will be using the famous "Adult" dataset from the public UCI repository. The dataset consists of information on various individuals, including age, education, marital status, gender, and income.

In [1]:
# Bread and butter libraries to deal with dataframes and matrices
import numpy as np
import pandas as pd

##### Machine learning models cannot deal with null values, so flag placeholder strings (here '#NAME?') as missing values while loading the CSV file into a dataframe

In [23]:
# Load the Adult dataset; cells containing '#NAME?' are read as missing values (NaN).
path = 'Datasets/'
na_values = ['#NAME?']
df = pd.read_csv(filepath_or_buffer=path + 'adult.csv', na_values=na_values)

In [24]:
# Preview the first 10 rows of the loaded dataframe.
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


#### Problem: Predict, based on various features, if someone's income is greater or less than 50k

In [25]:
# Check the class balance of the target. When an unbalanced dataset is not
# handled properly, performance metrics can be very misleading.
df.income.value_counts()

<=50K    3779
>50K     1221
Name: income, dtype: int64

In [26]:
# Binary-encode the target: '<=50K' becomes 0, every other value becomes 1.
# The boolean mask "label is not '<=50K'" cast to integers gives exactly that.
df['income'] = (df['income'] != '<=50K').astype('int64')

In [27]:
# Show the first 10 values of the income column after encoding.
df.income.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    1
8    1
9    1
Name: income, dtype: int64

### Benchmark Performance with unprocessed data

In [28]:
# Discard every row that contains at least one missing value so the model
# doesn't throw errors, then compare the shapes before and after.
df_unprocessed = df.dropna(how='any')
print(df.shape)
print(df_unprocessed.shape)

(5000, 15)
(4496, 15)


In [29]:
# Keep only the numeric columns so the model doesn't throw errors.
# Potential loss of information from the discarded categorical features is evident.
# select_dtypes replaces the column-by-column loop, which passed the axis to
# drop() positionally -- a call form that newer pandas versions reject.
df_unprocessed = df_unprocessed.select_dtypes(
    include=['int32', 'int64', 'float32', 'float64'])

In [30]:
# Split into features and outcomes.
# The axis is given by keyword: newer pandas versions no longer accept it as a
# positional argument of drop().
X_unprocessed = df_unprocessed.drop('income', axis=1)
y_unprocessed = df_unprocessed.income

In [31]:
# Preview the first 5 rows of the numeric-only feature matrix.
X_unprocessed.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,39.0,77516.0,13.0,2174,0,40
1,50.0,83311.0,13.0,0,0,13
2,38.0,215646.0,9.0,0,0,40
4,28.0,338409.0,13.0,0,0,40
5,37.0,284582.0,14.0,0,0,40


#### Import a classification algorithm to measure baseline accuracy

In [35]:
# Import common ML tools from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Split data into train and test sets

In [34]:
# Hold out 30% of the rows for testing. The seed is fixed so the split, and
# therefore the benchmark accuracy, is reproducible from run to run.
X_train_unproc, X_test_unproc, y_train, y_test = train_test_split(
    X_unprocessed, y_unprocessed, train_size=0.70, random_state=42)



In [36]:
# Helper that trains a classifier and reports its accuracy
def find_model_perf(X_train, y_train, X_test, y_test):
    """Fit a default LogisticRegression and return its test accuracy.

    Parameters:
        X_train: features used to fit the model.
        y_train: labels used to fit the model.
        X_test: features the fitted model predicts on.
        y_test: true labels compared against the predictions.

    Returns:
        The value of accuracy_score(y_test, predictions).
    """
    classifier = LogisticRegression().fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))

In [38]:
# Baseline accuracy of the model trained on the unprocessed features.
acc_unprocessed = find_model_perf(
    X_train=X_train_unproc,
    y_train=y_train,
    X_test=X_test_unproc,
    y_test=y_test,
)

In [39]:
# Display the accuracy obtained with the unprocessed features.
acc_unprocessed

0.78799110452186805

### NOTE: train and test features must be in the same format - if the training features are processed (or unprocessed), the test features must be too

### Now, apply some data pre-processing on the dataset, then feed processed data into model

In [None]:
# Assign outcome as 0 if income <=50K and as 1 if income >50K.
# Very simple form of encoding categorical data.
# Only encode while the column still holds the raw labels: once it is numeric,
# no value equals '<=50K', so encoding it again would set every row to 1.
if not pd.api.types.is_numeric_dtype(df['income']):
    df['income'] = [0 if x == '<=50K' else 1 for x in df['income']]

# Split dataframe into features and target variable.
# The axis is given by keyword: newer pandas versions no longer accept it as a
# positional argument of drop().

y = df.income
X = df.drop(['income'], axis=1)



In [None]:
# Preview the first 10 values of the target variable.
y.head(10)

In [None]:
# Preview the first 10 rows of the feature matrix.
X.head(10)

## Clean Data

In [None]:

# Use get_dummies in pandas to turn a categorical column into indicator columns.
# Another option: OneHotEncoder in scikit-learn.
print(pd.get_dummies(X.education).head())

In [None]:

# Report the number of distinct categories in every text (object) column to
# help decide which categorical variables to use in the model.
for col_name in X.columns:
    if not X[col_name].dtypes == 'object':
        continue
    # Missing values count as one category, as they do with unique().
    unique_cat = X[col_name].nunique(dropna=False)
    message = "Feature '{col_name}' has {unique_cat} unique categories"
    print(message.format(col_name=col_name, unique_cat=unique_cat))

In [None]:
# 'native_country' has a lot of unique categories, but most of them only have
# a few observations. Print the 10 most frequent ones.
print(
    X.native_country
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# In this case, bucket low-frequency categories as "Other".
# The kept label is written exactly like the original value (no trailing
# space), so re-running this cell does not turn every row into "Other".
# Note: a missing value is not equal to 'United-States', so it also becomes "Other".
X['native_country'] = ['United-States' if x == 'United-States' else 'Other' for x in X['native_country']]

print(X['native_country'].value_counts().sort_values(ascending=False))

In [None]:

# Names of the categorical features to convert into dummy variables.
todummy_list = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]