# Linear regression for Census dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import pandas_profiling

In [2]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'y']

In [3]:
adult_train = pd.read_csv('Census/adult.data', header = None)
adult_test  = pd.read_csv('Census/adult.test', header = None)
adult_train.columns = cols
adult_test.columns = cols
adult_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Missing values imputation

We're trying to predict missing values in `capital_gain` column using other columns.

In [4]:
# remove whitespaces from the beginning of categorical values
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [5]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
adult_test[['capital_loss']] = numeric_imputer.fit_transform(adult_test[['capital_loss']])
adult_train[['capital_loss']] = numeric_imputer.fit_transform(adult_train[['capital_loss']])

In [6]:
# categorical imputation
categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', 'native_country']])
adult_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', 'native_country']])

In [7]:
adults = adult_train.append(adult_test, sort=False)

## Transformation of categorical values to numerical ones 

In [8]:
adults = pd.get_dummies(adults, columns= ['workclass','education', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'native_country', 'y'])

## Split datasets

In [9]:
train = adults[adults['capital_gain'] != 0]
unknown = adults[adults['capital_gain'] == 0]

In [10]:
y_train = train['capital_gain']
x_train = train.loc[:, train.columns != 'capital_gain']
y_unknown = unknown['capital_gain']
x_unknown = unknown.loc[:, unknown.columns != 'capital_gain']

## Train model

In [11]:
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

## Predict missing values

In [12]:
y_unknown = model.predict(x_unknown)

In [13]:
unknown['capital_gain'] = y_unknown

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
adults_df = train.append(unknown, ignore_index=True)

In [15]:
# replace negarive values with zero

mask = adults_df['capital_gain'] < 0

adults_df.loc[mask, 'capital_gain'] = 0