In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib


In [2]:
data = pd.read_csv('data/brazil_cities.csv', sep=';', decimal=',')

In [3]:
data.head(5)

Unnamed: 0,CITY,STATE,CAPITAL,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_RES_POP_ESTR,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,...,Pu_Bank,Pr_Assets,Pu_Assets,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART,POST_OFFICES
0,São Paulo,SP,1,11253503.0,11133776.0,119727.0,3576148.0,3548433.0,27715.0,10463636.0,...,8.0,19470770000000.0,2893261000000.0,5740995.0,1134570.0,3236.0,1.0,130.0,7.0,225.0
1,Osasco,SP,0,666740.0,664447.0,2293.0,202009.0,202009.0,,616068.0,...,2.0,6732330000000.0,13216990000.0,283641.0,73477.0,174.0,,7.0,1.0,10.0
2,Rio De Janeiro,RJ,1,6320446.0,6264915.0,55531.0,2147235.0,2147235.0,,5426838.0,...,5.0,2283445000000.0,973886400000.0,2039930.0,363486.0,289.0,1.0,68.0,1.0,120.0
3,Brasília,DF,1,2570160.0,2564370.0,5790.0,774820.0,751558.0,23261.0,2450634.0,...,4.0,292093300000.0,8016164000000.0,1288107.0,211392.0,594.0,1.0,28.0,1.0,60.0
4,Porto Alegre,RS,1,1409351.0,1403450.0,5901.0,508503.0,508503.0,,1339712.0,...,3.0,270961900000.0,402699700000.0,608777.0,106829.0,1383.0,1.0,19.0,12.0,53.0


In [4]:
data.shape

(5576, 81)

In [5]:
data.isna().sum().sort_values()

CITY                    0
STATE                   0
CAPITAL                 0
AREA                    2
COMP_D                  3
COMP_C                  3
COMP_B                  3
COMP_A                  3
COMP_TOT                3
GDP_CAPITA              3
COMP_E                  3
GDP                     3
TAXES                   3
 GVA_TOTAL              3
GVA_PUBLIC              3
GVA_MAIN                3
COMP_F                  3
COMP_H                  3
GVA_SERVICES            3
COMP_I                  3
COMP_J                  3
COMP_K                  3
COMP_L                  3
COMP_M                  3
COMP_N                  3
COMP_O                  3
COMP_P                  3
COMP_Q                  3
COMP_R                  3
COMP_S                  3
                     ... 
IBGE_1-4                8
IDHM                    8
IDHM_Renda              8
IDHM_Longevidade        8
IDHM_Educacao           8
IDHM Ranking 2010       8
LONG                    9
LAT         

In [6]:
keep = ['STATE', 'CAPITAL', 'IBGE_RES_POP', 'IDHM_Longevidade', 'IDHM_Educacao', 'GDP', 'COMP_TOT', 'CATEGORIA_TUR']

In [7]:
df = data[keep]

In [8]:
df = df.rename(columns={'STATE': 'state', 'CAPITAL': 'capital', 'IBGE_RES_POP': 'population',
                'IDHM_Longevidade': 'life_expectancy', 'IDHM_Educacao': 'education_index',
                'GDP': 'gdp', 'COMP_TOT': 'num_companies',
                'CATEGORIA_TUR': 'tourism_category',})

In [9]:
df.dtypes

state                object
capital               int64
population          float64
life_expectancy     float64
education_index     float64
gdp                 float64
num_companies       float64
tourism_category     object
dtype: object

In [10]:
df.isna().sum()

state                  0
capital                0
population             8
life_expectancy        8
education_index        8
gdp                    3
num_companies          3
tourism_category    2288
dtype: int64

In [11]:
df = df.dropna(subset=['population']) # Drop all NaN in target column

# Train Test Split

In [12]:
y = df['population']
features = [col for col in df if col != 'population']

In [13]:
X = df[features]

In [14]:
X.head()

Unnamed: 0,state,capital,life_expectancy,education_index,gdp,num_companies,tourism_category
0,SP,1,0.855,0.725,687035900.0,530446.0,A
1,SP,0,0.84,0.718,74402690.0,15315.0,B
2,RJ,1,0.845,0.719,329431400.0,190038.0,A
3,DF,1,0.873,0.742,235497100.0,86200.0,A
4,RS,1,0.857,0.702,73425.26,80082.0,A


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4454, 7)
(1114, 7)
(4454,)
(1114,)


# Classify Columns into Categorical and Numerical

In [17]:
cat_cols = ['state', 'capital', 'tourism_category']
num_cols = ['life_expectancy', 'education_index', 'gdp', 'num_companies']

# Build a Pipeline for Categorical Columns

In [18]:
cat_cols

['state', 'capital', 'tourism_category']

In [19]:
X_train_cat = X_train[cat_cols]
X_train_cat.head(3)

Unnamed: 0,state,capital,tourism_category
2285,BA,0,
4169,GO,0,
427,SP,0,A


In [20]:
X_train_cat.isna().sum()

state                  0
capital                0
tourism_category    1845
dtype: int64

In [21]:
si = SimpleImputer(strategy='constant', fill_value='other')

si.fit(X_train_cat)
X_train_cat_si = si.transform(X_train_cat)

In [22]:
ohe = OneHotEncoder(sparse=False)

X_train_cat_ohe = ohe.fit_transform(X_train_cat_si)

In [23]:
steps = [
    ('impute', si),
    ('ohe', ohe)
]

cat_pipe = Pipeline(steps=steps)
cat_pipe.fit(X_train_cat, y_train)

Pipeline(memory=None,
     steps=[('impute', SimpleImputer(copy=True, fill_value='other', missing_values=nan,
       strategy='constant', verbose=0)), ('ohe', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=False))])

# Build a Pipeline for Numerical Columns

In [24]:
num_cols

['life_expectancy', 'education_index', 'gdp', 'num_companies']

In [25]:
X_train_num = X_train[num_cols]
X_train_num.head(3)

Unnamed: 0,life_expectancy,education_index,gdp,num_companies
2285,0.775,0.512,280207.78,352.0
4169,0.814,0.583,73902.48,59.0
427,0.828,0.706,1115003.39,1974.0


In [26]:
si = SimpleImputer(strategy='mean')
ss = StandardScaler()

steps = [
    ('impute', si),
    ('ss', ss)
]

num_pipe = Pipeline(steps=steps)
num_pipe.fit(X_train_num, y_train)

Pipeline(memory=None,
     steps=[('impute', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True))])

# Use ColumnTransformer to Concatenate all Data Together

In [27]:
transformers = [
    ('cat', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols)
]

ct = ColumnTransformer(transformers=transformers)

In [28]:
X_train_trans = ct.fit_transform(X_train)

In [29]:
X_train_trans.shape

(4454, 39)

# Create a Final Pipeline with Machine Learning Model

In [30]:
lr = LinearRegression()

final_steps = [
    ('transformer', ct),
    ('model', lr)
]

pipe = Pipeline(steps=final_steps)

In [31]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('transformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat', Pipeline(memory=None,
     steps=[('impute', SimpleImputer(copy=True, fill_value='other', missing_values=nan,
       strategy='constant', ve...('model', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [34]:
pipe.score(X_train, y_train)

0.9669453369218541

In [32]:
pipe.score(X_test, y_test)

0.8441428165171938