In [1]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from ml_utils import train_test_split_marketing,\
    fill_missing,\
    build_encoders,\
    encode_categorical,\
    build_target_encoder,\
    encode_target

df = pd.read_csv('Walmart.csv')
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [2]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split_marketing(df)
X_train.describe()

KeyError: "['y'] not found in axis"

# Missing Values

In [3]:
# Fill the missing values using the imported function
X_train_filled = fill_missing(X_train)
X_test_filled = fill_missing(X_test)
X_train_filled.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
2544,34,blue-collar,married,secondary,no,328,yes,yes,cellular,21,nov,18,7,-1.0,0,nonexistent
3891,30,technician,married,secondary,no,484,yes,no,unknown,6,may,703,1,-1.0,0,nonexistent
19464,42,entrepreneur,divorced,secondary,no,31,no,no,unknown,18,jun,120,1,-1.0,0,nonexistent
31170,27,management,single,tertiary,no,3,yes,yes,cellular,22,jul,506,1,-1.0,0,nonexistent
22216,30,management,single,secondary,no,19,no,no,cellular,26,aug,191,2,-1.0,0,nonexistent


In [4]:
# Create the encoders for categorical variables (use X_train_filled)
encoders = build_encoders(X_train_filled)
encoders

[{'column': 'job',
  'multi_col_output': True,
  'encoder': OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=5,
                sparse_output=False)},
 {'column': 'marital',
  'multi_col_output': True,
  'encoder': OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)},
 {'column': 'education',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['primary', 'secondary', 'tertiary']],
                 handle_unknown='use_encoded_value', unknown_value=-1)},
 {'column': 'default',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'housing',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'loan',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='us

In [5]:
# Encode X_train_filled and X_test_filled
X_train_encoded = encode_categorical(X_train_filled, encoders)
X_test_encoded = encode_categorical(X_test_filled, encoders)

X_train_encoded.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_management,...,x0_telephone,x0_unknown,x0_nonexistent,x0_other,x0_success,education,default,housing,loan,month
0,34,328,21,18,7,-1.0,0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,10.0
1,30,484,6,703,1,-1.0,0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0
2,42,31,18,120,1,-1.0,0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0
3,27,3,22,506,1,-1.0,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,6.0
4,30,19,26,191,2,-1.0,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0


In [6]:
# Encode y_train and y_test
y_encoder = build_target_encoder(y_train)
y_train_encoded = encode_target(y_train, y_encoder)
y_test_encoded = encode_target(y_test, y_encoder)

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=13)
model.fit(X_train_encoded, y_train_encoded)
y_pred = model.predict(X_test_encoded)
print(balanced_accuracy_score(y_test_encoded, y_pred))

0.5753911268989494


In [8]:
# Import new data and test with the model

new_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_3/datasets/bank_marketing_new_data.csv')

In [9]:
X = new_df.drop(columns='y')
y = new_df['y'].values.reshape(-1, 1)

In [10]:
X_filled = fill_missing(X)
X_encoded = encode_categorical(X_filled, encoders)
y_encoded = encode_target(y, y_encoder)

In [11]:
y_pred = model.predict(X_encoded)
print(balanced_accuracy_score(y_encoded, y_pred))

0.5769138944243607
