In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('melb_data.csv')
y = data.Price
x = data.drop(['Price'], axis=1)

x_train_full, x_test_full, y_train_full, y_test_full = train_test_split(x, y, random_state=0, train_size=0.8, test_size=0.2)

numerical_cols = [cname for cname in x_train_full.columns if x_train_full[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in x_train_full.columns if x_train_full[cname].nunique() < 10 and x_train_full[cname].dtype == 'object']

my_cols = numerical_cols + categorical_cols
x_train = x_train_full[my_cols].copy()
x_test = x_test_full[my_cols].copy()

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)

In [4]:
my_pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

In [5]:
my_pipeline.fit(x_train, y_train_full)
preds = my_pipeline.predict(x_test)

mean_absolute_error(y_test_full, preds)

160679.18917034855

In [6]:
import pandas as pd

data = pd.read_csv('melb_data.csv')
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']

x=data[cols_to_use]
y=data.Price

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

my_pipeline = Pipeline(steps=[
    ('imput', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0))
])

In [8]:
from sklearn.model_selection import cross_val_score

scores = -1 * cross_val_score(my_pipeline, x, y, cv=5, scoring='neg_mean_absolute_error')

print(scores)

[301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('melb_data.csv')
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']

x=data[cols_to_use]
y=data.Price
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [10]:
from xgboost import XGBRegressor

my_model = XGBRegressor(n_estimators=500, early_stopping_rounds=5)
my_model.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    verbose=False
)

In [11]:
from sklearn.metrics import mean_absolute_error

preds = my_model.predict(x_test)
mean_absolute_error(preds, y_test)

248263.85057529455

In [12]:
my_model = XGBRegressor(
    n_estimators=1000,
    early_stopping_rounds=5,
    learning_rate=0.05,
    n_jobs=12
)

my_model.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    verbose=False
)

# AER credit dataset

In [13]:
import pandas as pd

data = pd.read_csv('AER_credit_card_data.csv', true_values=['yes'], false_values=['no'])

In [14]:
data.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,True,0,37.66667,4.52,0.03327,124.9833,True,False,3,54,1,12
1,True,0,33.25,2.42,0.005217,9.854167,False,False,3,34,1,13
2,True,0,33.66667,4.5,0.004156,15.0,True,False,4,58,1,5
3,True,0,30.5,2.54,0.065214,137.8692,False,False,0,25,1,7
4,True,0,32.16667,9.7867,0.067051,546.5033,True,False,2,64,1,5


In [15]:
y = data.card
x = data.drop(['card'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y)

In [16]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   reports      1319 non-null   int64  
 1   age          1319 non-null   float64
 2   income       1319 non-null   float64
 3   share        1319 non-null   float64
 4   expenditure  1319 non-null   float64
 5   owner        1319 non-null   bool   
 6   selfemp      1319 non-null   bool   
 7   dependents   1319 non-null   int64  
 8   months       1319 non-null   int64  
 9   majorcards   1319 non-null   int64  
 10  active       1319 non-null   int64  
dtypes: bool(2), float64(4), int64(5)
memory usage: 95.4 KB


In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
cv_scores = cross_val_score(my_pipeline, x, y, cv=5, scoring='accuracy')

In [18]:
cv_scores.mean()

np.float64(0.9780159004493605)

In [19]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   reports      1319 non-null   int64  
 1   age          1319 non-null   float64
 2   income       1319 non-null   float64
 3   share        1319 non-null   float64
 4   expenditure  1319 non-null   float64
 5   owner        1319 non-null   bool   
 6   selfemp      1319 non-null   bool   
 7   dependents   1319 non-null   int64  
 8   months       1319 non-null   int64  
 9   majorcards   1319 non-null   int64  
 10  active       1319 non-null   int64  
dtypes: bool(2), float64(4), int64(5)
memory usage: 95.4 KB


In [20]:
expenditure_cardholders = x.expenditure[y]
expenditure_noncardholders = x.expenditure[~y]

(expenditure_noncardholders == 0).mean()

np.float64(1.0)

In [21]:
(expenditure_cardholders == 0).mean()

np.float64(0.020527859237536656)

In [22]:
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']

In [44]:
x2 = x.drop(potential_leaks, axis=1)
cv_scores = cross_val_score(my_pipeline, x2, y, cv=5, scoring='accuracy')
cv_scores.mean()

np.float64(0.8309223412835582)