In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

# Load the insurance table; 'charges' is the regression target and every
# other column is used as a feature.
data = pd.read_csv('./data/insurance.csv')
y = data[['charges']]
X = data.drop(columns=['charges']).copy()

# Hold out 20% of the rows for evaluation. The splitter is handed a copy of
# the feature frame so that X itself is never touched by later in-place
# column edits on the splits.
features = X.copy()
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)

# Replace every string-typed (object dtype) feature column with integer codes.
#
# The encoder is fitted on the training split only and the *same* fitted
# encoder is then applied to the test split, so a given category always maps
# to the same integer in both frames. Fitting an independent encoder on the
# test split would silently assign different codes whenever the two splits
# do not contain exactly the same set of categories.
#
# LabelEncoder.transform raises ValueError if the test split contains a
# category that was never seen in training, which surfaces the problem
# instead of producing misaligned features.
for col in X_train.select_dtypes(include='object').columns:
    encoder = LabelEncoder().fit(X_train[col])
    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

# Keyword arguments shared by the scikit-learn ensemble regressors below.
param = dict(random_state=42)

# Candidate regressors keyed by the short name printed in the score report.
# Insertion order is the order in which they are fitted.
models = dict(
    xgb=XGBRegressor(n_estimators=1000, max_depth=100),
    lasso=Lasso(),
    lr=LinearRegression(),
    rf=RandomForestRegressor(**param),
    ada=AdaBoostRegressor(**param),
    bgr=GradientBoostingRegressor(**param),
)

# Fit every candidate on the training split, score it on the held-out split
# with R^2, and keep (name, fitted model, predictions, score) for ranking.
results = []
for name, model in models.items():
    # fit() returns the estimator itself, so the fitted model is rebound here.
    model = model.fit(X_train, y_train['charges'])
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    print(f'{name} : {score}')
    results.append((name, model, y_pred, score))

# Rank in place from best to worst R^2 (element 3 of each tuple).
results.sort(key=lambda entry: entry[3], reverse=True)

# Write the held-out predictions of the top-scoring model to disk.
best_predictions = results[0][2]
result = pd.DataFrame({'charges': best_predictions})
result.to_csv('00000.csv', index=False)