In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
import warnings
warnings.filterwarnings('ignore')

from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error,classification_report,roc_auc_score,roc_curve
from sklearn.model_selection import learning_curve, validation_curve
from scipy.stats import randint,zscore,skew,kurtosis
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

### Import Data

In [9]:
def data_import():
    df_dish = pd.read_csv('dish.csv')
    df_dish1 = df_dish[(df_dish['price'] > 0) & (df_dish['price'] < 100)]
    df_order = pd.read_csv('order.csv')
    kitchen_orders = df_order.groupby('kitchen_id').size().reset_index(name='kitchen_orders')
    df_dish2 = pd.merge(df_dish1, kitchen_orders, on='kitchen_id', how='left')
    df_dish3 = df_dish2.drop(columns =['index','dish_id','kitchen_id','name','tags','description','check_time','refusal_ground','create_time',\
                                   'update_time','speech_time','dish_feature','materials','net_price'], axis=1).fillna(0)
    return df_dish3


### Setup Models

In [15]:
def split_data(df):
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    X_train = train.drop(['price'], axis=1)
    y_train = train['price']

    X_test = test.drop(['price'], axis=1)
    y_test = test['price'] 
    return X_train, y_train, X_test, y_test

In [16]:
def evaluation(x,y,model):
    price_predictions = model.predict(x)
    mse = mean_squared_error(y, price_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, price_predictions)
    r2 = r2_score(y, price_predictions)
    return 'RMSE:{}'.format(rmse), 'MAE:{}'.format(mae),'R-square:{}'.format(r2)

In [29]:
def fit_model(X_train,y_train, X_test, y_test):
    pca = PCA(n_components=17)
    
    fit_train = pca.fit(X_train)
    reduced_features = pca.transform(X_train)
    reduced_features = pd.DataFrame(reduced_features)
    forest_reg = RandomForestRegressor(random_state=42,max_features=12, n_estimators=100,max_depth=10).fit(X_train, y_train)
    
    X_train = reduced_features
    forest_reg.fit(X_train, y_train)
    print('in-sample evaluation:{}'.format(evaluation(X_train, y_train, forest_reg)))
    
    fit_test = pca.fit(X_test)
    reduced_features_test = pca.transform(X_test)
    reduced_features_test = pd.DataFrame(reduced_features_test)
    X_test = reduced_features_test
    forest_reg.fit(X_test, y_test)
    print('out-sample evaluation:{}'.format(evaluation(X_test, y_test, forest_reg)))
    

In [30]:
df_dish = data_import()
X_train, y_train, X_test, y_test = split_data(df_dish)

In [31]:
fit_model(X_train,y_train, X_test, y_test)

in-sample evaluation:('RMSE:9.026749938201245', 'MAE:6.007056088815333', 'R-square:0.23116644642595197')
out-sample evaluation:('RMSE:8.80354058768713', 'MAE:5.901518737961149', 'R-square:0.2707428371266908')
