### Refer to the price of dishes with exactly the same names

In [13]:
def get_data(price_bound=100):
    df = pd.read_csv('./data/dish_info.csv')
    df.drop('index', inplace=True,axis=1)
    
    # get rid of outliers
    df = df[pd.notnull(df['name'])]
    df = df[df['price'] <= price_bound]
    
    # get rid of invalid names
    df['name'] = df['name'].apply(process_names)
    df = df[df['name'].str.len() != 0]
    df = df[~df['name'].str.isnumeric()]
    
#     # tokenize the names of the dishes
#     start = time.time()
#     df['name'] = df['name'].apply(tokenize)
#     print('used {:0.2f} seconds to tokenize the names'.format(time.time() - start))
    
    # split the dataframe into training and testing
    train, test = train_test_split(df, test_size=0.2, random_state=0)
    
    return train, test

In [14]:
train, test = get_data()

In [15]:
train_price_df = train.groupby('kitchen_id').price.mean().reset_index()

In [18]:
predictions = pd.merge(train[['kitchen_id']], train_price_df, on='kitchen_id', how='left').fillna(train.price.mean())['price']

In [19]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(train['price'], predictions)
rmse = np.sqrt(mse)

In [20]:
print(rmse)

9.23826442980105


In [24]:
train['pred'] = predictions
train[['name', 'price', 'pred']]

Unnamed: 0,name,price,pred
188538,爆炒肥肠,25,20.063380
126590,韩式辣炒年糕,20,19.583333
278352,米饭,1,20.400000
254028,番茄炒有机菜花,15,25.071895
273036,鲜虾豆腐煲,22,18.750000
239941,土豆烧牛肉,35,17.222222
234548,蒜蓉炒西兰花,18,20.181818
36343,双拼红烧鸡翅配香菇菜心,20,21.125000
147634,可乐鸡翅,25,19.964286
197916,芹菜香干炒肉,20,20.692982


In [53]:
train_price_df = train.groupby('name').price.mean().reset_index()

In [54]:
predictions = pd.merge(train[['name']], train_price_df, on='name', how='left').fillna(train.price.mean())['price']

In [55]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(train['price'], predictions)
rmse = np.sqrt(mse)

In [56]:
print(rmse)

4.169272572961129


### Refer to the mean price of every tokens

In [3]:
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
import pandas as pd
import numpy as np

In [5]:
import jieba
import jieba.posseg as pseg

In [8]:
def tokenize(name_string, only_noun=True):
    w_pair = pseg.cut(str(name_string))
    tokens = [(word, tag) for word, tag in w_pair]
    if only_noun:
            tokens = [wt[0] for wt in tokens if wt[1].startswith('n')]
    return tokens

In [9]:
def process_names(name):
    return ''.join([i for i in name if i.isalpha()])

In [10]:
def build_vocab(names, min_freq=10):
    vocab = {}
    for tokens in names:
        for t in tokens:
            if t in vocab:
                vocab[t] += 1
            else:
                vocab[t] = 1

    keys = list(vocab.keys())

    for k in keys:
        if len(k) < 2:
            del vocab[k]

        if k in vocab:
            if vocab[k] < min_freq:
                del [vocab[k]]
                
    return vocab

In [11]:
def build_price_dict(df, vocab):
    price_dict = {}
    for index, row in df.iterrows():
        for t in row['name']:
            if t in vocab:
                price_dict[t] = price_dict.get(t, 0) + row['price']
                
    return price_dict

In [12]:
def get_data(price_bound=100):
    df = pd.read_csv('./data/dish_info.csv')
    df.drop('index', inplace=True,axis=1)
    
    # get rid of outliers
    df = df[pd.notnull(df['name'])]
    df = df[df['price'] <= price_bound]
    
    # get rid of invalid names
    df['name'] = df['name'].apply(process_names)
    df = df[df['name'].str.len() != 0]
    df = df[~df['name'].str.isnumeric()]
    
    # tokenize the names of the dishes
    start = time.time()
    df['name'] = df['name'].apply(tokenize)
    print('used {:0.2f} seconds to tokenize the names'.format(time.time() - start))
    
    # split the dataframe into training and testing
    train, test = train_test_split(df, test_size=0.2, random_state=0)
    
    # build vocab using the dish names in train
    names = list(train['name'])
    vocab = build_vocab(names, min_freq=5)
    
    # get the summation of price for all tokens
    start = time.time()
    price = build_price_dict(train, vocab)
    print('used {:0.2f} seconds to get tokens prices'.format(time.time() - start))
    
    # build reference dataframe
    df_outcome = pd.DataFrame({'name':list(vocab.keys()), 'count': list(vocab.values()), 'price_sum':list(price.values())})
    df_outcome['mean_price'] = df_outcome['price_sum'] / df_outcome['count']
    
    return train, test, df_outcome

In [71]:
train, test, df_outcome = get_data()
df_outcome.head()

used 112.84 seconds to tokenize the names
used 29.48 seconds to get tokens prices


Unnamed: 0,name,count,price_sum,mean_price
0,肥肠,620,18402,29.680645
1,韩式辣,39,924,23.692308
2,炒年糕,369,8040,21.788618
3,米饭,4620,19218,4.15974
4,番茄,3938,74148,18.828847


In [72]:
token_price_dict = dict(zip(df_outcome.name, df_outcome.mean_price))

In [73]:
def get_price(row):
    price_list = [token_price_dict[t] for t in row if t in token_price_dict]
    if len(price_list) == 0:
        return 0
    else:
        return sum(price_list) / len(price_list)

In [74]:
price_predictions = test['name'].apply(get_price)

In [75]:
price_predictions[price_predictions == 0] = train.price.mean()

In [76]:

mse = mean_squared_error(test['price'], price_predictions)
rmse = np.sqrt(mse)

In [77]:
print(rmse)

8.133305770399332


In [78]:
test['pred'] = price_predictions

In [83]:
res_df = test[(price_predictions - test['price']) ** 2 > 64][['name', 'price', 'pred']]

In [91]:
res_df[res_df.name.apply(lambda x: x == ['炸鸡'])]

Unnamed: 0,name,price,pred
182506,[炸鸡],12,31.995215
162508,[炸鸡],23,31.995215
88730,[炸鸡],10,31.995215
80467,[炸鸡],22,31.995215
118849,[炸鸡],13,31.995215
331238,[炸鸡],22,31.995215
293542,[炸鸡],22,31.995215


In [93]:
df_outcome.head(10)

Unnamed: 0,name,count,price_sum,mean_price
0,肥肠,620,18402,29.680645
1,韩式辣,39,924,23.692308
2,炒年糕,369,8040,21.788618
3,米饭,4620,19218,4.15974
4,番茄,3938,74148,18.828847
5,有机,1665,32220,19.351351
6,菜花,2193,42627,19.437756
7,鲜虾,293,8980,30.648464
8,豆腐,9208,170738,18.542354
9,土豆,7541,182007,24.135658


### Kitchen matching

### kitchen mean

In [6]:
def get_data(price_bound=100):
    df = pd.read_csv('./data/dish_info.csv')
    df.drop('index', inplace=True,axis=1)
    
    # get rid of outliers
    df = df[pd.notnull(df['name'])]
    df = df[df['price'] <= price_bound]
    
    # get rid of invalid names
    df['name'] = df['name'].apply(process_names)
    df = df[df['name'].str.len() != 0]
    df = df[~df['name'].str.isnumeric()]
    
#     # tokenize the names of the dishes
#     start = time.time()
#     df['name'] = df['name'].apply(tokenize)
#     print('used {:0.2f} seconds to tokenize the names'.format(time.time() - start))
    
    # split the dataframe into training and testing
    train, test = train_test_split(df, test_size=0.2, random_state=0)
    
    return train, test

In [7]:
train, test = get_data()

In [106]:
kitchen_mean = train.groupby('kitchen_id')['price'].mean().reset_index().rename(columns={'price':'kitchen_mean'})

In [116]:
temp = pd.merge(train, kitchen_mean, on='kitchen_id', how='left')

In [117]:
temp[['price', 'kitchen_mean']].head()

Unnamed: 0,price,kitchen_mean
0,25,28.71875
1,20,20.2
2,1,18.925926
3,15,20.272727
4,22,18.692308


In [118]:
temp.kitchen_mean.fillna(train.price.mean(), inplace=True)

In [119]:
mse = mean_squared_error(train['price'], temp['kitchen_mean'])
rmse = np.sqrt(mse)

In [120]:
print(rmse)

9.23826442980105


### other kitchen info with kitchen mean

In [15]:
train[train.is_recommend == 0]['price'].mean()

20.642566530194472

In [16]:
train[train.is_recommend == 1]['price'].mean()

23.924027026639315

In [18]:
kitchen_df = pd.read_csv('./data/kitchen_info.csv')

In [21]:
kitchen_df['city_id'].value_counts()

110100    10718
310100     2971
440300     1600
440100      683
330100      587
420100        2
Name: city_id, dtype: int64

In [22]:
kitchen_df['staple_price'].value_counts()

1    12621
2     2731
3     1128
5       59
4       22
Name: staple_price, dtype: int64

In [23]:
kitchen_df['like_num'].describe()

count    16561.000000
mean        14.233440
std         33.504996
min          0.000000
25%          0.000000
50%          1.000000
75%         14.000000
max        784.000000
Name: like_num, dtype: float64

In [37]:
kitchen = kitchen_df[['kitchen_id', 'city_id', 'staple_price', 'like_num']]

In [31]:
kitchen_mean = train.groupby('kitchen_id')['price'].mean().reset_index().rename(columns={'price':'kitchen_mean'})

In [32]:
train_k = pd.merge(train, kitchen_mean, on='kitchen_id', how='left')

In [33]:
test_k = pd.merge(test, kitchen_mean, on='kitchen_id', how='left')

In [44]:
temp_train = pd.merge(train_k, kitchen, on='kitchen_id', how='left')[['kitchen_id', 'kitchen_mean', 'city_id', 'staple_price_x', 'staple_price_y', 'like_num', ]]

In [45]:
temp_test = pd.merge(test_k, kitchen, on='kitchen_id', how='left')[['kitchen_id', 'kitchen_mean', 'city_id', 'staple_price_x', 'staple_price_y', 'like_num', ]]

In [48]:
temp_train.drop('staple_price_x', axis=1, inplace=True)

In [47]:
temp_test.drop('staple_price_x', axis=1, inplace=True)

In [76]:
temp_test.kitchen_mean.fillna(train.price.mean(), inplace=True)

In [100]:
np.array(temp_train['kitchen_mean']).reshape(-1, 1)

array([[28.71875   ],
       [20.2       ],
       [18.92592593],
       ...,
       [20.11764706],
       [28.89655172],
       [20.55714286]])

In [110]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=6,
                             random_state=0)
clf.fit(np.array(temp_train['kitchen_mean']).reshape(-1, 1), train_k['price'])

predictions = clf.predict(np.array(temp_test['kitchen_mean']).reshape(-1, 1))

In [113]:
mse = mean_squared_error(clf.predict(np.array(temp_train['kitchen_mean']).reshape(-1, 1)), train['price'])
rmse = np.sqrt(mse)

In [114]:
print(rmse)

9.9201694387853


In [124]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.naive_bayes import MultinomialNB

# reg = LinearRegression().fit(temp_train, train['price'])
reg = Lasso(alpha=0.3).fit(temp_train, train['price'])
# reg = MultinomialNB().fit(temp_train, train['price'])

predictions = reg.predict(temp_test)

mse = mean_squared_error(predictions, test['price'])
rmse = np.sqrt(mse)

In [125]:
print(rmse)

9.851364515957638


In [194]:
df = pd.read_csv('./data/dish_info.csv')
df.drop('index', inplace=True,axis=1)

# get rid of outliers
df = df[pd.notnull(df['name'])]
df = df[df['price'] <= 100]

# get rid of invalid names
df['name'] = df['name'].apply(process_names)
df = df[df['name'].str.len() != 0]
df = df[~df['name'].str.isnumeric()]

train, test = train_test_split(df, test_size=0.2, random_state=0)

temp = train.groupby('kitchen_id')['price'].mean().reset_index()

kitchen_status = pd.merge(kitchen_df, temp, on='kitchen_id', how='left')[['kitchen_id', 'city_id', 'staple_price', 'like_num', 'price']]

In [195]:
len(kitchen_status)

16561

In [196]:
kitchen_status.dropna(inplace=True)
len(kitchen_status)

16508

In [197]:
kitchen_status.groupby('staple_price')['price'].mean()

staple_price
1    21.143676
2    21.643201
3    22.927295
4    24.030994
5    28.202615
Name: price, dtype: float64

In [198]:
kitchen_status.groupby('city_id')['price'].mean()

city_id
110100    21.692270
310100    20.842142
330100    19.711258
420100     9.333333
440100    20.781189
440300    21.134903
Name: price, dtype: float64

In [200]:
kitchen_mean = train.groupby('kitchen_id')['price'].mean().reset_index().rename(columns={'price':'kitchen_mean'})

In [201]:
df_new = pd.merge(kitchen_mean, kitchen_df, on='kitchen_id', how='left')[['kitchen_id', 'city_id', 'staple_price', 'kitchen_mean']]

In [202]:
df_new.kitchen_mean.mean()

21.376152468888094

In [203]:
df_new.groupby('staple_price')['kitchen_mean'].mean() - df_new.kitchen_mean.mean()

staple_price
1   -0.232477
2    0.267048
3    1.551142
4    2.654841
5    6.826462
Name: kitchen_mean, dtype: float64

In [204]:
df_new.groupby('city_id')['kitchen_mean'].mean() - df_new.kitchen_mean.mean()

city_id
110100     0.316118
310100    -0.534011
330100    -1.664895
420100   -12.042819
440100    -0.594963
440300    -0.241249
Name: kitchen_mean, dtype: float64

In [205]:
df_new.loc[df_new['city_id'] == 110100, ['kitchen_mean']] += 0.316118
df_new.loc[df_new['city_id'] == 310100, ['kitchen_mean']] -= 0.534011
df_new.loc[df_new['city_id'] == 330100, ['kitchen_mean']] -= 1.664895
df_new.loc[df_new['city_id'] == 420100, ['kitchen_mean']] -= 12.042819
df_new.loc[df_new['city_id'] == 440100, ['kitchen_mean']] -= 0.594963
df_new.loc[df_new['city_id'] == 440300, ['kitchen_mean']] -= 0.241249

In [206]:
price_modified_by_city_only = df_new.kitchen_mean

In [212]:
df_new.loc[df_new['staple_price'] == 1, ['kitchen_mean']] -= 0.232477
df_new.loc[df_new['staple_price'] == 2, ['kitchen_mean']] -= 0.267048
df_new.loc[df_new['staple_price'] == 3, ['kitchen_mean']] -= 1.551142
df_new.loc[df_new['staple_price'] == 4, ['kitchen_mean']] -= 2.654841
df_new.loc[df_new['staple_price'] == 5, ['kitchen_mean']] -= 6.826462

In [213]:
price_modified = df_new.kitchen_mean

In [214]:
df_needed = df_new[['kitchen_id']]
df_needed['kitchen_mean'] = price_modified_by_city_only

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [215]:
train_k = pd.merge(train, df_needed, on='kitchen_id', how='left')
test_k = pd.merge(test, df_needed, on='kitchen_id', how='left')

In [216]:
test_k['kitchen_mean'].fillna(df_new.kitchen_mean.mean(), inplace=True)

In [217]:
mse = mean_squared_error(test_k['price'], test_k['kitchen_mean'])
rmse = np.sqrt(mse)

In [218]:
print(rmse)

9.899922480173068


In [187]:
print(rmse)

9.899922480173068
