In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from joblib import dump, load

from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

train['train_test'] = 'train'
test['train_test'] = 'test'

data = pd.concat([train, test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [3]:
data.head()

Unnamed: 0,country,description,designation,id,index,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,train_test,variety,winery
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,32027,,88.870874,20.0,Alentejano,,,,,,train,PORTUGUESE RED,J. Portugal Ramos
1,France,"A solid, chunky wine, with a structure that is...",,71079,,88.041695,28.0,Bordeaux,Lalande de Pomerol,,,,,train,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier
2,France,"This is powerful and concentrated, with the hi...",,32440,,94.085021,130.0,Bordeaux,Saint-Émilion,,,,,train,BORDEAUX-STYLE RED BLEND,Château Figeac
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,124405,,89.869797,34.0,California,Santa Barbara County,Central Coast,,,Jaffurs 2010 Thompson Vineyard Petite Sirah (S...,train,PETITE SIRAH,Jaffurs
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,33649,,89.017651,24.0,Washington,Horse Heaven Hills,Columbia Valley,Sean P. Sullivan,@wawinereport,Syncline 2016 McKinley Springs Vineyard Rosé (...,train,ROSé,Syncline


In [4]:
data.variety.fillna(value='PINOT NOIR', inplace=True)

In [5]:
data.province.fillna(value='California', inplace=True)

In [6]:
#data = data[pd.notnull(data['variety'])]
#data = data[pd.notnull(data['province'])]

In [7]:
data.taster_name.fillna(value='Roger Voss', inplace=True)

In [8]:
data.title.fillna(value='Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma County)', inplace=True)


In [9]:
data['province'] = pd.Categorical(data['province'])
proDummies = pd.get_dummies(data['province'], prefix = 'prov')

data['country'] = pd.Categorical(data['country'])
countDummies = pd.get_dummies(data['country'], prefix = 'country')

data['variety'] = pd.Categorical(data['variety'])
varDummies = pd.get_dummies(data['variety'], prefix = 'variety')

data['taster_name'] = pd.Categorical(data['taster_name'])
tastDummies = pd.get_dummies(data['taster_name'], prefix = 'taster')

#data['title'] = pd.Categorical(data['title'])
#titleDummies = pd.get_dummies(data['title'], prefix = 'title')

In [10]:
def getQuality(points):
    if(points <= 85):
        return 'bad'
    elif(points<=90 ):
        return 'ok'
    elif(points<=95):
        return 'good'
    elif(points<=102):
        return 'great'
    else:
        return 'If this gets hit, we did something wrong!'

In [11]:
data['quality'] = data['points'].apply(getQuality)
cleanup_nums = {"quality":     {"ok": 2, "good": 3, "bad": 1, "great": 4},
                "priceRange": {"1-30": 1, "31-50": 2, "51-100": 3, "Above 100": 4}}

data.replace(cleanup_nums, inplace=True)

In [12]:
data.head(2)

Unnamed: 0,country,description,designation,id,index,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,train_test,variety,winery,quality
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,32027,,88.870874,20.0,Alentejano,,,Roger Voss,,Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma...,train,PORTUGUESE RED,J. Portugal Ramos,2
1,France,"A solid, chunky wine, with a structure that is...",,71079,,88.041695,28.0,Bordeaux,Lalande de Pomerol,,Roger Voss,,Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma...,train,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier,2


In [13]:
def make_lower_case(text):
    return text.lower()

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

In [14]:
data["description"] = data["description"].str.replace('\d+', '')
data["description"] = data.description.apply(func=remove_punctuation)
data["description"] = data.description.apply(func=make_lower_case)

In [15]:
data["variety"] = data["variety"].str.replace('\d+', '')
data["variety"] = data.variety.apply(func=remove_punctuation)
data["variety"] = data.variety.apply(func=make_lower_case)

In [16]:
data["title"] = data["title"].str.replace('\d+', '')
data["title"] = data.title.apply(func=remove_punctuation)
data["title"] = data.title.apply(func=make_lower_case)

In [17]:
tf = TfidfVectorizer(analyzer='word', 
                     min_df=10,
                     ngram_range=(1, 2),
                     stop_words='english')
svd = TruncatedSVD(n_components=100)

In [18]:
#Fit tfidf and svd, and transform training data
tfidf_matrix = tf.fit_transform(data.description)
print(tfidf_matrix.shape)
desc_features = pd.DataFrame(svd.fit_transform(tfidf_matrix))
collist = map(str, range(0, 100))
collist = ["description_" + s for s in collist]
desc_features.columns = collist
desc_features.head()

(258210, 94009)


Unnamed: 0,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,description_8,description_9,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
0,0.115367,0.004636,0.114803,0.051522,0.006657,-0.030244,-0.025,0.041972,0.028911,-0.010496,...,0.007634,-0.039814,-0.049792,0.001087,0.014499,-0.002223,-0.008103,-0.02244,-0.004361,-0.024764
1,0.119707,0.024791,0.072113,0.021115,-0.011476,-0.062648,-0.00037,0.031289,0.015392,0.004811,...,-0.01797,-0.022855,-0.046821,-0.05506,0.007859,-0.001449,0.029534,-0.015895,0.02923,0.02159
2,0.129262,0.101702,0.087817,0.003713,0.131584,-0.033413,-0.016847,0.006005,0.069128,-0.006386,...,0.039654,-0.009324,-0.031001,-0.015154,-2.1e-05,-0.020974,0.012956,-0.029917,0.011128,0.014558
3,0.1035,0.062245,0.01242,-0.064549,-0.020325,-0.063674,-0.015062,-0.008192,0.045339,0.048647,...,0.014208,0.005733,-0.005161,-0.032666,0.017241,-0.013232,-0.002666,0.013726,-0.036283,-0.036276
4,0.096223,-0.056786,-0.015269,-0.012826,0.023463,0.028226,0.005504,0.016063,-0.026481,0.030409,...,0.000366,0.022666,-0.026078,-0.005888,0.011081,-0.005803,0.014026,-0.024345,-0.020775,-0.002401


In [19]:
tfidf_matrix = tf.fit_transform(data.variety)
svd = TruncatedSVD(n_components=10)
var_features = pd.DataFrame(svd.fit_transform(tfidf_matrix))
collist = map(str, range(0, 10))
collist = ["variety_" + s for s in collist]
var_features.columns = collist
var_features.head()

Unnamed: 0,variety_0,variety_1,variety_2,variety_3,variety_4,variety_5,variety_6,variety_7,variety_8,variety_9
0,3.554817e-06,1.32117e-05,0.04913868,0.247637,2.2e-05,0.000504,4.401445e-05,0.000186,-0.000477,-0.182606
1,1.037505e-05,4.051445e-05,0.1488023,0.748054,-6.3e-05,0.000758,8.711956e-05,-2.5e-05,-0.000224,0.613893
2,1.037504e-05,4.051444e-05,0.1488023,0.748054,-6.3e-05,0.000758,8.711956e-05,-2.5e-05,-0.000224,0.613893
3,3.096453e-07,9.704166e-07,5.104758e-07,-1e-06,-9e-06,-4.6e-05,0.0007826015,0.000383,-0.001378,-0.000467
4,4.706509e-09,-1.395028e-08,8.682955e-07,4e-06,-8e-06,-4e-06,1.585286e-07,-3.4e-05,4.2e-05,-0.000367


In [20]:
svd = TruncatedSVD(n_components=10)
prov_features = pd.DataFrame(svd.fit_transform(proDummies.values))
collist = map(str, range(0, 10))
collist = ["province_" + s for s in collist]
prov_features.columns = collist
prov_features.head()

Unnamed: 0,province_0,province_1,province_2,province_3,province_4,province_5,province_6,province_7,province_8,province_9
0,-1.931522e-09,1.857487e-06,-1.778172e-05,6.973988e-05,-7.953516e-05,9.222137e-05,-0.0002718437,0.0001464819,0.0008404838,-0.0006395285
1,-4.048326e-12,6.378265e-08,-3.356244e-07,1.470044e-06,-1.292386e-05,1.516791e-05,0.9999926,3.073662e-05,1.369581e-05,4.881347e-05
2,-4.047687e-12,6.378343e-08,-3.356171e-07,1.47005e-06,-1.292386e-05,1.516791e-05,0.9999926,3.073662e-05,1.369581e-05,4.881347e-05
3,1.0,3.286889e-15,7.688128e-15,5.641616e-14,-5.900364e-14,-4.357905e-14,9.970831e-14,3.753023e-14,1.44846e-14,3.776289e-14
4,3.763676e-15,1.0,-1.336011e-09,6.481104e-09,-8.68715e-09,8.474905e-09,-5.93642e-09,3.32133e-09,1.190673e-09,-9.127659e-09


In [21]:
#tfidf_matrix = tf.fit_transform(data.region_1)
#reg1_features = pd.DataFrame(svd.fit_transform(tfidf_matrix))
#collist = map(str, range(0, 10))
#collist = ["country_" + s for s in collist]
#reg1_features.columns = collist
#reg1_features.head()

In [22]:
# country_features = pd.DataFrame(svd.fit_transform(countDummies.values))
# collist = map(str, range(0, 10))
# collist = ["country_" + s for s in collist]
# country_features.columns = collist
# country_features.head()

In [23]:
# dd = pd.concat([desc_features, var_features, prov_features, country_features], axis=1)
dd = pd.concat([desc_features, var_features, prov_features], axis=1)
dd.shape

(258210, 120)

In [24]:
#data.drop('index', axis=1, inplace=True)

In [25]:
dd['train_test'] = data.train_test.values

In [26]:
dd['id'] = data.id.values

In [27]:
dd['quality'] = data.quality.values

In [28]:
dd['points'] = data.points.values

In [29]:
dd['price'] = data.price.values

In [30]:
#['id', 'points', 'quality', 'price']
final_data = dd.copy()
final_data.shape

(258210, 125)

In [31]:
final_data['div_points'] = final_data['points'].values / 2
final_data['div_points_4'] = final_data['points'].values / 4
final_data['div_points_8'] = final_data['points'].values / 8

In [32]:
final_data['sum_desc'] = final_data[desc_features.columns.tolist()].sum(axis=1)
# final_data['sum_var'] = final_data[['variety_0', 'variety_1', 'variety_2',
#        'variety_3', 'variety_4', 'variety_5', 'variety_6', 'variety_7',
#        'variety_8', 'variety_9']].sum(axis=1)
# final_data['sum_prov'] = final_data[['province_0', 'province_1', 'province_2',
#        'province_3', 'province_4', 'province_5', 'province_6', 'province_7',
#        'province_8', 'province_9']].sum(axis=1)
# final_data['sum_country'] = final_data[['variety_0', 'variety_1', 'variety_2',
#        'variety_3', 'variety_4', 'variety_5', 'variety_6', 'variety_7',
#        'variety_8', 'variety_9']].sum(axis=1)

In [33]:
# final_data['description_0_x4'] = final_data['description_0'].multiply(4)
# final_data['description_0_x4'].head()

In [34]:
# final_data['description_1_x4'] = final_data['description_1'].multiply(4)
# final_data['description_1_x4'].head()

In [35]:
#final_data['div_points'].head(10)
final_data.head()

Unnamed: 0,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,description_8,description_9,...,province_9,train_test,id,quality,points,price,div_points,div_points_4,div_points_8,sum_desc
0,0.115367,0.004636,0.114803,0.051522,0.006657,-0.030244,-0.025,0.041972,0.028911,-0.010496,...,-0.0006395285,train,32027,2,88.870874,20.0,44.435437,22.217719,11.108859,0.167713
1,0.119707,0.024791,0.072113,0.021115,-0.011476,-0.062648,-0.00037,0.031289,0.015392,0.004811,...,4.881347e-05,train,71079,2,88.041695,28.0,44.020848,22.010424,11.005212,-0.085931
2,0.129262,0.101702,0.087817,0.003713,0.131584,-0.033413,-0.016847,0.006005,0.069128,-0.006386,...,4.881347e-05,train,32440,3,94.085021,130.0,47.042511,23.521255,11.760628,0.509493
3,0.1035,0.062245,0.01242,-0.064549,-0.020325,-0.063674,-0.015062,-0.008192,0.045339,0.048647,...,3.776289e-14,train,124405,2,89.869797,34.0,44.934899,22.467449,11.233725,-0.184511
4,0.096223,-0.056786,-0.015269,-0.012826,0.023463,0.028226,0.005504,0.016063,-0.026481,0.030409,...,-9.127659e-09,train,33649,2,89.017651,24.0,44.508825,22.254413,11.127206,0.201946


In [36]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

In [37]:
pf = PolynomialFeatures(degree=2, interaction_only=False,  
                        include_bias=False)
res = pf.fit_transform(final_data[['points', 'quality']])
res

array([[8.88708744e+01, 2.00000000e+00, 7.89803232e+03, 1.77741749e+02,
        4.00000000e+00],
       [8.80416953e+01, 2.00000000e+00, 7.75134011e+03, 1.76083391e+02,
        4.00000000e+00],
       [9.40850213e+01, 3.00000000e+00, 8.85199123e+03, 2.82255064e+02,
        9.00000000e+00],
       ...,
       [8.90575848e+01, 2.00000000e+00, 7.93125341e+03, 1.78115170e+02,
        4.00000000e+00],
       [8.49079088e+01, 1.00000000e+00, 7.20935297e+03, 8.49079088e+01,
        1.00000000e+00],
       [9.20387015e+01, 3.00000000e+00, 8.47112258e+03, 2.76116105e+02,
        9.00000000e+00]])

In [38]:
#pd.DataFrame(pf.powers_, columns=['points_degree',  'quality_degree'])

In [39]:
intr_features = pd.DataFrame(res, columns=['points', 'quality',  
                                           'points^2', 
                                           'points x quality',  
                                           'quality^2'])
intr_features.head(5)

Unnamed: 0,points,quality,points^2,points x quality,quality^2
0,88.870874,2.0,7898.032315,177.741749,4.0
1,88.041695,2.0,7751.340111,176.083391,4.0
2,94.085021,3.0,8851.991227,282.255064,9.0
3,89.869797,2.0,8076.580454,179.739594,4.0
4,89.017651,2.0,7924.142166,178.035302,4.0


In [40]:
intr_features['points_log'] = np.log((1 + intr_features['points']))
intr_features['points x quality_log'] = np.log((1 + intr_features['points x quality']))
intr_features['points^2_log'] = np.log((1 + intr_features['points^2']))

import scipy.stats as spstats

l, opt_lambda = spstats.boxcox(intr_features['points'].values)

intr_features['points_boxcox_lambda_opt'] = spstats.boxcox((1 + intr_features['points']), 
                                          lmbda=opt_lambda)
intr_features['points^2_boxcox_lambda_opt'] = spstats.boxcox((1 + intr_features['points^2']), 
                                          lmbda=opt_lambda)
intr_features['points x quality_boxcox_lambda_opt'] = spstats.boxcox((1 + intr_features['points x quality']), 
                                          lmbda=opt_lambda)

In [41]:
intr_features.head()

Unnamed: 0,points,quality,points^2,points x quality,quality^2,points_log,points x quality_log,points^2_log,points_boxcox_lambda_opt,points^2_boxcox_lambda_opt,points x quality_boxcox_lambda_opt
0,88.870874,2.0,7898.032315,177.741749,4.0,4.498374,5.185942,8.974496,5.046263,11.334832,5.922886
1,88.041695,2.0,7751.340111,176.083391,4.0,4.489105,5.176621,8.95575,5.03465,11.305444,5.910799
2,94.085021,3.0,8851.991227,282.255064,9.0,4.554771,5.646348,9.088511,5.117035,11.51418,6.527014
3,89.869797,2.0,8076.580454,179.739594,4.0,4.509428,5.197057,8.996848,5.060118,11.369912,5.937307
4,89.017651,2.0,7924.142166,178.035302,4.0,4.500006,5.187583,8.977796,5.048308,11.340009,5.925015


In [42]:
intr_features.drop(['points', 'quality'], inplace=True, axis=1)

In [43]:
ff = pd.concat([final_data, intr_features], axis=1)

In [44]:
print(ff.shape)

(258210, 138)


In [45]:
train_ff = ff[ff['train_test'] == 'train']
test_ff = ff[ff['train_test'] == 'test']

In [46]:
# c = ['points_log', 'points x quality_log', 'points_boxcox_lambda_opt','points^2_boxcox_lambda_opt', 'points x quality_boxcox_lambda_opt']

In [47]:
#col = ['description_0', 'description_1', 'description_2', 'description_3', 'description_4', 'description_5', 'description_6', 'description_7', 'description_8', 'description_9', 'sum_desc', 'description_0_x4', 'description_1_x4', 'points', 'points^2', 'points_log', 'points^2_log', 'points x quality_log', 'points_boxcox_lambda_opt','points^2_boxcox_lambda_opt', 'points x quality_boxcox_lambda_opt']
#col = ['description_0', 'description_1', 'description_2', 'description_3', 'description_4', 'description_5', 'description_6', 'description_7', 'description_8', 'description_9', 'variety_0', 'variety_1', 'variety_2', 'variety_3', 'variety_4', 'variety_5', 'variety_6', 'variety_7', 'variety_8', 'variety_9', 'province_0', 'province_1', 'province_2', 'province_3', 'province_4', 'province_5', 'province_6', 'province_7', 'province_8', 'province_9', 'country_0', 'country_1', 'country_2', 'country_3', 'country_4', 'country_5', 'country_6', 'country_7', 'country_8', 'country_9', 'sum_desc', 'sum_var', 'sum_prov', 'sum_country', 'description_0_x4', 'description_1_x4', 'points', 'points^2']

In [48]:
#print(intr_features.columns.tolist())
#cc = ['points^2', 'points x quality', 'points_log', 'points_boxcox_lambda_opt', 'points^2_boxcox_lambda_opt', 'points x quality_boxcox_lambda_opt']

In [49]:
#cc1 = ['points^2', 'points_log', 'points_boxcox_lambda_opt', 'points^2_boxcox_lambda_opt']

In [50]:
#print(train_ff.columns.tolist())

In [51]:
cols = desc_features.columns.tolist()+var_features.columns.tolist()+prov_features.columns.tolist()+ ['points', 'div_points', 'div_points_4', 'div_points_8', 'sum_desc', 'points^2', 'points x quality', 'points_log', 'points x quality_log', 'points^2_log', 'points_boxcox_lambda_opt', 'points^2_boxcox_lambda_opt', 'points x quality_boxcox_lambda_opt']

In [52]:
X = train_ff[cols]
y = train_ff["price"]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [54]:
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [55]:
lg = lgb.LGBMRegressor()
lg.fit(X_train,y_train)


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [56]:
lb_pred = lg.predict(X_test)

In [57]:
print('RMSE:', np.sqrt(mean_squared_error(y_test, lb_pred)))

RMSE: 28.154361589321407


In [58]:
model = xgb.XGBRegressor()

In [None]:
model.fit(X_train, y_train)



  if getattr(data, 'base', None) is not None and \


In [None]:
xgb_pred = model.predict(X_test)

In [None]:
print('RMSE:', np.sqrt(mean_squared_error(y_test, xgb_pred)))

In [None]:
lm = Ridge()

lm.fit(X_train, y_train)

In [None]:
print(lm.intercept_)

In [None]:
predictions = lm.predict(X_test)
plt.scatter(np.log(y_test),predictions)

In [None]:
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))

In [None]:
accuracy = lm.score(X_test, y_test)
print ("Accuracy is {}".format(accuracy))

In [None]:
rf = RandomForestRegressor()

In [None]:
rf = RandomForestRegressor(n_estimators = 10, random_state = 42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
predicts = rf.predict(X_test)

In [None]:
print(y_test.shape, predicts.shape)

In [None]:
plt.scatter(y_test, predicts)

In [None]:
print('RMSE:', np.sqrt(mean_squared_error(y_test, predicts)))

In [None]:
importance = pd.concat([pd.DataFrame(X_train.columns),pd.DataFrame(rf.feature_importances_)],axis=1)
importance.columns = ['feature','value']

plt.figure(figsize=(15,8))
sns.barplot(x="feature", y="value", data=importance).set_xticklabels(rotation=90,labels=X_train.columns)
print('')

In [None]:
nnnn

In [None]:
#print(test_data.columns.tolist())

In [None]:
X_test_t = test_ff[cols]

In [None]:
test_ff['price'] = rf.predict(X_test_t)
test_ff['price'].head(10)

In [None]:
test_ff['price_lm'] = lm.predict(X_test_t)
test_ff['price_lm'].head(10)

In [None]:
test_ff['price_xgb'] = model.predict(X_test_t)
test_ff['price_xgb'].head(10)

In [None]:
test_ff.head()

In [None]:
nnn

In [None]:
test_ff[['id', 'price']].to_csv('bma_predicted_rf_rmse23_11.csv', index=False)

In [None]:
#pd.DataFrame(test_data_sss).to_csv('random_f.csv')

In [None]:
import pickle

In [None]:
pickle.dump(rf,open('rf_model_rmse23_19.sav','wb'))