In [1]:
import math
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from joblib import dump, load
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load both splits and tag every row with its origin so the combined frame
# can be separated again after feature engineering.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

train['train_test'] = 'train'
test['train_test'] = 'test'

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of `train` and `test` are both carried over and overlap,
# which makes any later label-based alignment ambiguous.
data = pd.concat([train, test], ignore_index=True)

FileNotFoundError: File b'data/train.csv' does not exist

In [None]:
data.shape

In [None]:
test.shape

In [None]:
data.head()

In [None]:
data['province'].mode()

In [None]:
# Fill missing varieties with a fixed default. The result is assigned back to
# the column: calling fillna(inplace=True) on the `data.variety` accessor acts
# on an intermediate Series and does not update `data` under Copy-on-Write.
data['variety'] = data['variety'].fillna('PINOT NOIR')

In [None]:
# Fill missing provinces with a fixed default. The result is assigned back to
# the column: calling fillna(inplace=True) on the `data.province` accessor acts
# on an intermediate Series and does not update `data` under Copy-on-Write.
data['province'] = data['province'].fillna('California')

In [None]:
#data = data[pd.notnull(data['variety'])]
#data = data[pd.notnull(data['province'])]

In [None]:
# Fill missing taster names with a fixed default, assigning the result back so
# the update does not depend on in-place mutation through an attribute accessor.
data['taster_name'] = data['taster_name'].fillna('Roger Voss')
data.info()

In [None]:
# Fill missing titles with a fixed default, assigning the result back so the
# update does not depend on in-place mutation through an attribute accessor.
data['title'] = data['title'].fillna('Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma County)')
data.info()

In [None]:
# Convert the four label columns to pandas' categorical dtype first ...
data['province'] = data['province'].astype('category')
data['country'] = data['country'].astype('category')
data['variety'] = data['variety'].astype('category')
data['taster_name'] = data['taster_name'].astype('category')

# ... then build one indicator (one-hot) frame per column.
proDummies = pd.get_dummies(data['province'], prefix='prov')
countDummies = pd.get_dummies(data['country'], prefix='country')
varDummies = pd.get_dummies(data['variety'], prefix='variety')
tastDummies = pd.get_dummies(data['taster_name'], prefix='taster')

# `title` is intentionally left out of the one-hot encoding:
#data['title'] = pd.Categorical(data['title'])
#titleDummies = pd.get_dummies(data['title'], prefix = 'title')

In [None]:
def getQuality(points):
    """Map a numeric score to a coarse quality label.

    Buckets use inclusive upper bounds: <=85 'bad', <=90 'ok', <=95 'good',
    <=102 'great'. Any value that satisfies none of the comparisons (a score
    above 102, or NaN, for which every comparison is False) yields the
    sentinel message string instead of a label.
    """
    # Bounds are checked in ascending order, so the first match wins.
    buckets = ((85, 'bad'), (90, 'ok'), (95, 'good'), (102, 'great'))
    for upper_bound, label in buckets:
        if points <= upper_bound:
            return label
    return 'If this gets hit, we did something wrong!'

In [None]:
# Ordinal codes for the label columns; replace() is given a nested
# {column: {old: new}} mapping so each sub-mapping only applies to its column.
quality_codes = {"ok": 2, "good": 3, "bad": 1, "great": 4}
price_range_codes = {"1-30": 1, "31-50": 2, "51-100": 3, "Above 100": 4}
cleanup_nums = {"quality": quality_codes, "priceRange": price_range_codes}

# Label every row from its score, then swap the labels for their codes.
# NOTE(review): no cell in this notebook creates a `priceRange` column —
# confirm whether it comes from the CSV or the mapping is left over.
data['quality'] = data['points'].map(getQuality)
data.replace(cleanup_nums, inplace=True)

In [None]:
data.head()

In [None]:
def make_lower_case(text):
    """Return `text` converted to lower case via its `.lower()` method."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Return `text` with every run of non-word characters collapsed to one space.

    The string is split into maximal ``\\w+`` runs (letters, digits and
    underscore) which are re-joined with single spaces, so punctuation and
    repeated whitespace disappear and no leading/trailing space remains.

    Uses ``re.findall`` directly instead of constructing a new tokenizer
    object on every call; this function is applied once per row, so the
    per-call setup was pure overhead. The resulting tokens are the same
    ``\\w+`` matches.
    """
    words = re.findall(r'\w+', text)
    return " ".join(words)

In [None]:
# Normalise the free-text description: drop digit runs, strip punctuation,
# then lowercase. regex=True is explicit because recent pandas versions treat
# the pattern as a literal string by default, which would leave digits in
# place; the raw string avoids an invalid-escape warning for `\d`.
description_no_digits = data["description"].str.replace(r'\d+', '', regex=True)
data["description"] = description_no_digits.apply(remove_punctuation).apply(make_lower_case)

In [None]:
# Normalise the variety text: drop digit runs, strip punctuation, then
# lowercase. regex=True is explicit because recent pandas versions treat the
# pattern as a literal string by default, which would leave digits in place;
# the raw string avoids an invalid-escape warning for `\d`.
variety_no_digits = data["variety"].str.replace(r'\d+', '', regex=True)
data["variety"] = variety_no_digits.apply(remove_punctuation).apply(make_lower_case)

In [None]:
# Normalise the title text: drop digit runs, strip punctuation, then
# lowercase. regex=True is explicit because recent pandas versions treat the
# pattern as a literal string by default, which would leave digits in place;
# the raw string avoids an invalid-escape warning for `\d`.
title_no_digits = data["title"].str.replace(r'\d+', '', regex=True)
data["title"] = title_no_digits.apply(remove_punctuation).apply(make_lower_case)

In [None]:
# Word-level TF-IDF over unigrams and bigrams; terms seen in fewer than 10
# documents are dropped and English stop words are removed.
tf = TfidfVectorizer(analyzer='word', 
                     min_df=10,
                     ngram_range=(1, 2),
                     stop_words='english')
# Reduce each sparse matrix to 10 dense components. TruncatedSVD's default
# solver is randomized, so a fixed random_state is needed for the derived
# features (and everything trained on them) to be reproducible across runs.
svd = TruncatedSVD(n_components=10, random_state=42)

In [None]:
#Fit tfidf and svd, and transform training data
tfidf_matrix = tf.fit_transform(data.description)
desc_features = pd.DataFrame(svd.fit_transform(tfidf_matrix))
collist = map(str, range(0, 10))
collist = ["description_" + s for s in collist]
desc_features.columns = collist
desc_features.head()

In [None]:
# Same TF-IDF -> SVD pipeline, refit on the cleaned variety text; both `tf`
# and `svd` are overwritten by this fit.
tfidf_matrix = tf.fit_transform(data.variety)
var_components = svd.fit_transform(tfidf_matrix)
collist = ["variety_" + str(i) for i in range(0, 10)]
var_features = pd.DataFrame(var_components, columns=collist)
var_features.head()

In [None]:
# Compress the province indicator matrix to 10 SVD components (refits `svd`).
prov_components = svd.fit_transform(proDummies.values)
collist = ["province_" + str(i) for i in range(0, 10)]
prov_features = pd.DataFrame(prov_components, columns=collist)
prov_features.head()

In [None]:
#tfidf_matrix = tf.fit_transform(data.region_1)
#reg1_features = pd.DataFrame(svd.fit_transform(tfidf_matrix))
#collist = map(str, range(0, 10))
#collist = ["country_" + s for s in collist]
#reg1_features.columns = collist
#reg1_features.head()

In [None]:
# Compress the country indicator matrix to 10 SVD components (refits `svd`).
country_components = svd.fit_transform(countDummies.values)
collist = ["country_" + str(i) for i in range(0, 10)]
country_features = pd.DataFrame(country_components, columns=collist)
country_features.head()

In [None]:
dd = pd.concat([desc_features, var_features, prov_features, country_features], axis=1)
dd.shape

In [None]:
dd.head()

In [None]:
dd.shape

In [None]:
#data.drop('index', axis=1, inplace=True)

In [None]:
dd['train_test'] = data.train_test.values

In [None]:
dd['id'] = data.id.values

In [None]:
dd['quality'] = data.quality.values

In [None]:
dd['points'] = data.points.values

In [None]:
dd['price'] = data.price.values

In [None]:
#['id', 'points', 'quality', 'price']
final_data = dd.copy()
final_data.shape

In [None]:
# Scaled copies of the score: points / 2, / 4 and / 8.
points_values = final_data['points'].values
for scaled_name, divisor in (('div_points', 2), ('div_points_4', 4), ('div_points_8', 8)):
    final_data[scaled_name] = points_values / divisor

In [None]:
# Row-wise sum of the 10 SVD components of each feature family.
# Fix: `sum_country` previously summed the `variety_*` columns (copy-paste
# slip), making it an exact duplicate of `sum_var`; it now sums `country_*`.
component_families = (
    ('sum_desc', 'description_'),
    ('sum_var', 'variety_'),
    ('sum_prov', 'province_'),
    ('sum_country', 'country_'),
)
for sum_name, family_prefix in component_families:
    family_cols = [family_prefix + str(i) for i in range(10)]
    final_data[sum_name] = final_data[family_cols].sum(axis=1)

In [None]:
# First description component scaled by 4.
final_data['description_0_x4'] = final_data['description_0'] * 4
final_data['description_0_x4'].head()

In [None]:
# Second description component scaled by 4.
final_data['description_1_x4'] = final_data['description_1'] * 4
final_data['description_1_x4'].head()

In [None]:
#final_data['div_points'].head(10)
final_data.head()

In [None]:
# Separate the combined frame again using the origin tag in `train_test`.
is_train_row = final_data['train_test'] == 'train'
is_test_row = final_data['train_test'] == 'test'
train_data = final_data[is_train_row]
test_data = final_data[is_test_row]

In [None]:
print(train_data.shape)
print(test_data.shape)

In [None]:
#cols = ['description_0', 'description_1', 'description_2', 'description_3', 'description_4', 'description_5', 'description_6', 'description_7', 'description_8', 'description_9', 'variety_0', 'variety_1', 'variety_2', 'variety_3', 'variety_4', 'variety_5', 'variety_6', 'variety_7', 'variety_8', 'variety_9', 'province_0', 'province_1', 'province_2', 'province_3', 'province_4', 'province_5', 'province_6', 'province_7', 'province_8', 'province_9', 'country_0', 'country_1', 'country_2', 'country_3', 'country_4', 'country_5', 'country_6', 'country_7', 'country_8', 'country_9', 'sum_desc', 'sum_var', 'sum_prov', 'sum_country', 'description_0_x4', 'description_1_x4']
#cols2 = ['points', 'quality']

In [None]:
#X = train_data[cols2]
#y = train_data["price"]

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
#print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 expansion of (points, quality) without the constant column:
# the two inputs, their squares and their product.
poly_inputs = final_data[['points', 'quality']]
pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
res = pf.fit_transform(poly_inputs)
res

In [None]:
pd.DataFrame(pf.powers_, columns=['points_degree',  
                                  'quality_degree'])

In [None]:
# Wrap the expanded array in a frame with one readable name per term.
poly_term_names = ['points', 'quality', 'points^2', 'points x quality', 'quality^2']
intr_features = pd.DataFrame(res, columns=poly_term_names)
intr_features.head(5)

In [None]:
# Natural log of (points + 1).
intr_features['points_log'] = np.log(intr_features['points'] + 1)

In [None]:
# Natural log of (points^2 + 1).
intr_features['points^2_log'] = np.log(intr_features['points^2'] + 1)

In [None]:
# Natural log of (points x quality + 1).
intr_features['points x quality_log'] = np.log(intr_features['points x quality'] + 1)

In [None]:
import scipy.stats as spstats

In [None]:
#intr_features['points_boxcox_lambda_0'] = spstats.boxcox((1 + intr_features['points']), lmbda=0)

In [None]:
#intr_features['points^2_boxcox_lambda_0'] = spstats.boxcox((1 + intr_features['points^2']), lmbda=0)

In [None]:
#intr_features['points x quality_boxcox_lambda_0'] = spstats.boxcox((1 + intr_features['points x quality']), lmbda=0)

In [None]:
# With no `lmbda` given, boxcox returns the transformed values together with
# the lambda it estimated; only the lambda is reused afterwards.
points_array = intr_features['points'].values
l, opt_lambda = spstats.boxcox(points_array)
opt_lambda

In [None]:
# Box-Cox transform of (points + 1) with the previously estimated lambda.
# NOTE(review): `opt_lambda` was estimated on the unshifted `points` values,
# not on points + 1 — confirm this is intended.
shifted_points = intr_features['points'] + 1
intr_features['points_boxcox_lambda_opt'] = spstats.boxcox(shifted_points, lmbda=opt_lambda)

In [None]:
# Box-Cox transform of (points^2 + 1) with the previously estimated lambda.
# NOTE(review): `opt_lambda` was estimated on `points`, not on this column —
# confirm reuse is intended.
shifted_points_sq = intr_features['points^2'] + 1
intr_features['points^2_boxcox_lambda_opt'] = spstats.boxcox(shifted_points_sq, lmbda=opt_lambda)

In [None]:
# Box-Cox transform of (points x quality + 1) with the previously estimated
# lambda.
# NOTE(review): `opt_lambda` was estimated on `points`, not on this column —
# confirm reuse is intended.
shifted_interaction = intr_features['points x quality'] + 1
intr_features['points x quality_boxcox_lambda_opt'] = spstats.boxcox(shifted_interaction, lmbda=opt_lambda)

In [None]:
intr_features.head()

In [None]:
print(intr_features.shape)

In [None]:
# Discard the raw quality term and its square; the remaining columns are the
# points-based terms and their transforms.
intr_features = intr_features.drop(columns=['quality', 'quality^2'])

In [None]:
# Copy of the feature frame without `points` and `quality`.
columns_to_remove = ['points', 'quality']
final_data_c = final_data.drop(columns=columns_to_remove)

In [None]:
# Place the engineered features and the points-derived features side by side.
# pd.concat(axis=1) aligns rows on index labels.
# NOTE(review): assumes both frames share the same index — TODO confirm.
ff = pd.concat([final_data_c, intr_features], axis=1)

In [None]:
print(ff.shape)

In [None]:
# Separate the combined feature frame by origin tag. Explicit copies are
# taken because `test_ff` later receives a new `price` column; assigning into
# a filtered slice of `ff` triggers SettingWithCopyWarning and leaves it
# ambiguous whether `ff` is affected.
train_ff = ff[ff['train_test'] == 'train'].copy()
test_ff = ff[ff['train_test'] == 'test'].copy()

In [None]:
# All ten description components appear in every feature selection below.
description_cols = ['description_' + str(i) for i in range(10)]

# Earlier, wider selections (not used by the model input built below).
col_old2 = description_cols + [
    'variety_2', 'variety_4', 'variety_6', 'variety_7', 'variety_9',
    'province_0', 'province_5', 'province_6', 'province_7', 'province_9',
    'country_0', 'country_2', 'country_9',
    'sum_desc', 'sum_prov', 'description_0_x4', 'description_1_x4',
]
col_old1 = description_cols + [
    'variety_2', 'variety_4', 'variety_6', 'variety_7', 'variety_9',
    'province_5', 'province_6', 'province_7', 'province_9',
    'country_0',
    'sum_desc', 'sum_prov', 'description_0_x4', 'description_1_x4',
]
# Selection actually used.
col = description_cols + [
    'variety_7', 'province_6',
    'sum_desc', 'sum_prov', 'description_0_x4', 'description_1_x4',
]

# Model inputs: the selected columns plus every points-derived feature.
model_columns = col + intr_features.columns.tolist()
X = train_ff[model_columns]
y = train_ff["price"]

In [None]:
# 70/30 hold-out split. A fixed random_state keeps the split, and therefore
# every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
#print(X_train.columns.tolist())

In [None]:
# Wrap both sides of the hold-out split in xgboost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
#Set xgboost parameters
param = {'max_depth': 3, 'eta': 1, 'subsample':0.5, 'alpha':1}
param['nthread'] = 4
param['eval_metric'] = 'mae'
param['objective'] = 'reg:linear'
param['silent'] = 1
# NOTE(review): the only watched set is the training data, so early stopping
# is driven by training MAE rather than a held-out score — confirm intended.
evallist = [(dtrain, 'train')]
num_round = 10

#Train model
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)

# Score the hold-out rows with the best iteration found during training.
pred = pd.DataFrame(y_test.copy())
pred["prediction"] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
rmse = np.round(np.sqrt(mean_squared_error(y_true=pred["price"], y_pred=pred["prediction"])), 2)
# The value is the square root of the MSE, so it is labelled as RMSE
# (it was previously printed as "Mean Squared Error").
print ("Root Mean Squared Error: {} ".format(rmse))

In [None]:
lm = Ridge()

lm.fit(X_train, y_train)

In [None]:
print(lm.intercept_)

In [None]:
# Predicted vs. actual price on the hold-out rows. The model is fitted on the
# raw target, so both axes use the raw scale; plotting log(y_test) against
# untransformed predictions compared two different scales.
predictions = lm.predict(X_test)
plt.scatter(y_test, predictions)

In [None]:
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))

In [None]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the message names it accordingly.
accuracy = lm.score(X_test, y_test)
print ("R^2 score is {}".format(accuracy))

In [None]:
rf = RandomForestRegressor()

In [None]:
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
predicts = rf.predict(X_test)

In [None]:
print(y_test.shape, predicts.shape)

In [None]:
plt.scatter(y_test, predicts)

In [None]:
print('RMSE:', np.sqrt(mean_squared_error(y_test, predicts)))

In [None]:
# One row per model input with the importance the random forest assigned to it.
importance = pd.DataFrame({'feature': X_train.columns,
                           'value': rf.feature_importances_})

# Bar chart with vertical tick labels so the feature names stay readable.
plt.figure(figsize=(15, 8))
importance_ax = sns.barplot(x="feature", y="value", data=importance)
importance_ax.set_xticklabels(labels=X_train.columns, rotation=90)
print('')

In [None]:
# NOTE(review): this cell used to hold a bare, undefined name that raised
# NameError and aborted "Run All" before the submission cells below. It was
# presumably a manual stop marker — confirm before relying on a full re-run.

In [None]:
#print(test_data.columns.tolist())

In [None]:
X_test_t = test_ff[col + intr_features.columns.tolist()]

In [None]:
print(X_train.shape, X_test_t.shape)

In [None]:
# Attach the ridge predictions as the `price` column. assign() builds a new
# frame instead of writing into `test_ff`, which is a filtered slice of `ff`;
# writing into such a slice raises SettingWithCopyWarning.
test_ff = test_ff.assign(price=lm.predict(X_test_t))
test_ff['price'].head(10)

In [None]:
test_ff[['id', 'price']].to_csv('bma_price_predicted_rmse_lm29_86.csv', index=False)

In [None]:
#pd.DataFrame(test_data_sss).to_csv('random_f.csv')

In [None]:
import pickle

In [None]:
# Persist the fitted random forest. The context manager guarantees the file
# is flushed and closed even if pickling fails; the bare open() call left the
# handle open.
with open('rf_model_rmse23_19.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)