In [9]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Predict whether a reviewer purchased the product ('reviews.didPurchase')
# from label-encoded product/review columns using an XGBoost regressor, and
# report the mean absolute error on a held-out 25% split.

pd.set_option('display.max_columns', None)
enc = LabelEncoder()

# Fixed seed so the train/test split (and therefore the printed error) is
# the same on every run.
RANDOM_SEED = 0

TARGET_COLUMN = 'reviews.didPurchase'

# Identifier, free-text, location and timestamp columns that are not used as
# predictors; the target is dropped from X as well.
DROPPED_COLUMNS = [
    'upc', 'manufacturerNumber', 'ean', 'id', 'keys', TARGET_COLUMN,
    'reviews.dateSeen', 'reviews.id', 'reviews.sourceURLs', 'reviews.text',
    'reviews.title', 'reviews.userCity', 'reviews.userProvince',
    'reviews.username', 'reviews.date', 'reviews.dateAdded', 'dateAdded',
    'dateUpdated',
]

# Non-numeric predictors that are converted to integer codes.
ENCODED_COLUMNS = [
    'brand', 'categories', 'manufacturer', 'name', 'reviews.doRecommend',
]

# Load data
data = pd.read_csv("../input/GrammarandProductReviews.csv")

# Extract target variable
data.dropna(axis=0, subset=[TARGET_COLUMN], inplace=True)  # Rows without a target are unusable
y = data[[TARGET_COLUMN]]
# Explicit cast (True -> 1, False -> 0); unlike replace() this does not rely
# on pandas implicitly downcasting an object column afterwards.
Y = y.astype(int)

# Extract predictor variables
X = data.drop(DROPPED_COLUMNS, axis=1)
for column in ENCODED_COLUMNS:
    # LabelEncoder needs uniformly typed input, so every value (including a
    # missing one, which becomes its own category) is turned into a string.
    codes = enc.fit_transform(np.ravel(data[column].map(str)))
    # Plain column assignment replaces the column and takes the integer
    # dtype of `codes`; `.loc[:, column] = ...` may keep the object dtype.
    X[column] = codes

# Separate train and test data
train_x, test_x, train_y, test_y = train_test_split(
    X.values, Y.values, test_size=0.25, random_state=RANDOM_SEED
)

# Impute NaN in X; the imputer is fitted on the training rows only so no
# information from the test rows leaks into it.
imputer = SimpleImputer()
train_X = imputer.fit_transform(train_x)
test_X = imputer.transform(test_x)

# Train model
# NOTE(review): the target is binary (0/1) but a regressor is fitted and
# scored with MAE; confirm a classifier is not what was intended.
model = XGBRegressor()
model.fit(train_X, train_y, verbose=False)

# Test model
pred_y = model.predict(test_X)
# scikit-learn's signature is (y_true, y_pred).
print("Mean absolute error: " + str(mean_absolute_error(test_y, pred_y)))


Mean absolute error: 0.103813670223
