In [40]:
import xgboost as xg
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [41]:
#train_data=pd.read_csv("training_data.csv")
#test_data=pd.read_csv("test_data.csv")
train_data=pd.read_csv("train_after_nltk_without_negative.csv", index_col=0)
test_data=pd.read_csv("test_after_nltk_without_negative.csv", index_col=0)

In [42]:
train_data["text"] = train_data["text"].astype(str)
test_data["text"] = test_data["text"].astype(str)
train_text = train_data["text"]
train_label = train_data["stars"]
test_Idx = test_data["review_id"]
test_text = test_data["text"]

In [43]:
# Use TF-IDF to vectorize the text with minimun DF = 2 to filt out some strange words
tfidf_vectorizer = TfidfVectorizer(min_df=2)
tfidf_vectorizer.fit(train_text)
#for train
train_text_tfidf = tfidf_vectorizer.transform(train_text)
#for test
test_text_tfidf = tfidf_vectorizer.transform(test_text)

In [44]:
print(train_text_tfidf.shape)
print(test_text_tfidf.shape)

(7997, 10972)
(2003, 10972)


In [47]:
train_data = train_text_tfidf.toarray()
test_data = test_text_tfidf.toarray()
del train_text, test_text, train_text_tfidf, test_text_tfidf

In [49]:
# Train the Xgboost Model with 5 folds CV 
parameters = {'objective':['reg:linear'],
              'learning_rate': [0.05],
              'max_depth': [4],
              'min_child_weight': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'gamma':[0],
              'n_estimators': [300],
              'silent':[1],
              'seed': [9]
             }
xgb_model = xg.XGBRegressor()

clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(train_label, n_folds=5, shuffle=True), 
                   scoring='neg_mean_squared_error', refit=True)

clf.fit(train_data, train_label)

best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('RMSE score:', -1*score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

RMSE score: 0.8789535680110622
colsample_bytree: 0.8
gamma: 0
learning_rate: 0.05
max_depth: 4
min_child_weight: 1
n_estimators: 300
objective: 'reg:linear'
seed: 9
silent: 1
subsample: 0.8


In [53]:
# Evaluate the model
X_train, X_valid, y_train, y_valid = train_test_split(train_data,train_label,test_size=0.2,random_state=9)
y_valid_pred = clf.predict(X_valid)
accuracy = accuracy_score(y_valid, np.round(y_valid_pred).astype(int))
rmse = sqrt(mean_squared_error(y_valid, y_valid_pred))
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("RMSE: %.2f" % rmse)

Accuracy: 46.56%
RMSE: 0.81


In [54]:
# Predict on the test data
test_Id = pd.DataFrame({"id": test_Idx})
predictions = clf.predict(test_data)
submission = pd.DataFrame({"stars": predictions})
submission = test_Id.join(submission)
submission.to_csv("result_nltk_xgb.csv", index=False,header=False)

In [55]:
submission.head()

Unnamed: 0,id,stars
0,2713,2.955585
1,4734,4.160178
2,5598,4.194613
3,9545,3.79575
4,1471,3.254804
