In [1]:
import os

import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

%matplotlib inline


class SentimentPrediction:
    def __init__(self, 
                 train_path=None,
                 test_path=None, 
                 train_rows=10_000,
                 test_rows=5_000):
        
        #init paths and params
        self.train_path = train_path
        self.test_path = test_path
        self.train_rows=train_rows
        self.test_rows = test_rows
        #load train and test data
        self.df_train = pd.read_csv(train_path, 
                                      nrows=self.train_rows, 
                                      header=None,
                                      names=['label', 'title', 'review'])
        self.df_test = pd.read_csv(test_path, 
                                      nrows=self.train_rows, 
                                      header=None,
                                      names=['label', 'title', 'review'])
        #quick fix labels
        self.df_train.replace({'label': {1:0, 2:1}}, inplace=True)
        self.df_test.replace({'label': {1:0, 2:1}}, inplace=True)
        
        #merge title+review into one column
        self.df_train['text'] = self.df_train['title'] + self.df_train['review']
        self.df_train = self.df_train.drop(labels=['title', 'review'], axis=1)
        
        self.df_test['text'] = self.df_test['title'] + self.df_test['review']
        self.df_test = self.df_test.drop(labels=['title', 'review'], axis=1)
        
        #drop rows with a missing label and/or missing text
        self.df_train.dropna(axis=0, how='any', inplace=True)
        self.df_test.dropna(axis=0, how='any', inplace=True)
        
        
    def fit_Tfidf(self,
                  strip_accents='ascii', 
                  lowercase=True,
                  analyzer = 'word',
                  stop_words='english',
                  token_pattern = r'(?u)\b\w\w+\b',
                  max_df = 0.95,
                  min_df = 5
                 ):
        
        self.tfidf = TfidfVectorizer(strip_accents=strip_accents, 
                        lowercase=lowercase,
                        analyzer = 'word',
                        stop_words=stop_words,
                        token_pattern = token_pattern,
                        max_df = max_df,
                        min_df = min_df
                       )
        
        self.X_train = self.tfidf.fit_transform(self.df_train['text'])
        self.y_train = self.df_train['label']
        
        self.X_test = self.tfidf.transform(self.df_test['text'])
        self.y_test = self.df_test['label']
        
    def train_xgb(self,
                  params = {"max_depth": 2, 
                            "eta": 1, 
                            "objective": "binary:logistic"}, 
                  num_round=10):
        self.dtrain = xgb.DMatrix(data=self.X_train, label=self.y_train)
        self.dtest = xgb.DMatrix(data=self.X_test, label=self.y_test)
        

        
        self.estimator = XGBClassifier(**params)
        
        self.estimator.fit(self.X_train, self.y_train)
        
    def gridSearch(self, param_grid, verbose=10):
        self.grid_search = GridSearchCV(
            estimator=XGBClassifier(),
            param_grid=param_grid,
            scoring = 'accuracy',
            n_jobs = -1,
            cv = 5,
            verbose=verbose)
        self.grid_search.fit(self.X_train, self.y_train)
        
        
    def predict_labels(self):
        
        #check if user has done a grid search
        if hasattr(self, 'grid_search'):
            self.y_pred = self.grid_search.best_estimator_.predict(self.X_test)
        else:
            self.y_pred = self.estimator.predict(self.X_test)
        
        print(confusion_matrix(self.y_test, self.y_pred))
        return np.mean(self.y_pred == self.y_test)

In [2]:
train_path = os.path.join('raw_data', 'train.csv')
test_path = os.path.join('raw_data', 'test.csv')

sentiment = SentimentPrediction(train_path, test_path)
sentiment.fit_Tfidf()

sentiment.train_xgb()





In [3]:
sentiment.predict_labels()

[[3883  991]
 [1016 4109]]


0.7992799279927992

In [22]:
param_grid = {
    'max_depth': [2,1],
    'eta': [1, 2],
    'n_estimators': [50, 100],
    'learning_rate': [0.5, 1],
    'objective': ['binary:logistic']
}

sentiment.gridSearch(param_grid)
sentiment.predict_labels()

Fitting 5 folds for each of 16 candidates, totalling 80 fits




[[3900  974]
 [ 978 4147]]


0.8047804780478047

In [20]:
import boto3
import sagemaker
import boto3.session
from sagemaker.session import Session

AWS_ACCESS_KEY = 'Major Key Alert'
AWS_SECRET = 'DJ KHALED'

boto_session = boto3.session.Session(
   aws_access_key_id=AWS_ACCESS_KEY,
   aws_secret_access_key=AWS_SECRET,
   region_name='us-east-2'
)
sagemaker_session = Session(boto_session=boto_session)

In [21]:
sagemaker_session.read_s3_file(bucket='amazon-reviews-sentiment-nk', key_prefix='train.csv')

ClientError: An error occurred (SignatureDoesNotMatch) when calling the GetObject operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.