# General pipeline for project 1
This is an example pipeline showing you how to  
(1) Load the provided data;  
(2) Train models on the train set, and use the validation set to evaluate your model performance;  
(3) Generate predictions (pred.csv) on the test set, which are ready for submission.

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

### (1) Loading data
The following code shows how to load the datasets for this project.  
Note that we do not release the labels (the "stars" column) for the test set, so evaluate your trained model on the validation set instead.

However, your submitted predictions (``pred.csv``) should be generated on the test set.

In [2]:
def load_data(split_name='train', columns=None, folder='data'):
    """Load one split of the project data from ``{folder}/{split_name}.csv``.

    Args:
        split_name: Which split to read: 'train', 'valid' or 'test'.
        columns: Names of the columns to keep. Defaults to
            ``['text', 'stars']``. "text" can be used as the model input and
            "stars" holds the labels (sentiment); any other column present in
            the .csv file may be requested as well.
        folder: Directory that contains the ``<split_name>.csv`` files.

    Returns:
        A DataFrame restricted to ``columns``. If a requested column is
        missing from the file, every column of the split is returned instead.

    Raises:
        FileNotFoundError: If the .csv file for the split does not exist.
    """
    # Build the default per call instead of sharing one mutable list object.
    if columns is None:
        columns = ['text', 'stars']

    print(f"select [{', '.join(map(str, columns))}] columns from the {split_name} split")
    # Read the file once; problems with the file itself (missing, unparsable)
    # are real errors and propagate to the caller.
    df = pd.read_csv(f'{folder}/{split_name}.csv')

    try:
        selected = df.loc[:, columns]
    except KeyError:
        # Only a missing column triggers the "return everything" fallback.
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        return df

    print("Success")
    return selected

In [3]:
# The train and validation splits both come with labels, so the same three
# columns are requested from each of them (train first, then valid).
train_df, valid_df = (
    load_data(split, columns=['review_id', 'text', 'stars'])
    for split in ('train', 'valid')
)
# The test split has no 'stars' column (its labels are not released), so this
# request cannot be satisfied and load_data returns every column instead.
test_df = load_data(split_name='test', columns=['text', 'stars'])

select [review_id, text, stars] columns from the train split
Success
select [review_id, text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


In [4]:
# Show which columns each split ended up with, one Index per line.
print(*(frame.columns for frame in (train_df, valid_df, test_df)), sep='\n')
# Keep the test frame as the cell's last expression so the notebook renders it.
test_df

Index(['review_id', 'text', 'stars'], dtype='object')
Index(['review_id', 'text', 'stars'], dtype='object')
Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'text', 'useful',
       'user_id'],
      dtype='object')


Unnamed: 0,business_id,cool,date,funny,review_id,text,useful,user_id
0,V-qDa2kr5qWdhs7PU-l-3Q,0,2013-05-29,0,fBHWLNEJmhk6AkzmfLwWcw,Would like to give this more stars - usually I...,1,1pigoFijaHVWGrQl1_tYjw
1,C1zlvNlxlGZB8g0162QslQ,0,2012-03-02 15:51:49,0,ldEQ02aP1OeSa5N2beseNg,My wife and I took some friends here after din...,0,BKWPuPZFcGmgjRFRzoq1pw
2,0FOON_PNvG0ZxIZh6Jcv2A,0,2013-09-24 20:31:37,0,0oGr6v9VjtRsRsROGMoWTA,My husband and I had lunch here for the first ...,0,BYVYXKqNs-vv-N1ZhRMs0g
3,r49iBfbnfoK7yt4rdsL_7g,0,2018-10-20 01:34:08,0,eg5eJ5HmqXuzkxucnKvMTw,I love coming here with my friends! Great for ...,2,dpzmyNglDMeTgV3T5ylUSQ
4,xnLNPkL7bbdhD842T4oPqg,0,2016-09-25,1,BNDAe34Mxj--Brkzcfi4QA,Make sure that you double check how much these...,1,yk9wx31bfMEe_IXB8Q-ylA
...,...,...,...,...,...,...,...,...
3995,x_0Vf8AVBk_auLnNHRjoVA,2,2013-05-18 03:06:21,0,s7FLCfjgopRM6olA1NSccg,We live nearby and have stopped by this McDona...,0,Nf3VduiXhQVZRvM2GiXi-w
3996,KAJAsjVhYUPb6b_yodVqvA,0,2018-05-06 05:33:47,0,oJUnsu4PpTZz-kCE88-9uQ,It was boring as ever! All Spanish music so I ...,0,T3hk43jr0t7ZK8RPmce4sQ
3997,EnKpL0rRg1MTTKncmxbnMA,0,2012-03-21 20:49:25,0,celcHgmV26VvtzGdUFsR5w,"Was a long time customer, I was entertaining c...",1,WFWzzvWM45zTx-EShrVVxw
3998,-NR4KqS6lHseNvJ-GFzfMA,2,2016-08-14,1,69yY48SDj-UDCKlGgn-nqg,I really like this place! I like how you can t...,2,SS3sFA9ksCT9bjocM3Wbug


### (2) Training and validating 
The following example shows you how to train your model using the train set, and evaluate on the validation set.  
As an example, we only use the text data for training. Feel free to use other columns in your implementation.  

The model performance on the validation set can be roughly regarded as your model's final performance, so we can use it to search for optimal hyper-parameters.

In [5]:
# Turn the relevant DataFrame columns into NumPy arrays.
# Only the review text is used as model input in this example.
x_train, y_train = (np.array(train_df[name]) for name in ('text', 'stars'))
x_valid, y_valid = (np.array(valid_df[name]) for name in ('text', 'stars'))

# No labels are extracted for the test split, only its inputs.
x_test = np.array(test_df['text'])

You can use the validation data to choose the hyperparameters.  
For example, you can decide which value of C (1 or 100) is better by evaluating on the validation data.

# Grid search

In [6]:
# Baseline text classifier: TF-IDF features feeding a support vector machine.
# Both stages are created with their default settings.
tfidf, svm = TfidfVectorizer(), SVC()
steps = [('tfidf', tfidf), ('svm', svm)]
pipe = Pipeline(steps=steps)

In [8]:
# Hyper-parameter candidates, keyed as <pipeline step>__<parameter name>.
param_grid = dict(
    tfidf__max_features=[8000, 9000, 11000, 10000, 12000],
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    svm__C=[1, 1.5, 2, 2.5, 3, 4],
    svm__kernel=['linear', 'rbf'],
)

# Search the grid with 5-fold cross-validation on the training data,
# running up to 15 jobs in parallel.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=15,
    verbose=1,
)
grid_search.fit(x_train, y_train)

# Report the best parameter combination and its score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 180 candidates, totalling 900 fits




Best parameters: {'svm__C': 1.5, 'svm__kernel': 'rbf', 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best score: 0.6543


# Train with best model

In [15]:
# Rebuild the pipeline with fixed hyper-parameters for the final model.
# NOTE(review): C=2 is used here, while the grid search output recorded in this
# notebook reports svm__C=1.5 as the best value - confirm this is intentional.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
svm = SVC(kernel='rbf', C=2)
pipe = Pipeline(steps=[('tfidf', tfidf), ('svm', svm)])

# Train on the training split.
pipe.fit(x_train, y_train)

# Evaluate on the validation set and predict on the test set

In [16]:
# Score the fitted pipeline on the validation split.
y_pred = pipe.predict(x_valid)

# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_valid, y_pred=y_pred))
print("\n\n")
# Confusion matrix of true labels against predicted labels.
print(confusion_matrix(y_true=y_valid, y_pred=y_pred))
# Accuracy: the fraction of predictions that equal the true label.
print('accuracy', (y_valid == y_pred).mean())

              precision    recall  f1-score   support

           1       0.70      0.87      0.77       292
           2       0.53      0.26      0.35       163
           3       0.45      0.30      0.36       232
           4       0.46      0.47      0.47       421
           5       0.77      0.85      0.81       892

    accuracy                           0.66      2000
   macro avg       0.58      0.55      0.55      2000
weighted avg       0.64      0.66      0.64      2000




[[253  18   6   5  10]
 [ 64  42  37  14   6]
 [ 25  17  70  87  33]
 [ 11   2  39 197 172]
 [  8   1   5 123 755]]
accuracy 0.6585


In [17]:
# Predict on the validation split and export the predictions.
y_valid_pred = pipe.predict(x_valid)
# Build a separate frame for the export instead of assigning into valid_df:
# overwriting valid_df['stars'] would replace the ground-truth labels, so any
# later re-run of the data preparation / evaluation cells would compare the
# predictions against themselves.
valid_pred_df = valid_df[["review_id", "text"]].assign(stars=y_valid_pred)
valid_pred_df.to_csv("data/valid_pred.csv", index=False)

In [18]:
# Predict star ratings for the test reviews and attach them to the frame.
y_test_pred = pipe.predict(x_test)
test_df['stars'] = y_test_pred
# Export only the review id, text and predicted rating, without the row index.
test_df.to_csv("data/test_pred.csv", columns=["review_id", "text", "stars"], index=False)