# General pipeline for project 1
This is an example pipeline showing you how to  
(1) Load the provided data;  
(2) Train models on the train set, and use the validation set to evaluate your model performance;  
(3) Generate predictions (pred.csv) on the test set, which is ready for submission.

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

### (1) Loading data
The following code shows how to load the datasets for this project.  
Note that we do not release the labels (the "stars" column) for the test set, so evaluate your trained model on the validation set instead.

However, your submitted predictions (``pred.csv``) should be generated on the test set.

In [None]:
def load_data(split_name='train', columns=('text', 'stars'), folder='data'):
    """Load one dataset split from ``{folder}/{split_name}.csv``.

    Args:
        split_name: 'train', 'valid' or 'test', selecting the file to read.
        columns: Names of the columns to keep. "text" can be used as model
            input and "stars" holds the labels (sentiment); you are free to
            use columns other than "text" for prediction. A single name
            given as a plain string selects that one column.
        folder: Directory that contains the .csv files.

    Returns:
        The data restricted to ``columns`` when every requested column is
        present in the file; otherwise the full, unfiltered DataFrame
        (for example the test split, which has no "stars" column).

    Raises:
        FileNotFoundError: if the .csv file does not exist.
    """
    # Keep a bare string as-is for the final selection, but wrap it in a list
    # for the checks below so it is not iterated character by character.
    selection = columns if isinstance(columns, str) else list(columns)
    wanted = [selection] if isinstance(selection, str) else selection

    print(f"select [{', '.join(map(str, wanted))}] columns from the {split_name} split")
    # Read the file exactly once; a missing file raises here instead of being
    # misreported as a column problem.
    df = pd.read_csv(f'{folder}/{split_name}.csv')

    # Only a missing column triggers the "return everything" fallback; any
    # other error propagates to the caller instead of being swallowed.
    missing = [name for name in wanted if name not in df.columns]
    if missing:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        return df

    print("Success")
    return df.loc[:, selection]

In [None]:
# the train and valid splits share the same labelled layout
train_df, valid_df = (
    load_data(split, columns=['review_id', 'text', 'stars'])
    for split in ('train', 'valid')
)
# the test split has no labels (no 'stars' column), so this request cannot be
# satisfied and load_data returns every column of the test file instead
test_df = load_data('test', columns=['text', 'stars'])

In [None]:
# show the column index of each split (one per line), then display the test frame
print(*(frame.columns for frame in (train_df, valid_df, test_df)), sep='\n')
test_df

### (2) Training and validating 
The following example shows you how to train your model using the train set, and evaluate on the validation set.  
As an example, we only use the text data for training. Feel free to use other columns in your implementation.  

The model performance on the validation set can be roughly regarded as your model's final performance, so we can use it to search for optimal hyper-parameters.

In [None]:
# Turn the DataFrame columns into NumPy arrays for scikit-learn.
# Only the review text is used as input in this example; 'stars' is the label.
x_train, y_train = (np.array(train_df[col]) for col in ('text', 'stars'))
x_valid, y_valid = (np.array(valid_df[col]) for col in ('text', 'stars'))

# the test split has no labels, so only the inputs are extracted
x_test = np.array(test_df['text'])

You can use the validation data to choose the hyperparameters.
As an example, you can decide which value of C (1 or 100) is better by evaluating on the validation data.

# Grid search

In [None]:
# Base pipeline for the grid search: TF-IDF features feeding an SVM classifier.
# Both steps start with default settings; the search overrides them by step name.
tfidf, svm = TfidfVectorizer(), SVC()
steps = [('tfidf', tfidf), ('svm', svm)]
pipe = Pipeline(steps=steps)

In [None]:
# hyper-parameter candidates, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000, 2000, 5000, 10000, 20000],
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    svm__C=[0.1, 0.5, 1, 2, 4],
    svm__kernel=['linear', 'rbf'],
)

# exhaustive search over the grid, scored with 5-fold cross-validation on the train set
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=14, cv=5, verbose=1)
grid_search.fit(x_train, y_train)

# report the winning combination and its cross-validated score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

# Train with best model

In [None]:
# rebuild the pipeline with the hyper-parameters chosen for the final model
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
svm = SVC(kernel='rbf', C=2)
pipe = Pipeline([('tfidf', tfidf), ('svm', svm)])

# train on the full training split
pipe.fit(x_train, y_train)

In [None]:
# Wrap the pipeline in a bagging ensemble.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the positional form works on both sides of the rename.
# NOTE(review): `bagging` is never fit or used in the visible cells - confirm
# whether this cell is still needed.
bagging = BaggingClassifier(pipe, n_estimators=1, max_samples=0.5, max_features=0.5, n_jobs=14)

# Evaluate on the validation set and predict on the test set

In [None]:
# score the fitted pipeline on the held-out validation split
y_pred = pipe.predict(x_valid)

# per-class precision / recall / F1
print(classification_report(y_true=y_valid, y_pred=y_pred))
print("\n\n")
# rows are true labels, columns are predicted labels
print(confusion_matrix(y_true=y_valid, y_pred=y_pred))
# fraction of validation reviews whose predicted label matches the true one
print('accuracy', np.mean(y_pred == y_valid))

In [None]:
# Predict on the validation split and export the predictions.
y_valid_pred = pipe.predict(x_valid)
# Build the export frame with assign() instead of writing into valid_df:
# overwriting valid_df['stars'] would replace the ground-truth labels, so
# re-running the data-preparation cell would evaluate predictions against
# themselves. The written CSV is unchanged (review_id, text, stars).
valid_pred_df = valid_df[["review_id", "text"]].assign(stars=y_valid_pred)
valid_pred_df.to_csv("data/valid_pred.csv", index=False)

In [None]:
# Predict labels for the unlabelled test split and store them as its 'stars' column.
y_test_pred = pipe.predict(x_test)
test_df['stars'] = y_test_pred

# Export only the id, the text and the predicted label.
test_df.loc[:, ["review_id", "text", "stars"]].to_csv(path_or_buf="data/test_pred.csv", index=False)