In [32]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/comp4332-project1

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/comp4332-project1


In [33]:
import graphviz
import os
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification, load_iris, make_hastie_10_2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()
nltk.download('stopwords')
nltk.download('punkt')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
def load_data(split_name='train', columns=['text', 'stars']):
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        df = df.loc[:,columns]
        print("succeed!")
        return df
    except:
        print("Failed, then try to ")
        print(f"select all columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        return df

In [35]:
train_df = load_data('train', columns=['text','stars'])
val_df = load_data('valid', columns=['text','stars'])
test_df = load_data('test')

select [text, stars] columns from the train split
succeed!
select [text, stars] columns from the valid split
succeed!
select [text, stars] columns from the test split
Failed, then try to 
select all columns from the test split


In [36]:
def lower(s):
    """
    :param s: a string.
    return a string with lower characters
    Note that we allow the input to be nested string of a list.
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: 'text mining is to identify useful information.'
    """
    if isinstance(s, list):
        return [lower(t) for t in s]
    if isinstance(s, str):
        return s.lower()
    else:
        raise NotImplementedError("unknown datatype")


def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)


def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n= 3):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results

def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

def get_onehot_vector(feats, feats_dict):
    """
    :param data: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector


def get_feats_dict(feats, min_freq=-1, max_freq=-1, max_size=-1):
    """
    :param data: a list of features, type: list(list)
    :param min_freq: the lowest fequency that the fequency of a feature smaller than it will be filtered out, type: int
    :param max_freq: the highest fequency that the fequency of a feature larger than it will be filtered out, type: int
    :param max_size: the max size of feature dict, type: int
    return a feature dict that maps features to indices, sorted by frequencies
    # Counter document: https://docs.python.org/3.6/library/collections.html#collections.Counter
    """
    # count all features
    feat_cnt = Counter(feats) # ["text", "text", "mine"] --> {"text": 2, "mine": 1}
    if max_size > 0 and min_freq == -1 and max_freq == -1:
        valid_feats = [f for f, cnt in feat_cnt.most_common(max_size)]
    else:
        valid_feats = list()
        for f, cnt in feat_cnt.most_common():
            if (min_freq == -1 or cnt >= min_freq) and \
                (max_freq == -1 or cnt <= max_freq):
                valid_feats.append(f)
    if max_size > 0 and len(valid_feats) > max_size:
        valid_feats = valid_feats[:max_size]        
    print("Size of features:", len(valid_feats))
    
    # build a mapping from features to indices
    feats_dict = dict(zip(valid_feats, range(len(valid_feats))))
    return feats_dict
    
def get_index_vector(feats, feats_dict, max_len):
    """
    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    :param feats: a list of features, type: list
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(max_len, dtype=np.int64)
    for i, f in enumerate(feats):
        if i == max_len:
            break
        # get the feature index, return 1 (<unk>) if the feature is not existed
        f_idx = feats_dict.get(f, 1)
        vector[i] = f_idx
    return vector

In [37]:
train_df['tokens'] = train_df['text'].map(tokenize).map(filter_stopwords).map(lower)
x_train = train_df['text']
y_train = train_df['stars']

## Some traditional machine learning approaches
Note that Logistic Regression is implemented in another .ipynb file.

SVM Classification

In [38]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'})
model = SVC(C=1.0, kernel='sigmoid', degree=3 , gamma='auto', coef0=0.0,
          shrinking=True,
          probability=False, tol=0.001, cache_size=200, class_weight=None,
          verbose=False,
          max_iter=-1, decision_function_shape='ovr', random_state=None)
steps = [('tfidf', tfidf),('model', model)]
pipe = Pipeline(steps)
# print(pipe)
pipe.fit(x_train, y_train)
x_valid = val_df['text']
y_valid = val_df['stars']
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.26      1.00      0.41       517
           2       0.00      0.00      0.00       278
           3       0.00      0.00      0.00       344
           4       0.00      0.00      0.00       427
           5       0.00      0.00      0.00       434

    accuracy                           0.26      2000
   macro avg       0.05      0.20      0.08      2000
weighted avg       0.07      0.26      0.11      2000




[[517   0   0   0   0]
 [278   0   0   0   0]
 [344   0   0   0   0]
 [427   0   0   0   0]
 [434   0   0   0   0]]
accuracy 0.2585


  _warn_prf(average, modifier, msg_start, len(result))


KNN Classification

In [39]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'})
model = KNeighborsClassifier(n_neighbors=6, weights='uniform', algorithm='auto',
                           leaf_size=50, p=2, metric='minkowski', 
                           metric_params=None, n_jobs=1)
steps = [('tfidf', tfidf),('model', model)]
pipe = Pipeline(steps)
# print(pipe)
pipe.fit(x_train, y_train)
x_valid = val_df['text']
y_valid = val_df['stars']
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.44      0.83      0.58       517
           2       0.20      0.18      0.19       278
           3       0.28      0.23      0.25       344
           4       0.36      0.28      0.31       427
           5       0.61      0.22      0.32       434

    accuracy                           0.39      2000
   macro avg       0.38      0.35      0.33      2000
weighted avg       0.40      0.39      0.36      2000




[[429  33  24  19  12]
 [149  51  48  28   2]
 [119  66  79  69  11]
 [117  64  91 118  37]
 [152  45  44  97  96]]
accuracy 0.3865


## Tree Methos

We will try some tree method in the following part of the code:

Decision Tree Classification

In [48]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'})
model = DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=None,
                             min_samples_split=2,min_samples_leaf=1,
                             min_weight_fraction_leaf=0.0,max_features=None,
                             random_state=None,max_leaf_nodes=None,
                             min_impurity_decrease=0.0,min_impurity_split=1e-07,
                             class_weight=None, presort=False)
steps = [('tfidf', tfidf),('model', model)]
pipe = Pipeline(steps)
# print(pipe)
pipe.fit(x_train, y_train)
x_valid = val_df['text']
y_valid = val_df['stars']
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))



              precision    recall  f1-score   support

           1       0.56      0.58      0.57       517
           2       0.26      0.23      0.24       278
           3       0.27      0.29      0.28       344
           4       0.32      0.31      0.31       427
           5       0.46      0.44      0.45       434

    accuracy                           0.39      2000
   macro avg       0.37      0.37      0.37      2000
weighted avg       0.39      0.39      0.39      2000




[[301  60  59  49  48]
 [ 86  63  63  42  24]
 [ 73  60 101  73  37]
 [ 47  41  95 134 110]
 [ 35  22  61 126 190]]
accuracy 0.3945


Adaboost Classification

In [44]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'})
model = AdaBoostClassifier(algorithm='SAMME.R', learning_rate=1e-1, n_estimators=100, random_state=2)
steps = [('tfidf', tfidf),('model', model)]
pipe = Pipeline(steps)
# print(pipe)
pipe.fit(x_train, y_train)
x_valid = val_df['text']
y_valid = val_df['stars']
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.56      0.84      0.67       517
           2       0.50      0.00      0.01       278
           3       0.37      0.43      0.40       344
           4       0.43      0.47      0.45       427
           5       0.63      0.54      0.58       434

    accuracy                           0.51      2000
   macro avg       0.50      0.46      0.42      2000
weighted avg       0.51      0.51      0.47      2000




[[432   1  42  26  16]
 [138   1  95  33  11]
 [ 92   0 148  91  13]
 [ 55   0  77 201  94]
 [ 49   0  34 118 233]]
accuracy 0.5075


Gradient Boosting Classification (GBDT)

In [45]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'})
model = GradientBoostingClassifier(loss='deviance',learning_rate=0.1,n_estimators=100,subsample=1.0,criterion='friedman_mse',
                                 min_samples_split=2,min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0,max_depth=3,min_impurity_decrease=0.0,min_impurity_split=None,
                                 init=None,random_state=None,max_features=None,verbose=0,max_leaf_nodes=None,
                                 warm_start=False,presort='auto',
                                 validation_fraction=0.1,n_iter_no_change=None, tol=0.0001) 
steps = [('tfidf', tfidf),('model', model)]
pipe = Pipeline(steps)
# print(pipe)
pipe.fit(x_train, y_train)
x_valid = val_df['text']
y_valid = val_df['stars']
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))



              precision    recall  f1-score   support

           1       0.66      0.84      0.74       517
           2       0.37      0.14      0.20       278
           3       0.43      0.46      0.45       344
           4       0.44      0.45      0.45       427
           5       0.64      0.64      0.64       434

    accuracy                           0.55      2000
   macro avg       0.51      0.51      0.49      2000
weighted avg       0.53      0.55      0.53      2000




[[433  21  30  17  16]
 [113  38  79  37  11]
 [ 56  31 159  79  19]
 [ 28   7  85 193 114]
 [ 25   5  17 108 279]]
accuracy 0.551


Random Forest:

In [46]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'})
model = RandomForestClassifier(criterion='entropy',n_jobs=-1)
steps = [('tfidf', tfidf),('model', model)]
pipe = Pipeline(steps)
# print(pipe)
pipe.fit(x_train, y_train)
x_valid = val_df['text']
y_valid = val_df['stars']
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.54      0.92      0.68       517
           2       0.36      0.01      0.03       278
           3       0.43      0.30      0.35       344
           4       0.44      0.42      0.43       427
           5       0.61      0.66      0.63       434

    accuracy                           0.52      2000
   macro avg       0.48      0.46      0.43      2000
weighted avg       0.49      0.52      0.47      2000




[[476   1  15   7  18]
 [162   4  51  44  17]
 [115   5 103  96  25]
 [ 66   0  60 179 122]
 [ 55   1  11  81 286]]
accuracy 0.524


Xgboost Classification

In [47]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'})
model = XGBClassifier(learning_rate=0.05, n_estimators=100, silent=True, 
                    objective='multi:softmax', num_class=5, booster='gbtree', n_jobs=-1,
                    subsample=1, colsample_bytree=1, max_depth=12,
                    colsample_bylevel=1, reg_alpha=2, reg_lambda=2, scale_pos_weight=1,
                    base_score=0.5,
                    random_state=0, seed=None, missing=None)
steps = [('tfidf', tfidf),('model', model)]
pipe = Pipeline(steps)
# print(pipe)
pipe.fit(x_train, y_train)
x_valid = val_df['text']
y_valid = val_df['stars']
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.65      0.87      0.74       517
           2       0.35      0.12      0.18       278
           3       0.42      0.42      0.42       344
           4       0.47      0.50      0.49       427
           5       0.67      0.62      0.64       434

    accuracy                           0.56      2000
   macro avg       0.51      0.51      0.49      2000
weighted avg       0.53      0.56      0.53      2000




[[451  22  19  12  13]
 [121  33  85  29  10]
 [ 56  31 143  94  20]
 [ 40   5  74 215  93]
 [ 31   4  21 108 270]]
accuracy 0.556
