# Tutorial 4 - Fundamental ML Algorithms Part II

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from scipy import sparse
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import f1_score
from collections import Counter

In [17]:
# Locations of the input data sets and of the files this notebook generates
dataPath = './dataset/'
writePath ='./generated/'

df_iris = pd.read_csv(dataPath + 'iris.csv')

# Matches every character that is neither a word character nor whitespace,
# i.e. punctuation. Written as a raw string so that \w and \s reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
regex = re.compile(r'[^\w\s]')
# Size of the vocabulary: number of most frequent words kept as features
FEATURES = 10000


# Names of the data splits; used as dictionary keys for the text/ratings/BoW dicts
data_types = ['train', 'valid', 'test']

### 1. Support Vector Machines

### 2. Ensemble Methods

In [4]:
def pre_process(filePath):
    ''' Pre-processing function: lowercase, remove punctuation.

    Reads a UTF-8 text file whose lines hold a review and an integer rating
    separated by a tab character.

    filePath: path of the file to read.

    Returns (reviews, ratings): a list of cleaned review strings (one string
    per line, punctuation removed via the module-level `regex`, lowercased)
    and the list of the corresponding integer ratings.

    Raises ValueError if a rating is not an integer and IndexError if a
    non-blank line has no tab-separated rating column.
    '''
    reviews, ratings = [], []
    with open(filePath, 'r', encoding="utf-8") as f:
        # Iterate the file lazily instead of loading every line up front
        for line in f:
            # Blank lines (e.g. a trailing empty line at the end of the file)
            # carry no review; skip them rather than fail on the rating column
            if not line.strip():
                continue
            splitted = line.split('\t')
            ratings.append(int(splitted[1].strip()))
            reviews.append(regex.sub('', splitted[0].strip()).lower())

    return reviews, ratings

In [12]:
def top_n_words(l2d, n):
    ''' Find the n most frequent whitespace-separated words in a list of lines.

    l2d: iterable of strings (lines of text).
    n: number of most frequent words to keep.

    Returns (words, pairs): the words ordered from most to least frequent,
    and the matching list of (word, count) tuples.
    '''
    tally = Counter()
    for line in l2d:
        tally.update(line.split())

    ranked = tally.most_common(n)
    words = [token for token, _ in ranked]

    return words, ranked

In [10]:
# Pre-process the train, validation and test sets of the yelp data, collecting
# the cleaned text and the ratings in dictionaries keyed by split name
yelp_text, yelp_ratings = {}, {}
for split_name, split_file in (('train', 'yelp-train.txt'),
                               ('valid', 'yelp-valid.txt'),
                               ('test', 'yelp-test.txt')):
    yelp_text[split_name], yelp_ratings[split_name] = pre_process(dataPath + split_file)

# Per-split names referenced by later cells
yelp_tr_x, yelp_tr_rate = yelp_text['train'], yelp_ratings['train']
yelp_val_x, yelp_val_rate = yelp_text['valid'], yelp_ratings['valid']
yelp_test_x, yelp_test_rate = yelp_text['test'], yelp_ratings['test']

In [13]:
# Build the vocabulary from the training split only: yelp_vocab holds the
# FEATURES most frequent words, yelp_count the matching (word, count) pairs
yelp_vocab, yelp_count = top_n_words(yelp_tr_x, FEATURES)

In [15]:
def vocab_to_txt(vocab_count, filePath, fileName):
    ''' Write a vocabulary file and return lookup dictionaries for it.

    vocab_count: sequence of (word, count) entries; the position of an entry
        in the sequence becomes the index of its word.
    filePath: directory prefix of the output file (concatenated as-is with
        fileName, so it must end with a path separator).
    fileName: name of the output file.

    Each line of the output file is "<word> <index> <count>".

    Returns (yelp_dict, vocab_index): word -> count and word -> index.
    '''
    write_path = filePath + fileName
    yelp_dict = {}
    vocab_index = {}
    # The context manager closes the file even if a write fails; the explicit
    # encoding keeps non-ASCII words writable regardless of the platform default
    with open(write_path, 'w', encoding='utf-8') as f:
        for i, entry in enumerate(vocab_count):
            word, count = entry[0], entry[1]
            yelp_dict[word] = count
            vocab_index[word] = i
            f.write(word + ' ' + str(i) + ' ' + str(count) + '\n')

    return yelp_dict, vocab_index

In [18]:
# Write the vocabulary file (one "word index count" line per word) and keep the
# word -> count and word -> index dictionaries returned by vocab_to_txt
vocab_dict, vocab_indices = vocab_to_txt(yelp_count, writePath,'yelp-vocab.txt')

In [23]:
def convert_bow(text, ratings):
    ''' Convert pre-processed text to binary and frequency bag-of-words representations.

    text: dict mapping each split name in `data_types` to a list of strings.
    ratings: dict mapping each split name in `data_types` to its ratings.

    Returns (bin, freq): two dicts keyed by split name whose values are
    [features, ratings] lists. The binary features are a scipy CSR matrix of
    0/1 word-presence indicators; the frequency features are a dense numpy
    array of word counts normalized per row with sklearn's `normalize`.
    '''
    # Local names deliberately avoid shadowing the builtins `bin` and `type`
    bow_bin = {}
    bow_freq = {}

    # Both vectorizers are given the fixed vocabulary `yelp_vocab`, so the
    # columns are the same for every split
    vectorizer = CountVectorizer(vocabulary=yelp_vocab)
    vectorizer_bin = CountVectorizer(vocabulary=yelp_vocab, binary=True)

    for split in data_types:
        # Normalize the sparse counts directly and densify afterwards with
        # toarray(): this yields a plain ndarray and never hands an np.matrix
        # (the result of .todense()) to scikit-learn
        v_freq = normalize(vectorizer.fit_transform(text[split])).toarray()
        # The vectorizer output is already sparse; no dense round trip needed
        v_bin = sparse.csr_matrix(vectorizer_bin.fit_transform(text[split]))
        bow_freq[split] = [v_freq, ratings[split]]
        bow_bin[split] = [v_bin, ratings[split]]

    return bow_bin, bow_freq

In [28]:
yelp_bin, yelp_freq = convert_bow(yelp_text, yelp_ratings)

In [29]:
def ref_clf(clf_list, dataset):
    ''' Print the F1-measure of benchmark classifiers, i.e. majority and random classifiers.

    clf_list: iterable of (classifier, name) pairs; each classifier must
        provide fit(X, y) and predict(X).
    dataset: dict with 'train' and 'test' entries, each a [features, ratings]
        list.

    Every classifier is fitted on the train split and its micro-averaged
    F1-measure on the test split is printed. Returns None.
    '''
    for clf in clf_list:
        clf[0].fit(dataset['train'][0], dataset['train'][1])
        pred = clf[0].predict(dataset['test'][0])
        # Score against the ratings of the dataset that was passed in, not
        # against a notebook-level global, so the function works for any dataset
        print(clf[1] + ' F1-measure: ' + str(f1_score(dataset['test'][1], pred, average='micro')))
        print('\n')
    return

In [30]:
def train_clf(dataset, classifier, parameters):
    ''' Train and evaluate a classifier.

    dataset: dict with 'train', 'valid' and 'test' entries, each a
        [features, ratings] list.
    classifier: estimator providing fit(X, y) and predict(X).
    parameters: hyper-parameter grid handed to tune_hyper_params, or None to
        fit the classifier as given on the train split only.

    Returns (f1, best_param): a dict with the micro-averaged F1-measure on the
    train, valid and test splits, and the best parameters found (None when
    parameters is None).
    '''
    if parameters is not None:
        # Use the grid passed by the caller rather than a notebook-level global.
        # NOTE(review): tune_hyper_params is not defined in the visible part of
        # this notebook; it presumably returns a search object exposing
        # best_params_ - confirm before calling with a non-None grid.
        classifier = tune_hyper_params(classifier, dataset, parameters)

        # After tuning, refit on train + valid; note that the valid F1-measure
        # reported below is then computed on data seen during training
        train_val_feat = sparse.vstack([dataset['train'][0], dataset['valid'][0]])
        train_val_ratings = np.concatenate((dataset['train'][1], dataset['valid'][1]))

        classifier.fit(train_val_feat, train_val_ratings)

    else:
        classifier.fit(dataset['train'][0], dataset['train'][1])

    pred_train = f1_score(dataset['train'][1], classifier.predict(dataset['train'][0]), average='micro')
    pred_val = f1_score(dataset['valid'][1], classifier.predict(dataset['valid'][0]), average='micro')
    pred_test = f1_score(dataset['test'][1], classifier.predict(dataset['test'][0]), average='micro')

    f1 = {'F1-measure Train': pred_train, 'F1-measure Valid': pred_val, 'F1-measure Test': pred_test}
    best_param = None if parameters is None else classifier.best_params_

    return f1, best_param

In [35]:
# Linear SVM, fitted with default settings: None is passed as the grid, so the
# candidate max_iter values in params are not used by this call
params = [{'max_iter': list(range(0, 1000, 100))}]
f1_SVM, best_params = train_clf(yelp_bin, LinearSVC(), None)
print('Linear SVM', f1_SVM, sep='\n')
print(f'Best params - {best_params}')

Linear SVM
{'F1-measure Train': 0.99642857142857144, 'F1-measure Valid': 0.45900000000000002, 'F1-measure Test': 0.44500000000000001}
Best params - None


### 2.1 Random Forests

In [38]:
# Random forest with default settings (no hyper-parameter grid is passed)
f1_rand_forest, best_params = train_clf(yelp_bin, RandomForestClassifier(), None)
print('Random Forest', f1_rand_forest, sep='\n')
print(f'Best params - {best_params}')

Random Forest
{'F1-measure Train': 0.99142857142857144, 'F1-measure Valid': 0.374, 'F1-measure Test': 0.39149999999999996}
Best params - None


### 2.2 Gradient Boosted Decision Trees

In [39]:
# Gradient boosted decision trees with default settings (no hyper-parameter grid)
f1_grad_boosted, best_params = train_clf(yelp_bin, GradientBoostingClassifier(), None)
# Label the results with the model that produced them (the label previously
# read 'Random Forest', copied from the cell above)
print('Gradient Boosted Decision Trees')
print(f1_grad_boosted)
print('Best params - ' + str(best_params))

Random Forest
{'F1-measure Train': 0.66085714285714281, 'F1-measure Valid': 0.45200000000000001, 'F1-measure Test': 0.47599999999999998}
Best params - None
