In [21]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np

20newsgroups dataset (A collection of 20,000 news items across 20 categories)

In [5]:
dataset = fetch_20newsgroups(subset="all", random_state=3116 , download_if_missing=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Subset the dataset to only the following two categories named as ’sci.med’ and ’comp.graphics’

In [30]:
dataset_boolean_mask_1 = dataset.target == 1
dataset_boolean_mask_2 = dataset.target == 13
dataset_boolean_mask = dataset_boolean_mask_1 | dataset_boolean_mask_2

We can that the needed target values for this task are 1, 13

In [34]:
dataset_data = [data_value for data_value, mask_value in zip(dataset.data, dataset_boolean_mask) if mask_value]

In [36]:
dataset_target = dataset.target[dataset_boolean_mask]

1) Preprocessing textual data to remove punctuation, stop-words (list available via external libraries
such as NLTK and spaCy).

In [47]:
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize, RegexpTokenizer

In [48]:
#First getting the list of english language stopwords
stop_words = set(stopwords.words('english'))
dataset_data_no_stopwords = []

for news_index in range(len(dataset_data)) :
    filtered_news = []
    current_news = dataset_data[news_index]
    #Filtering out punctuation
    punctuation_removing_tokenizer = nltk.RegexpTokenizer(r"\w+")
    word_tokens = punctuation_removing_tokenizer.tokenize(current_news)
    for word in word_tokens :
        if word not in stop_words :
            filtered_news.append(word)
    dataset_data_no_stopwords.append(filtered_news)

2) Implementing a bag-of-words feature representation for each text sample

In [64]:
#Firstly counting the number of unique words
unique_words_list = []
for news in dataset_data_no_stopwords :
    for word in news :
        if word not in unique_words_list :
            unique_words_list.append(word)


['From',
 'ame_0123',
 'bigdog',
 'engr',
 'arizona',
 'edu',
 'Terrance',
 'J',
 'Dishongh',
 'Subject',
 'Strain',
 'Gage',
 'Applications',
 'vivo',
 'Organization',
 'University',
 'Arizona',
 'Lines',
 '14',
 'Greeting',
 'I',
 'starting',
 'work',
 'project',
 'trying',
 'make',
 'strain',
 'gages',
 'bond',
 'bone',
 'period',
 'several',
 'months',
 'currently',
 'using',
 'hydroxyapaptite',
 'back',
 'tried',
 'M',
 'bonding',
 'Apart',
 'two',
 'application',
 'methods',
 'seem',
 'much',
 'else',
 'literature',
 'engineering',
 'background',
 'medical',
 'biological',
 'would',
 'interest',
 'ideas',
 'stimulte',
 'growth',
 'surface',
 'cortical',
 'Thanks',
 'oyur',
 'help',
 'Advance',
 'zyeh',
 'caspian',
 'usc',
 'zhenghao',
 'yeh',
 'Re',
 'Need',
 'polygon',
 'splitting',
 'algo',
 'Southern',
 'California',
 'Los',
 'Angeles',
 'CA',
 '25',
 'Distribution',
 'world',
 'NNTP',
 'Posting',
 'Host',
 'Keywords',
 'polygons',
 'clipping',
 'In',
 'article',
 '1qvq4b',
 '

In [67]:
#constructing bag-of-words matrix representation for each sample
#Initializing with zeros before counting
dataset_bag_of_words = np.zeros((len(dataset_data_no_stopwords), len(unique_words_list)))

for news_index in range(len(dataset_data_no_stopwords)) :
    for word_index in range(len(unique_words_list)) :
        dataset_bag_of_words[news_index, word_index] = dataset_bag_of_words[news_index, word_index] + dataset_data_no_stopwords[news_index].count(unique_words_list[word_index])

3) Implementing a TF-IDF feature representation for each text sample

In [78]:
import math

In this part I use the following formula for idf : idf= -log(num_documents_per_word / total_num_documents)

In [101]:
#calculating TF-IDF value for each item in the bag of words
#Number of samples for idf calculations
N = dataset_bag_of_words.shape[0]
num_documents_per_word = np.zeros((dataset_bag_of_words.shape[1],1))
dataset_tf_idf = np.zeros(dataset_bag_of_words.shape)
for news_index in range(dataset_bag_of_words.shape[0]) :
    #Making sure that expensive operations occurs once per iteration
    words_count = np.sum(dataset_bag_of_words[news_index, :])
    for word_index in range(dataset_bag_of_words.shape[1]) :
        if(num_documents_per_word[word_index] == 0) :
            num_documents_per_word[word_index] = np.count_nonzero(dataset_bag_of_words[:,word_index])
        #calculating tf by dividing count over total number of words per document
        tf = dataset_bag_of_words[news_index, word_index] / words_count
        #calculaing idf as -log(num_documents_for_word/total_num_documents)
        #Preventing -0.0 values
        if(num_documents_per_word[word_index] == N) :
            idf = 0
        else :
            idf = -math.log(num_documents_per_word[word_index]/N)
        dataset_tf_idf[news_index, word_index] = tf*idf

4) Split the dataset randomly into train/validation/test splits according to ratios 80%:10%:10%

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
train_data, rest_data, train_target, rest_target = train_test_split(dataset_tf_idf, dataset_target, test_size=0.2, random_state=3116)
val_data, test_data, val_target, test_target = train_test_split(rest_data, rest_target, test_size=0.5, random_state=3116)


Exercise 1: Implementing Naive Bayes Classifier for Text Data

In this exercise all features are numerical so a normal distribution with mean and standard deviation for each feature with each class is calculated in the training phase

In [246]:
def calculate_means(train_data, train_target) :
    classes = np.unique(train_target)
    mean_matrix = np.zeros((classes.shape[0], train_data.shape[1]))
    for class_index in range(classes.shape[0]) :
        current_class_data = train_data[train_target==classes[class_index]]
        for feature_index in range(train_data.shape[1]) :
            #Getting the mean of a certain class with a certain feature 
            mean_matrix[class_index, feature_index] = np.sum(current_class_data[:,feature_index]) / current_class_data.shape[0]
    return mean_matrix

In [247]:
def calculate_stds(train_data, train_target, mean_matrix) :
    classes = np.unique(train_target)
    std_matrix = np.zeros((classes.shape[0], train_data.shape[1]))
    for class_index in range(classes.shape[0]) :
        current_class_data = train_data[train_target==classes[class_index]]
        for feature_index in range(train_data.shape[1]) :
            #Getting the mean of a certain class with a certain feature 
            mean_vector = np.ones(train_data[:,feature_index].shape)*mean_matrix[class_index, feature_index]
            std_matrix[class_index, feature_index] = np.sqrt(np.sum(np.square(np.subtract(train_data[:, feature_index], 
                                                                                  mean_vector))) / current_class_data.shape[0])
    return std_matrix

In [248]:
def calculate_classes_probabilities(train_target, classes) :
    classes_probabilities = np.zeros(classes.shape[0])
    for class_index in range(classes.shape[0]) :
        classes_probabilities[class_index] = train_target[train_target==classes[class_index]].shape[0]/train_target.shape[0]
    return classes_probabilities

In [249]:
def learn_naive_bayes(train_data, train_target) :
    mean_matrix = calculate_means(train_data, train_target)
    std_matrix = calculate_stds(train_data, train_target, mean_matrix)
    classes = np.unique(train_target)
    classes_probabilities = calculate_classes_probabilities(train_target, classes)
    return mean_matrix, std_matrix, classes, classes_probabilities

In the testing phase we use the mean and standard deviation to approximate the probability of the new instance according to the normal distribution then the class with the highest porbability is the class label for this instance

In [255]:
#Adding epsilon here to prevent devision by zero
def calculate_probability(X, mean, std, epsilon=0.00003) :
    first_term = 1/((std+epsilon)*np.sqrt(2*math.pi))
    second_term = np.exp(-0.5*((X-mean)/(std+epsilon))**2)
    probability = first_term*second_term
    #print(X, " ", mean, " ", std)
    return probability

In [262]:
#Calculating prediction for naive bayes
def predict_naive_bayes(test_data, test_target, mean_matrix, std_matrix, classes, classes_probabilities) :
    predictions = np.zeros(test_target.shape)
    for test_instance_index in range(test_data.shape[0]) :
        test_instance = test_data[test_instance_index]
        current_instance_probabilities = np.zeros(classes.shape[0])
        for class_index in range(classes.shape[0]) :
            current_instance_probabilities[class_index] = math.log(classes_probabilities[class_index])
            for feature_index in range(test_data.shape[1]) : 
                current_feature_class_prob = calculate_probability(test_instance[feature_index], mean_matrix[class_index, feature_index], std_matrix[class_index, feature_index])
                if(current_feature_class_prob == 0) :
                    current_log_probability = 0
                else :
                    current_log_probability = math.log(current_feature_class_prob)
                current_instance_probabilities[class_index] = current_instance_probabilities[class_index]+current_log_probability
        predictions[test_instance_index] = classes[np.argmax(current_instance_probabilities)]
        #print(current_instance_probabilities)
    return predictions

In [252]:
def score_naive_bayes(predictions, test_target) :
    return (np.sum(predictions==test_target) / test_target.shape[0])*100

In [297]:
#Removing some samples to make the data balanced for the naive bayes classifier to work well
difference = train_target[train_target == 13].shape[0] - train_target[train_target==1].shape[0]
train_data_balanced = train_data[train_target==13][:-difference].copy()
train_target_balanced = train_target[train_target==13][:-difference].copy()
train_data_balanced = np.append(train_data_balanced, train_data[train_target==1], axis=0)
train_target_balanced = np.append(train_target_balanced, train_target[train_target==1], axis=0)

In [298]:
mean_matrix, std_matrix, classes, classes_probabilities = learn_naive_bayes(train_data_balanced, train_target_balanced)

In [299]:
validation_prediction = predict_naive_bayes(val_data, val_target, mean_matrix, std_matrix, classes, classes_probabilities)
print("Naive bayes validation accuracy is ", score_naive_bayes(validation_prediction, val_target))

Naive bayes validation accuracy is  95.40816326530613


In [301]:
test_prediction = predict_naive_bayes(test_data, test_target, mean_matrix, std_matrix, classes, classes_probabilities)
print("Naive bayes test accuracy is ", score_naive_bayes(test_prediction, test_target))

Naive bayes test accuracy is  94.9238578680203


Sklearn Naive bayes accuracy

In [186]:
from sklearn.naive_bayes import GaussianNB

In [303]:
gnb = GaussianNB()
y_pred = gnb.fit(train_data_balanced, train_target_balanced).predict(test_data)


In [304]:
print("Sklearn naive bayes test accuracy is ", score_naive_bayes(y_pred, test_target))

Sklearn naive bayes test accuracy is  95.43147208121827


Exercise 2: Implementing SVM Classifier via Scikit-Learn

In [105]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [111]:
#Validating different SVM kernels with different hyperparameters
penalty_linear_SVC = ['l1', 'l2']
C_parameters = [0.5, 1, 2, 10]

best_accuracy_linear_svm = 0
best_hyperparameters_linear_svm = {'penalty type' : 'l1', 'C parameter' : 0.5}

#Validation of linear SVM
for penalty_type in penalty_linear_SVC :
    for C_parameter in C_parameters :
        if(penalty_type == 'l1') :
            model = LinearSVC(penalty=penalty_type, dual=False, C=C_parameter)
        else :
            model = LinearSVC(penalty=penalty_type, C=C_parameter)
        model.fit(train_data, train_target)
        current_model_score = 100*model.score(val_data, val_target)
        if(current_model_score > best_accuracy_linear_svm) :
            best_accuracy_linear_svm = current_model_score
            best_hyperparameters_linear_svm = {'penalty_type' : penalty_type, 'C parameter' : C_parameter}
        print('current_accuracy is ', current_model_score, 'with penalty type ', penalty_type, ' and C is ', C_parameter)
        

current_accuracy is  84.6938775510204 with penalty type  l1  and C is  0.5
current_accuracy is  88.26530612244898 with penalty type  l1  and C is  1
current_accuracy is  90.81632653061224 with penalty type  l1  and C is  2
current_accuracy is  91.83673469387756 with penalty type  l1  and C is  10
current_accuracy is  97.44897959183673 with penalty type  l2  and C is  0.5
current_accuracy is  97.44897959183673 with penalty type  l2  and C is  1
current_accuracy is  97.44897959183673 with penalty type  l2  and C is  2
current_accuracy is  97.44897959183673 with penalty type  l2  and C is  10


In [112]:
print('Best validation accuracy for SVM is ', best_accuracy_linear_svm, ' with paramters : ', best_hyperparameters_linear_svm)

Best validation accuracy for SVM is  97.44897959183673  with paramters :  {'penalty_type': 'l2', 'C parameter': 0.5}


In [117]:
#Trying different values with the validation set for polynomial kernel
poly_svm_degrees = [2,3]
gamma_values = ['scale', 'auto']
C_values = [0.5, 1, 2]

best_accuracy_poly_svm = 0
best_hyperparameters_poly_svm = {'degree' : 2, 'gamma' : 'scale', 'C parameter' : 0.5}

for poly_svm_degree in poly_svm_degrees :
    for gamma_value in gamma_values :
        for C_value in C_values :
            model = SVC(kernel='poly', degree=poly_svm_degree, cache_size=6000, gamma=gamma_value, C=C_value)
            model.fit(train_data, train_target)
            current_score = 100*model.score(val_data, val_target)
            current_hyperparameters = {'degree' : poly_svm_degree, 'gamma' : gamma_value, 'C parameter' : C_value}
            if(current_score > best_accuracy_poly_svm) :
                best_accuracy_poly_svm = current_score
                best_hyperparameters_poly_svm = current_hyperparameters
            print('Current accuracy us ', current_score, ' with hyperparameters ', current_hyperparameters)

Current accuracy us  54.59183673469388  with hyperparameters  {'degree': 2, 'gamma': 'scale', 'C parameter': 0.5}
Current accuracy us  66.3265306122449  with hyperparameters  {'degree': 2, 'gamma': 'scale', 'C parameter': 1}
Current accuracy us  72.95918367346938  with hyperparameters  {'degree': 2, 'gamma': 'scale', 'C parameter': 2}
Current accuracy us  47.44897959183674  with hyperparameters  {'degree': 2, 'gamma': 'auto', 'C parameter': 0.5}
Current accuracy us  47.44897959183674  with hyperparameters  {'degree': 2, 'gamma': 'auto', 'C parameter': 1}
Current accuracy us  47.44897959183674  with hyperparameters  {'degree': 2, 'gamma': 'auto', 'C parameter': 2}
Current accuracy us  49.48979591836735  with hyperparameters  {'degree': 3, 'gamma': 'scale', 'C parameter': 0.5}
Current accuracy us  50.51020408163265  with hyperparameters  {'degree': 3, 'gamma': 'scale', 'C parameter': 1}
Current accuracy us  52.55102040816326  with hyperparameters  {'degree': 3, 'gamma': 'scale', 'C param

In [118]:
print('Best validation accuracy for polynomial SVM is ', best_accuracy_poly_svm, ' with paramters : ', best_hyperparameters_poly_svm)

Best validation accuracy for polynomial SVM is  72.95918367346938  with paramters :  {'degree': 2, 'gamma': 'scale', 'C parameter': 2}


In [122]:
#Measuring accuracies for rbf and sigmoid kernels with different C parameters
kernels = ['rbf', 'sigmoid']
C_values = [0.5, 1, 2]
gamma_values = ['scale', 'auto']

best_accuracy_other_kernels_svm = 0
best_hyperparameters_other_kernels_svm = {'kernel' : 'rbf', 'gamma' : 'scale', 'C parameter' : 0.5}


for kernel in kernels :
    for C_value in C_values :
        for gamma in gamma_values :
            model = SVC(kernel=kernel, gamma=gamma, cache_size=6000, C=C_value)
            model.fit(train_data, train_target)
            current_score = 100*model.score(val_data, val_target)
            current_hyperparameters = {'kernel' : kernel, 'gamma' : gamma, 'C parameter' : C_value}
            if(current_score > best_accuracy_other_kernels_svm) :
                best_accuracy_other_kernels_svm = current_score
                best_hyperparameters_other_kernels_svm = current_hyperparameters
            print('Current accuracy us ', current_score, ' with hyperparameters ', current_hyperparameters)

Current accuracy us  88.26530612244898  with hyperparameters  {'kernel': 'rbf', 'gamma': 'scale', 'C parameter': 0.5}
Current accuracy us  47.44897959183674  with hyperparameters  {'kernel': 'rbf', 'gamma': 'auto', 'C parameter': 0.5}
Current accuracy us  91.83673469387756  with hyperparameters  {'kernel': 'rbf', 'gamma': 'scale', 'C parameter': 1}
Current accuracy us  47.44897959183674  with hyperparameters  {'kernel': 'rbf', 'gamma': 'auto', 'C parameter': 1}
Current accuracy us  92.85714285714286  with hyperparameters  {'kernel': 'rbf', 'gamma': 'scale', 'C parameter': 2}
Current accuracy us  47.44897959183674  with hyperparameters  {'kernel': 'rbf', 'gamma': 'auto', 'C parameter': 2}
Current accuracy us  97.44897959183673  with hyperparameters  {'kernel': 'sigmoid', 'gamma': 'scale', 'C parameter': 0.5}
Current accuracy us  47.44897959183674  with hyperparameters  {'kernel': 'sigmoid', 'gamma': 'auto', 'C parameter': 0.5}
Current accuracy us  97.44897959183673  with hyperparameters

In [123]:
print('Best validation accuracy for other kernels SVM is ', best_accuracy_other_kernels_svm, ' with paramters : ', best_hyperparameters_other_kernels_svm)

Best validation accuracy for other kernels SVM is  97.44897959183673  with paramters :  {'kernel': 'sigmoid', 'gamma': 'scale', 'C parameter': 0.5}


We can see that linear and sigmoid kernels achieve best validation accuracy of 97.44

2) Report the test-set accuracy.

In [124]:
#Testing on best hyperparameters with linear
linear_model = LinearSVC(penalty='l2', C=0.5)
linear_model.fit(train_data, train_target)
print('Test accuracy with best linear svm is ', linear_model.score(test_data, test_target)*100)

Test accuracy with best linear svm is  98.47715736040608


In [125]:
#Testing on best hyperparameters with sigmoid kernel
sigmoid_kernel_model = SVC(kernel='sigmoid', gamma='scale', C=0.5)
sigmoid_kernel_model.fit(train_data, train_target)
print('Test accuracy with best sigmoid kernel svm is ', sigmoid_kernel_model.score(test_data, test_target)*100)

Test accuracy with best sigmoid kernel svm is  98.47715736040608


Finally we can see sigmoid and linear kernels give similar validation and test accuracy