NOTE: A lot of this code is taken from the lab session on naive Bayes.

In [10]:
import pandas as pd
import numpy as np
from scipy import sparse
from collections import Counter

In [11]:
# Characters to strip from the abstracts, written so it can be dropped straight
# into a regex character class: "-", '"' and "]" are backslash-escaped.
# A raw string keeps exactly the same characters as before while avoiding the
# invalid "\-" and "\]" string escapes, which newer Python versions warn about.
punc_list = r"!\-\"#$%&'(*+,./:;<=>?@[\]^_`{|}~)"

In [12]:
# Load the training and test splits from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [13]:
# Remove punctuation and newlines, lower-case the text, then blank out any
# remaining backslashes (the character class escapes with "\" but does not
# itself match a backslash).
# `regex` is passed explicitly: the default of Series.str.replace differs
# between pandas versions, and with regex=False the character class below would
# be searched for as a literal string and nothing would be removed.
punct_pattern = "["+punc_list+"\\n"+ "]"
for frame in (train, test):
    frame['Abstract'] = (
        frame['Abstract']
        .str.replace(punct_pattern, ' ', regex=True)
        .str.lower()
        .str.replace('\\', ' ', regex=False)
    )

In [14]:
# Build the vocabulary: every distinct whitespace-separated token that occurs
# anywhere in the training abstracts.
all_train_text = " ".join(train['Abstract'])
distinct_tokens = set(all_train_text.split())
unique_words = pd.Series(list(distinct_tokens))

In [15]:
# Keep only words of two or more characters and renumber them from 0 so the
# Series position can be used directly as a matrix column index.
unique_words = unique_words[unique_words.str.len() > 1].reset_index(drop=True)

In [17]:
# Allocate an empty (abstracts x vocabulary) count matrix for the training set.
# LIL format is used because the matrix is filled one entry at a time.
nrows_train, ncols = len(train), len(unique_words)
train_shape = (nrows_train, ncols)
sparse_train = sparse.lil_matrix(train_shape, dtype=np.int16)

In [18]:
# Fill up the matrix by using a Counter to count the word occurrences.
# Note that this will be turned into 0/1 later (implicitly).
# Each vocabulary word is mapped to its column once up front, so every abstract
# only touches the words it actually contains instead of scanning the whole
# vocabulary per row.
train_word_to_col = {word: j for j, word in enumerate(unique_words)}
for i, row in enumerate(train['Abstract'].str.split().values):
    if i%100==0:
        print(str(i)+"/"+str(nrows_train))
    dic = Counter(row)
    for word, count in dic.items():
        j = train_word_to_col.get(word)
        # Tokens outside the vocabulary (e.g. one-letter words) have no column.
        if j is not None:
            sparse_train[i, j] = count



In [19]:
# Allocate the count matrix for the test set; it reuses the training vocabulary
# so its columns line up with the training matrix.
nrows_test = len(test)
test_shape = (nrows_test, ncols)
sparse_test = sparse.lil_matrix(test_shape, dtype=np.int16)

In [20]:
# Fill the test matrix with per-abstract word counts over the training
# vocabulary. Each vocabulary word is mapped to its column once up front, so
# every abstract only touches the words it actually contains instead of
# scanning the whole vocabulary per row.
test_word_to_col = {word: j for j, word in enumerate(unique_words)}
for i, row in enumerate(test['Abstract'].str.split().values):
    if i%500==0:
        print(str(i)+"/"+str(nrows_test))
    dic = Counter(row)
    for word, count in dic.items():
        j = test_word_to_col.get(word)
        # Words never seen in training (or dropped from the vocabulary) have
        # no column and are ignored.
        if j is not None:
            sparse_test[i, j] = count



In [21]:
# Densify the count matrices for the NumPy-based model code below.
# toarray() yields the same int16 values directly, without first building a
# DataFrame holding one sparse column per vocabulary word.
X_train = sparse_train.toarray()
y_train = train["Category"].values
X_test = sparse_test.toarray()

In [22]:
class BernoulliMaxLikelihood:
    """Per-feature Bernoulli model with add-one smoothing.

    Each column of the data is treated as a binary "absent / present" feature:
    a value of 0 means absent and any value >= 1 means present.
    """

    def __init__(self, n_dims):
        # Stored for reference only; no method of this class reads it.
        self.n_dims = n_dims

    def train(self, train_data):
        """Estimate the per-column absence/presence probabilities.

        Sets ``self.p0`` (probability a feature is 0) and ``self.p1``
        (probability a feature is >= 1), one entry per column. Counts get +1
        and the sample count gets +2, so no probability is exactly 0 or 1.
        """
        smoothed_total = len(train_data) + 2
        absent_counts = (train_data == 0).sum(axis=0)
        present_counts = (train_data >= 1).sum(axis=0)
        self.p0 = (absent_counts + 1) / smoothed_total
        self.p1 = (present_counts + 1) / smoothed_total

    def loglikelihood(self, test_data):
        """Return one log-probability per row of ``test_data`` under the model.

        Counts are first clipped to the range [0, 1] so that only presence or
        absence of a feature matters.
        """
        present = np.clip(test_data, 0, 1)
        absent = 1 - present
        present_term = np.sum(present * np.log(self.p1), axis=1)
        absent_term = np.sum(absent * np.log(self.p0), axis=1)
        return present_term + absent_term

In [23]:
class BayesClassifier:
    """Combine one likelihood model per class with the class priors.

    ``maximum_likelihood_models[i]`` must expose ``loglikelihood(data)``
    returning one value per row, and ``priors[i]`` is the prior probability of
    the same class ``i``.
    """

    def __init__(self, maximum_likelihood_models, priors):
        self.maximum_likelihood_models = maximum_likelihood_models
        self.priors = priors
        # A mismatch is only reported, not raised, to keep the original
        # best-effort behaviour.
        if len(self.maximum_likelihood_models) != len(self.priors):
            print('The number of ML models must be equal to the number of priors!')
        self.n_classes = len(self.maximum_likelihood_models)

    def loglikelihood(self, test_data):
        """Return an (n_examples, n_classes) matrix of class scores.

        Column ``i`` holds the log-likelihood of every example under model
        ``i`` plus the log of prior ``i``.
        """
        log_pred = np.zeros((test_data.shape[0], self.n_classes))

        for i in range(self.n_classes):
            # Progress indicator; the total is the real class count rather
            # than a hard-coded number.
            print(str(i)+"/"+str(self.n_classes))
            result = self.maximum_likelihood_models[i].loglikelihood(test_data)
            log_pred[:, i] = result + np.log(self.priors[i])

        return log_pred

In [24]:
# Fit one Bernoulli model per category, in the order the categories first
# appear in the training labels.
dim = X_train.shape[1]
models = []
total_num = []
train_labels = train["Category"]
for category in train_labels.unique():
    # Rows of the training matrix that belong to this category.
    category_rows = X_train[train_labels == category]
    category_model = BernoulliMaxLikelihood(dim)
    category_model.train(category_rows)
    models.append(category_model)
    

In [25]:
#getting the priors
model_ml = models
total_num = len(train)
# The per-class models and the label array used for prediction follow
# train["Category"].unique() order (first appearance), whereas value_counts()
# is sorted by descending frequency. Reindex the counts so that priors[i]
# belongs to the same category as models[i].
category_order = train["Category"].unique()
priors = train["Category"].value_counts().reindex(category_order).values/total_num

In [26]:
# Build the Bayes classifier from the per-class models and their priors.
classifier = BayesClassifier(maximum_likelihood_models=model_ml, priors=priors)

In [27]:
# Not used by the submission pipeline; kept as a debugging helper.
def get_accuracy(data, labels, values):
    """Return the fraction of rows in ``data`` whose predicted label matches.

    ``values`` is the array of class labels, indexed by the column of the
    classifier's score matrix; ``labels`` holds the true label of every row.
    Uses the module-level ``classifier`` and prints the score matrix and the
    predicted labels along the way.
    """
    scores = classifier.loglikelihood(data)
    print(scores)
    # The best-scoring column of each row selects the predicted label.
    best_columns = scores.argmax(1)
    predicted = values[best_columns]
    print(predicted)
    return np.mean(predicted == labels)

In [28]:
def get_predictions(data, values):
    """Return the predicted label for every row of ``data``.

    ``values`` is the array of class labels, indexed by the column of the
    score matrix produced by the module-level ``classifier``.
    """
    best_columns = classifier.loglikelihood(data).argmax(1)
    return values[best_columns]

In [30]:
# Predict a category for every test abstract; the label array is in the same
# first-appearance order as the training categories.
label_names = train["Category"].unique()
test_preds = get_predictions(X_test, label_names)



In [31]:
# Write the submission file: the test Id column plus the predicted category.
results = test[["Id"]].assign(Category=test_preds)
results.to_csv("naive_to_submit2.csv", index=False)

Unnamed: 0,Id,Category
0,0,stat.ML
1,1,astro-ph.SR
2,2,astro-ph.SR
3,3,math.AP
4,4,cs.LG
...,...,...
14995,14995,astro-ph
14996,14996,physics.optics
14997,14997,astro-ph
14998,14998,gr-qc
