In [1]:
from sklearn import datasets

# Training split of 20 Newsgroups, requested with headers, footers and quoted
# replies removed from every post.
training20 = datasets.fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

In [2]:
# Convert the raw training text into a document-term count matrix.
import sklearn
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.feature_extraction.text` has been loaded, so the attribute
# access below would otherwise depend on some other import's side effect.
import sklearn.feature_extraction.text

count_vect = sklearn.feature_extraction.text.CountVectorizer()
X_train_counts = count_vect.fit_transform(training20.data)
X_train_counts.shape

(11314, 101631)

TF-IDF

In [3]:
# `scipy.sparse.lil` is a deprecated private module path; `lil_matrix` is
# exported from the public `scipy.sparse` namespace.
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the training counts and apply it in one step.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [4]:
import numpy as np

# Held-out 20 Newsgroups split, requested with headers, footers and quoted
# replies removed.
twenty_test = datasets.fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)

# Only `transform` is called here: the already-fitted vectorizer and TF-IDF
# transformer are reused, nothing is re-fitted on the test documents.
X_new_counts = count_vect.transform(twenty_test.data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)


In [5]:
import numpy as np
import itertools
%matplotlib inline
import matplotlib.pyplot as plt
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(1234)


class MultinomialNaiveBayes:
    """Multinomial naive Bayes classifier with add-one (Laplace) smoothing.

    After ``fit`` the instance exposes two attributes:

    * ``prior`` -- dict mapping each class label to its empirical prior
      (fraction of training rows carrying that label).
    * ``hm`` -- dict mapping each class label to a 1-D array of smoothed
      feature probabilities, so ``hm[label][k]`` is the estimated probability
      of feature ``k`` given ``label``. Each array sums to 1.
    """

    def __init__(self):
        # Both tables are filled in by fit(); they are empty until then.
        self.hm = {}
        self.prior = {}

    def fit(self, x, y):
        """Estimate class priors and per-class feature probabilities.

        Parameters
        ----------
        x : 2-D array-like or scipy sparse matrix, one row per document and
            one column per feature, holding non-negative feature weights.
        y : 1-D array-like with one class label per row of ``x``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` does not have exactly one label per row of ``x``.
        """
        # Sparse inputs are normalised to CSR so that row selection works;
        # anything else is treated as a dense array.
        if hasattr(x, "tocsr"):
            x = x.tocsr()
        else:
            x = np.asarray(x)
        labels = np.asarray(y)

        n_docs = x.shape[0]
        if labels.shape[0] != n_docs:
            raise ValueError(
                "x has %d rows but y has %d labels" % (n_docs, labels.shape[0])
            )

        classes, class_counts = np.unique(labels, return_counts=True)
        prior = {}
        hm = {}
        for label, n_in_class in zip(classes, class_counts):
            prior[label] = n_in_class / n_docs

            # Total weight of every feature over the documents of this class.
            rows = x[np.flatnonzero(labels == label)]
            totals = np.asarray(rows.sum(axis=0), dtype=float).ravel()

            # Add-one smoothing; dividing by the smoothed total makes the
            # per-class probabilities sum to one.
            smoothed = totals + 1.0
            hm[label] = smoothed / smoothed.sum()

        self.hm = hm
        self.prior = prior
        return self

def logsumexp(Z):
    """Numerically stable log-sum-exp over the first axis of ``Z``.

    ``Z`` is laid out as C x N. The per-column maximum is subtracted before
    exponentiating and added back afterwards; the result keeps a leading
    singleton axis (1 x N for a C x N input).
    """
    col_max = np.max(Z, axis=0)[None, :]            # max over C, as 1 x N
    shifted_sum = np.exp(Z - col_max).sum(axis=0)   # sum over C
    return col_max + np.log(shifted_sum)

def _feature_probabilities(table, n_features):
    """Return the first ``n_features`` probabilities of one class as a 1-D array.

    ``table`` may be a dict keyed by feature index or any integer-indexable
    sequence / array.
    """
    if isinstance(table, dict):
        return np.array([table[k] for k in range(n_features)], dtype=float)
    return np.asarray(table, dtype=float)[:n_features]


def predict(self, xt):
    """Predict a class label for every row of ``xt``.

    For each class ``c`` the score of a row ``x`` is

        log prior[c] + sum_k x[k] * log hm[c][k]

    i.e. each feature's log-probability is weighted by the feature's value,
    and the class with the highest score is returned. Ties go to the class
    that comes first when iterating ``self.hm``.

    Parameters
    ----------
    xt : 2-D array-like or scipy sparse matrix, one row per sample, with the
        same feature columns that were used to build ``self.hm``.

    Returns
    -------
    numpy.ndarray
        1-D array with one predicted label per row of ``xt``.

    Raises
    ------
    ValueError
        If the model holds no classes (``self.hm`` is empty).
    """
    # Sparse inputs stay sparse (no densifying of large matrices); everything
    # else is treated as a dense float array.
    if hasattr(xt, "tocsr"):
        xt = xt.tocsr()
    else:
        xt = np.asarray(xt, dtype=float)
    n_features = xt.shape[1]

    labels = list(self.hm)
    if not labels:
        raise ValueError("predict() needs a fitted model: self.hm is empty")

    log_prior = np.array([np.log(self.prior[c]) for c in labels], dtype=float)
    # C x F table of per-class feature log-probabilities.
    log_likelihood = np.log(
        np.vstack([_feature_probabilities(self.hm[c], n_features) for c in labels])
    )

    # N x C score matrix; the matrix product works for dense and sparse xt.
    scores = np.asarray(xt @ log_likelihood.T) + log_prior
    winners = np.argmax(scores, axis=1)  # first maximum wins on ties
    return np.asarray(labels)[winners]

# Attach the module-level `predict` function to the class so that instances
# can call it as a method.
setattr(MultinomialNaiveBayes, "predict", predict)

Softmax regression on the 20 Newsgroups dataset with different hyperparameters

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Estimator whose hyper-parameters are tuned below.
softmax = LogisticRegression()

# Search space: every solver is paired with every value of C.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'solver': solvers, 'C': c_values}

# One repetition of stratified 2-fold cross-validation, scored by accuracy.
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
grid_search = GridSearchCV(
    estimator=softmax,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(X_train_tfidf, training20.target)

# Report the best configuration, then mean (std) accuracy per candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))




Best: 0.722821 using {'C': 100, 'solver': 'liblinear'}
0.716369 (0.002033) with: {'C': 100, 'solver': 'newton-cg'}
0.715927 (0.002298) with: {'C': 100, 'solver': 'lbfgs'}
0.722821 (0.004243) with: {'C': 100, 'solver': 'liblinear'}
0.714867 (0.000354) with: {'C': 10, 'solver': 'newton-cg'}
0.714955 (0.000088) with: {'C': 10, 'solver': 'lbfgs'}
0.720170 (0.000177) with: {'C': 10, 'solver': 'liblinear'}
0.692505 (0.000972) with: {'C': 1.0, 'solver': 'newton-cg'}
0.692593 (0.001061) with: {'C': 1.0, 'solver': 'lbfgs'}
0.691621 (0.001679) with: {'C': 1.0, 'solver': 'liblinear'}
0.565229 (0.003977) with: {'C': 0.1, 'solver': 'newton-cg'}
0.565229 (0.003977) with: {'C': 0.1, 'solver': 'lbfgs'}
0.565141 (0.004950) with: {'C': 0.1, 'solver': 'liblinear'}
0.429291 (0.003977) with: {'C': 0.01, 'solver': 'newton-cg'}
0.429203 (0.004066) with: {'C': 0.01, 'solver': 'lbfgs'}
0.448824 (0.003005) with: {'C': 0.01, 'solver': 'liblinear'}


Sentiment140 dataset

In [9]:
# Import the Sentiment140 training set into pandas.
import pandas as pd
import csv
col = ["sentiment", "tweetID", "date", "query", "user", "text"]
# `error_bad_lines=False` is deprecated since pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported way to drop malformed rows.
df = pd.read_csv("/content/training.1600000.processed.noemoticon.csv",
                 encoding='latin-1', engine='python', on_bad_lines='skip', names=col)
from sklearn.utils import shuffle
# Shuffle with a fixed seed before taking the first 10000 rows below.
df = shuffle(df, random_state=0)
sentiment_df = df["sentiment"]
text_df = df["text"]
sentiment = sentiment_df[:10000].to_numpy()
text = text_df[:10000].to_numpy()

# Convert text to feature vectors.
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer

# Unigram counts over the sampled tweets.
count_vect2 = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
X_train_counts2 = count_vect2.fit_transform(text)

# TF-IDF weighting fitted on the training counts.
tfidf_transformer2 = TfidfTransformer()
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)
X_train_tfidf2

# Import the Sentiment140 TEST set into pandas.
import pandas as pd
import csv
col = ["sentiment", "tweetID", "date", "query", "user", "text"]
# `error_bad_lines=False` is deprecated since pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported way to drop malformed rows.
df2 = pd.read_csv("/content/new_testdata.manual.2009.06.14.csv",
                  encoding='latin-1', engine='python', on_bad_lines='skip', names=col)

# NOTE(review): this file may contain sentiment labels that never occur in the
# training sample, which would cap the achievable test accuracy -- confirm.
test_text = df2["text"]
test_sentiment = df2["sentiment"]

t_text = test_text.to_numpy()
t_sentiment = test_sentiment.to_numpy()

import numpy as np
# Only `transform` is called: the already-fitted vectorizer and TF-IDF
# transformer are reused for the test tweets.
X_new_counts2 = count_vect2.transform(t_text)
X_new_tfidf2 = tfidf_transformer2.transform(X_new_counts2)



  exec(code_obj, self.user_global_ns, self.user_ns)


  exec(code_obj, self.user_global_ns, self.user_ns)


Softmax regression on the Sentiment140 dataset

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Baseline: default-parameter logistic regression evaluated on the test set.
softmax2 = LogisticRegression()
softmax2.fit(X_train_tfidf2, sentiment)
p = softmax2.predict(X_new_tfidf2)
# A bare expression in the middle of a cell is never displayed, so the
# accuracy is stored and printed explicitly.
test_accuracy = np.mean(p == t_sentiment)
print("Test accuracy: %f" % test_accuracy)

# Search space: every solver is paired with every value of C.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers, C=c_values)

# One repetition of stratified 2-fold cross-validation, scored by accuracy.
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=0)
grid_search = GridSearchCV(estimator=softmax2, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train_tfidf2, sentiment)

# Report the best configuration, then mean (std) accuracy per candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.740000 using {'C': 1.0, 'solver': 'newton-cg'}
0.720200 (0.002800) with: {'C': 100, 'solver': 'newton-cg'}
0.720100 (0.002700) with: {'C': 100, 'solver': 'lbfgs'}
0.720200 (0.002800) with: {'C': 100, 'solver': 'liblinear'}
0.733900 (0.000300) with: {'C': 10, 'solver': 'newton-cg'}
0.733800 (0.000400) with: {'C': 10, 'solver': 'lbfgs'}
0.733800 (0.000400) with: {'C': 10, 'solver': 'liblinear'}
0.740000 (0.001600) with: {'C': 1.0, 'solver': 'newton-cg'}
0.739900 (0.001500) with: {'C': 1.0, 'solver': 'lbfgs'}
0.739800 (0.001200) with: {'C': 1.0, 'solver': 'liblinear'}
0.714000 (0.004000) with: {'C': 0.1, 'solver': 'newton-cg'}
0.714000 (0.004000) with: {'C': 0.1, 'solver': 'lbfgs'}
0.714400 (0.004400) with: {'C': 0.1, 'solver': 'liblinear'}
0.665700 (0.004300) with: {'C': 0.01, 'solver': 'newton-cg'}
0.665700 (0.004300) with: {'C': 0.01, 'solver': 'lbfgs'}
0.675600 (0.004000) with: {'C': 0.01, 'solver': 'liblinear'}
