In [49]:
# Root directory of the 20 Newsgroups corpus that load_files reads from.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
data ="C:/Users/kunal/OneDrive/Desktop/20_newsgroups"

In [50]:
from sklearn.datasets import load_files
# Load the corpus from `data` as UTF-8 text; decode_error="replace" substitutes
# undecodable bytes instead of raising. The result's .data and .target are used for the split.
content = load_files(data, encoding="utf-8", decode_error="replace")

In [6]:
import numpy as np
from sklearn import model_selection

In [8]:
#split of content data and target fields into train and test sets
#random_state is fixed so the split (and every result derived from it) is
#reproducible after a kernel restart
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    content.data, content.target, random_state=42
)

In [9]:
#sanity check: number of test documents and number of training documents
len(X_test), len(X_train)

(5000, 14997)

In [10]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import re

In [15]:
#this function returns the most frequent words (top 500 by default) which will be chosen as features of the dataset
def Features(X_train, n_features=500):
    """Select the most frequent words of the training documents as features.

    Every document is cleaned by replacing digits, runs of non-alphanumeric
    characters and words of one to three characters with spaces. It is then
    tokenized with ``word_tokenize`` and filtered against NLTK's English
    stop-word list (the comparison is done on the lower-cased token). Tokens
    are counted in their original case, so "News" and "news" are separate
    candidates.

    Parameters
    ----------
    X_train : iterable of str
        Raw training documents.
    n_features : int, default 500
        Maximum number of words to return.

    Returns
    -------
    list of str
        At most ``n_features`` words, ordered by descending frequency (ties
        broken by descending word). The list is shorter when the corpus
        contains fewer distinct words, and empty for an empty corpus.

    Side effects
    ------------
    Prints the ``(count, word)`` pair of the most frequent word, if any.
    """
    stop_words = set(stopwords.words('english'))

    #dictionary mapping word -> number of occurrences over all documents
    feature_set = {}
    for doc in X_train:
        #remove all numbers, special symbols and very short words from each document
        removedNumber = re.sub(r'[0-9]+', ' ', doc)
        cleanString = re.sub(r"[^a-zA-Z0-9]+", ' ', removedNumber)
        clean = re.sub(r'\b\w{1,3}\b', ' ', cleanString)

        #tokenize the document and count every token that is not a stop word
        for word in word_tokenize(clean):
            if word.lower() not in stop_words:
                feature_set[word] = feature_set.get(word, 0) + 1

    #sort (count, word) pairs in reverse order to get most frequent words first
    data = sorted(((count, word) for word, count in feature_set.items()), reverse=True)
    if data:
        print(data[0])

    #slicing never raises, even when fewer than n_features distinct words exist
    return [word for _, word in data[:n_features]]
        

In [19]:
#This function converts dataset into a 2-D array with columns as features and rows corresponding to each document

def DataSet(data, features):
    """Build the document/feature count matrix.

    Each document goes through the cleaning applied here (digits, runs of
    non-alphanumeric characters and words of one to three characters are
    replaced by spaces), is tokenized with ``word_tokenize`` and has English
    stop words removed. Every remaining token that exactly matches a feature
    word increments that feature's column in the document's row.

    Parameters
    ----------
    data : sequence of str
        Raw documents; must support ``len``.
    features : sequence of str
        Feature words; position in this sequence is the column index.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), len(features))`` holding, per
        document, how many times each feature word occurs.
    """
    stop_words = set(stopwords.words('english'))

    #map every feature word to its column once, so each token costs one dict
    #lookup instead of a scan over all features; setdefault keeps the first
    #column if a word is listed more than once
    column_of = {}
    for j, feature_word in enumerate(features):
        column_of.setdefault(feature_word, j)

    #define a numpy array as data_x
    data_x = np.zeros((len(data), len(features)))

    #we will go through each document
    for i, document in enumerate(data):
        #remove all numbers, special symbols and very short words from each document
        removedNumber = re.sub(r'[0-9]+', ' ', document)
        str1 = re.sub(r"[^a-zA-Z0-9]+", ' ', removedNumber)
        str2 = re.sub(r'\b\w{1,3}\b', ' ', str1)

        #for each non-stop-word token, bump the matching feature column (if any)
        for word in word_tokenize(str2):
            if word.lower() in stop_words:
                continue
            j = column_of.get(word)
            if j is not None:
                data_x[i][j] += 1

    return data_x

In [35]:
#fit function for obtaining the sum of frequencies of all the words
#in each class of Y_train
def fit(X1, Y_train, feature):
    """Collect the per-class counts used by the Naive Bayes scorer.

    Parameters
    ----------
    X1 : array-like, shape (n_documents, n_features)
        Word-count matrix, one row per training document.
    Y_train : array-like, shape (n_documents,)
        Class label of every row of ``X1`` (list or ndarray).
    feature : sequence
        Feature list; only its length is used.

    Returns
    -------
    dict
        ``count["total_count"]`` is the number of training documents. For
        every distinct class ``c``, ``count[c]`` is a dict with
        ``"total_count1"`` (documents in the class), ``"total_data"`` (sum of
        all counts in the class) and keys ``1 .. len(feature)`` holding the
        summed count of each feature (1-based) within the class.
    """
    #accept plain lists as well: boolean masking below needs ndarrays
    X1 = np.asarray(X1)
    Y_train = np.asarray(Y_train)

    #count is dictionary for all values of unique_class; the overall document
    #count does not depend on the class, so it is stored once up front
    count = {"total_count": len(Y_train)}

    for unique_class in set(Y_train):
        current_class_rows = (Y_train == unique_class)
        X_train_current = X1[current_class_rows]

        #one pass over the class rows gives the total of every feature column
        column_sums = X_train_current.sum(axis=0)

        class_counts = {
            #number of training documents belonging to this class
            "total_count1": int(np.count_nonzero(current_class_rows)),
            #sum of every word count in this class
            "total_data": X_train_current.sum(),
        }
        for i in range(1, len(feature) + 1):
            class_counts[i] = column_sums[i - 1]
        count[unique_class] = class_counts

    return count

In [39]:
#probability function to return the log-probability score of x for one class
def probability(dictionary, x, current_class):
    """Return the Naive Bayes log score of document ``x`` for ``current_class``.

    The score starts at the log prior
    ``log(documents in class) - log(all documents)``. For every feature whose
    count in ``x`` is non-zero, the Laplace-smoothed log likelihood
    ``log(class count of feature + 1) - log(class total + number of features)``
    is added once. The size of the count in ``x`` is not used as a weight;
    only presence matters.

    Parameters
    ----------
    dictionary : dict
        Needs ``dictionary["total_count"]`` and, under ``current_class``, a
        dict with ``"total_count1"``, ``"total_data"`` and feature keys
        ``1 .. n``.
    x : sequence of numbers
        Feature counts of one document; ``x[j-1]`` belongs to feature ``j``.
    current_class : hashable
        Class key to score.

    Returns
    -------
    float
        Unnormalised log probability of the class given ``x``.
    """
    class_counts = dictionary[current_class]

    #every key except "total_count1" and "total_data" is a feature index
    number = len(class_counts) - 2

    #inital output value for each class (log prior)
    output = np.log(class_counts["total_count1"]) - np.log(dictionary["total_count"])

    #laplace-corrected denominator is identical for every feature of the class,
    #so it (and its log) is computed once instead of once per feature
    count_current_class = class_counts["total_data"] + len(class_counts) - 2
    log_count_current_class = np.log(count_current_class)

    #for loop over all features
    for j in range(1, number + 1):
        #only features that occur in the document contribute
        if x[j - 1] != 0:
            #laplace correction on the numerator, log probability throughout
            count_current_class_with_value_xj = class_counts[j] + 1
            output = output + (np.log(count_current_class_with_value_xj) - log_count_current_class)
    return output

In [47]:
#predictSinglePoint function that returns the best class for a single row x
def predictSinglePoint(dictionary, x):
    """Return the class key with the highest score for one document row.

    Every key of ``dictionary`` except ``"total_count"`` is treated as a class
    and scored with ``probability``. On equal scores the class seen first is
    kept. Returns ``-1`` when ``dictionary`` holds no class at all.
    """
    winner = -1
    winner_score = None  # None until the first class has been scored
    for label in dictionary:
        #"total_count" is bookkeeping, not a class
        if label == "total_count":
            continue
        score = probability(dictionary, x, label)
        #the first class always wins provisionally; later ones must be strictly better
        if winner_score is None or score > winner_score:
            winner = label
            winner_score = score
    return winner

In [48]:
#predict function that returns the predicted values for X_test
def predict(dictionary, X_test):
    """Classify every row of ``X_test`` with ``predictSinglePoint``.

    Returns a list holding one predicted class per row, in row order.
    """
    return [predictSinglePoint(dictionary, row) for row in X_test]

In [13]:
#feature words selected from the training documents only (X_train)
features = Features(X_train)

(22945, 'news')


In [37]:
#display the selected feature words
features

['news',
 'cantaloupe',
 'Subject',
 'Date',
 'Message',
 'Newsgroups',
 'Path',
 'Lines',
 'Organization',
 'state',
 'would',
 'ohio',
 'writes',
 'comp',
 'References',
 'article',
 'talk',
 'misc',
 'Sender',
 'howland',
 'reston',
 'like',
 'University',
 'people',
 'know',
 'Posting',
 'Host',
 'zaphod',
 'think',
 'politics',
 'rutgers',
 'time',
 'near',
 'harvard',
 'crabapple',
 'also',
 'Xref',
 'good',
 'usenet',
 'could',
 'europa',
 'gtefsd',
 'uunet',
 'religion',
 'windows',
 'even',
 'make',
 'much',
 'world',
 'club',
 'rochester',
 'right',
 'many',
 'Distribution',
 'gatech',
 'want',
 'NNTP',
 'Nntp',
 'magnesium',
 'well',
 'said',
 'udel',
 'first',
 'used',
 'utexas',
 'culture',
 'system',
 'uiuc',
 'News',
 'work',
 'need',
 'anyone',
 'mail',
 'something',
 'andrew',
 'really',
 'sura',
 'believe',
 'problem',
 'hardware',
 'Reply',
 'back',
 'space',
 'christian',
 'years',
 'going',
 'still',
 'netcom',
 'point',
 'find',
 'might',
 'better',
 'nasa',
 'tak

In [23]:
#converting X_train into a 2D numpy array (rows = documents, columns = feature words)
X1_train = DataSet(X_train, features)

In [30]:
#display the training count matrix
X1_train

array([[2., 1., 1., ..., 0., 0., 0.],
       [2., 2., 1., ..., 0., 0., 0.],
       [3., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 2., 1., ..., 0., 0., 0.],
       [1., 2., 1., ..., 0., 0., 0.],
       [4., 2., 1., ..., 0., 0., 1.]])

In [24]:
#converting X_test into a 2D numpy array using the same feature words as the training matrix
X2_test = DataSet(X_test, features)

In [36]:
#call fit function to collect the per-class counts from the training matrix,
#then display the resulting dictionary
dictionary = fit(X1_train,Y_train, features)
dictionary

{0: {'total_count1': 769,
  'total_data': 57712.0,
  1: 1389.0,
  2: 973.0,
  3: 803.0,
  4: 782.0,
  5: 769.0,
  6: 773.0,
  7: 770.0,
  8: 782.0,
  9: 735.0,
  10: 604.0,
  11: 789.0,
  12: 545.0,
  13: 917.0,
  14: 0.0,
  15: 706.0,
  16: 732.0,
  17: 712.0,
  18: 354.0,
  19: 355.0,
  20: 460.0,
  21: 460.0,
  22: 403.0,
  23: 346.0,
  24: 695.0,
  25: 450.0,
  26: 363.0,
  27: 363.0,
  28: 341.0,
  29: 561.0,
  30: 3.0,
  31: 1.0,
  32: 343.0,
  33: 346.0,
  34: 365.0,
  35: 277.0,
  36: 210.0,
  37: 204.0,
  38: 287.0,
  39: 197.0,
  40: 295.0,
  41: 253.0,
  42: 253.0,
  43: 283.0,
  44: 627.0,
  45: 0.0,
  46: 302.0,
  47: 273.0,
  48: 230.0,
  49: 261.0,
  50: 147.0,
  51: 16.0,
  52: 207.0,
  53: 303.0,
  54: 127.0,
  55: 23.0,
  56: 145.0,
  57: 257.0,
  58: 135.0,
  59: 147.0,
  60: 220.0,
  61: 316.0,
  62: 24.0,
  63: 144.0,
  64: 152.0,
  65: 124.0,
  66: 37.0,
  67: 265.0,
  68: 270.0,
  69: 90.0,
  70: 82.0,
  71: 113.0,
  72: 127.0,
  73: 82.0,
  74: 269.0,
  75: 198.

In [43]:
#predict the class of every test document with the hand-written Naive Bayes
#(X2_test is the test count matrix built by DataSet; the name X2 is never defined)
Y_pred1 = predict(dictionary, X2_test)

In [44]:
##classification report using the built-in sklearn MultinomialNB() as a baseline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
clf = MultinomialNB()
#train and evaluate on the matrices built by DataSet (X1_train / X2_test);
#the names X1 and X2 are never defined in this notebook
clf.fit(X1_train, Y_train)
Y_pred = clf.predict(X2_test)
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.76      0.72      0.74       231
           1       0.90      0.68      0.77       266
           2       0.70      0.90      0.79       261
           3       0.62      0.58      0.60       251
           4       0.61      0.73      0.67       253
           5       0.82      0.59      0.68       259
           6       0.76      0.95      0.85       247
           7       0.82      0.89      0.85       255
           8       0.82      0.92      0.87       270
           9       0.95      0.90      0.93       252
          10       0.93      0.97      0.95       249
          11       0.94      0.94      0.94       253
          12       0.79      0.88      0.83       227
          13       0.85      0.58      0.69       245
          14       0.90      0.88      0.89       249
          15       0.98      1.00      0.99       256
          16       0.76      0.92      0.84       256
          17       0.94    

In [46]:
##classification report and confusion matrix for the values predicted by the hand-written naive bayes code
print(classification_report(Y_test,Y_pred1))
print(confusion_matrix(Y_test,Y_pred1))

              precision    recall  f1-score   support

           0       0.68      0.84      0.75       231
           1       0.90      0.60      0.72       266
           2       0.72      0.89      0.80       261
           3       0.61      0.58      0.60       251
           4       0.63      0.75      0.68       253
           5       0.85      0.65      0.74       259
           6       0.75      0.92      0.83       247
           7       0.91      0.91      0.91       255
           8       0.86      0.98      0.91       270
           9       0.97      0.95      0.96       252
          10       0.96      0.99      0.97       249
          11       0.95      0.93      0.94       253
          12       0.73      0.96      0.83       227
          13       0.84      0.62      0.71       245
          14       0.93      0.82      0.87       249
          15       1.00      1.00      1.00       256
          16       0.69      0.95      0.80       256
          17       0.95    