In [1]:
import re
from glob import glob

import pandas as pd
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
file_paths = glob('data/20_newsgroups_dataset/**/*')
len(file_paths)

18828

In [3]:
data_list, label_list = [], []
for path in file_paths:
    try:
        with open(path, 'r') as file:
            data = file.readlines()
            data = ''.join(data)
            data_list.append(data)
            label = path.split('/')[-2]
            label_list.append(label)
    except:
        print(path)

data/20_newsgroups_dataset/rec.autos/103700
data/20_newsgroups_dataset/rec.autos/103725
data/20_newsgroups_dataset/rec.autos/103694
data/20_newsgroups_dataset/rec.autos/101596
data/20_newsgroups_dataset/comp.sys.mac.hardware/51892
data/20_newsgroups_dataset/comp.sys.mac.hardware/52165
data/20_newsgroups_dataset/comp.sys.mac.hardware/51941
data/20_newsgroups_dataset/comp.sys.mac.hardware/52196
data/20_newsgroups_dataset/comp.sys.mac.hardware/52164
data/20_newsgroups_dataset/comp.sys.mac.hardware/51593
data/20_newsgroups_dataset/comp.sys.mac.hardware/51594
data/20_newsgroups_dataset/comp.sys.mac.hardware/51592
data/20_newsgroups_dataset/comp.sys.mac.hardware/52033
data/20_newsgroups_dataset/comp.sys.mac.hardware/51865
data/20_newsgroups_dataset/comp.sys.mac.hardware/51916
data/20_newsgroups_dataset/comp.sys.mac.hardware/51917
data/20_newsgroups_dataset/comp.sys.mac.hardware/50467
data/20_newsgroups_dataset/comp.sys.mac.hardware/51904
data/20_newsgroups_dataset/comp.sys.mac.hardware/51591

In [4]:
len(data_list), len(label_list)

(18756, 18756)

In [5]:
df = pd.DataFrame(data={
    'data': data_list,
    'label': label_list
})
df

Unnamed: 0,data,label
0,From: hm@cs.brown.edu (Harry Mamaysky)\nSubjec...,talk.politics.mideast
1,From: waldo@cybernet.cse.fau.edu (Todd J. Dick...,talk.politics.mideast
2,From: C.L.Gannon@newcastle.ac.uk (Space Cadet)...,talk.politics.mideast
3,From: shaig@Think.COM (Shai Guday)\nSubject: B...,talk.politics.mideast
4,From: ez000281@hamlet.ucdavis.edu ()\nSubject:...,talk.politics.mideast
...,...,...
18751,From: porta@wam.umd.edu (David Palmer)\nSubjec...,talk.religion.misc
18752,From: decay@cbnewsj.cb.att.com (dean.kaflowitz...,talk.religion.misc
18753,From: ekr@kyle.eitech.com (Eric Rescorla)\nSub...,talk.religion.misc
18754,"From: ""David R. Sacco"" <dsav+@andrew.cmu.edu>\...",talk.religion.misc


In [6]:
def decontact(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [7]:
def process_string(stringg):
    stringg = decontact(stringg)

    # REPLACING NEW LINES BY 'WHITE SPACE'
    stringg = re.sub(r'\n', " ", stringg)
    # REPLACING NUMBERS
    stringg = re.sub(r'\d+(\.\d+)?', 'numbers', stringg)
    # REPLACING EMAIL IDs BY 'MAILID'
    stringg = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$','MailID', stringg)
    # REPLACING URLs  BY 'Links'
    stringg = re.sub(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','Links', stringg)
    # REPLACING CURRENCY SIGNS BY 'MONEY'
    stringg = re.sub(r'£|\$', 'Money', stringg)
    # REPLACING LARGE WHITE SPACE BY SINGLE WHITE SPACE
    stringg = re.sub(r'\s+', ' ', stringg)
    # REPLACING LEADING AND TRAILING WHITE SPACE BY SINGLE WHITE SPACE
    stringg = re.sub(r'^\s+|\s+?$', '', stringg)
    # REPLACING CONTACT NUMBERS
    stringg = re.sub(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','contact number', stringg)
    # REPLACING SPECIAL CHARACTERS  BY WHITE SPACE 
    stringg = re.sub(r"[^a-zA-Z0-9]+", " ", stringg)

    return stringg

In [8]:
df['data'] = df['data'].apply(process_string)
df

Unnamed: 0,data,label
0,From hm cs brown edu Harry Mamaysky Subject He...,talk.politics.mideast
1,From waldo cybernet cse fau edu Todd J Dicker ...,talk.politics.mideast
2,From C L Gannon newcastle ac uk Space Cadet Su...,talk.politics.mideast
3,From shaig Think COM Shai Guday Subject Basil ...,talk.politics.mideast
4,From eznumbers hamlet ucdavis edu Subject Re T...,talk.politics.mideast
...,...,...
18751,From porta wam umd edu David Palmer Subject Re...,talk.religion.misc
18752,From decay cbnewsj cb att com dean kaflowitz S...,talk.religion.misc
18753,From ekr kyle eitech com Eric Rescorla Subject...,talk.religion.misc
18754,From David R Sacco dsav andrew cmu edu Subject...,talk.religion.misc


In [9]:
# removing stopwords 
stop = stopwords.words('english')
df['clean_data'] = df['data'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['length'] = df['clean_data'].apply(len)
df

Unnamed: 0,data,label,clean_data,length
0,From hm cs brown edu Harry Mamaysky Subject He...,talk.politics.mideast,From hm cs brown edu Harry Mamaysky Subject He...,948
1,From waldo cybernet cse fau edu Todd J Dicker ...,talk.politics.mideast,From waldo cybernet cse fau edu Todd J Dicker ...,776
2,From C L Gannon newcastle ac uk Space Cadet Su...,talk.politics.mideast,From C L Gannon newcastle ac uk Space Cadet Su...,508
3,From shaig Think COM Shai Guday Subject Basil ...,talk.politics.mideast,From shaig Think COM Shai Guday Subject Basil ...,1289
4,From eznumbers hamlet ucdavis edu Subject Re T...,talk.politics.mideast,From eznumbers hamlet ucdavis edu Subject Re T...,862
...,...,...,...,...
18751,From porta wam umd edu David Palmer Subject Re...,talk.religion.misc,From porta wam umd edu David Palmer Subject Re...,774
18752,From decay cbnewsj cb att com dean kaflowitz S...,talk.religion.misc,From decay cbnewsj cb att com dean kaflowitz S...,981
18753,From ekr kyle eitech com Eric Rescorla Subject...,talk.religion.misc,From ekr kyle eitech com Eric Rescorla Subject...,1200
18754,From David R Sacco dsav andrew cmu edu Subject...,talk.religion.misc,From David R Sacco dsav andrew cmu edu Subject...,411


In [10]:
label_unique_list = list(df.label.unique())
label_unique_list = sorted(label_unique_list)
label_unique_list

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [11]:
def create_label_idx(label):
    return label_unique_list.index(label)

In [12]:
df['label_idx'] = df['label'].apply(create_label_idx)
df

Unnamed: 0,data,label,clean_data,length,label_idx
0,From hm cs brown edu Harry Mamaysky Subject He...,talk.politics.mideast,From hm cs brown edu Harry Mamaysky Subject He...,948,17
1,From waldo cybernet cse fau edu Todd J Dicker ...,talk.politics.mideast,From waldo cybernet cse fau edu Todd J Dicker ...,776,17
2,From C L Gannon newcastle ac uk Space Cadet Su...,talk.politics.mideast,From C L Gannon newcastle ac uk Space Cadet Su...,508,17
3,From shaig Think COM Shai Guday Subject Basil ...,talk.politics.mideast,From shaig Think COM Shai Guday Subject Basil ...,1289,17
4,From eznumbers hamlet ucdavis edu Subject Re T...,talk.politics.mideast,From eznumbers hamlet ucdavis edu Subject Re T...,862,17
...,...,...,...,...,...
18751,From porta wam umd edu David Palmer Subject Re...,talk.religion.misc,From porta wam umd edu David Palmer Subject Re...,774,19
18752,From decay cbnewsj cb att com dean kaflowitz S...,talk.religion.misc,From decay cbnewsj cb att com dean kaflowitz S...,981,19
18753,From ekr kyle eitech com Eric Rescorla Subject...,talk.religion.misc,From ekr kyle eitech com Eric Rescorla Subject...,1200,19
18754,From David R Sacco dsav andrew cmu edu Subject...,talk.religion.misc,From David R Sacco dsav andrew cmu edu Subject...,411,19


In [13]:
X = df.clean_data
y = df.label_idx

X.shape, y.shape

((18756,), (18756,))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1511)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15004,), (3752,), (15004,), (3752,))

In [15]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer

In [16]:
naive_bayes_model = MultinomialNB()
naive_bayes_model

In [17]:
model_pipeline = Pipeline([('vectorizer', tfidf_vectorizer), ('classifier', naive_bayes_model)])
model_pipeline

In [18]:
model_pipeline.fit(X_train, y_train)

In [19]:
y_pred = model_pipeline.predict(X_test)
y_pred

array([ 1,  5, 10, ..., 10,  5, 13])

In [20]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89       157
           1       0.78      0.85      0.82       177
           2       0.87      0.84      0.86       200
           3       0.89      0.76      0.82       231
           4       0.85      0.94      0.89       175
           5       0.85      0.95      0.90       168
           6       0.80      0.94      0.86       180
           7       0.91      0.93      0.92       202
           8       0.98      0.90      0.94       199
           9       0.98      0.98      0.98       219
          10       0.98      0.96      0.97       231
          11       0.98      0.84      0.91       214
          12       0.81      0.87      0.84       175
          13       0.94      0.96      0.95       162
          14       0.96      0.92      0.94       215
          15       1.00      0.76      0.86       299
          16       0.98      0.80      0.88       212
          17       0.99    