In [182]:
import os
import random
import string
from nltk import word_tokenize
from collections import defaultdict
from nltk import FreqDist
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn import metrics
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [94]:
#define stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('said')
stop_words.add('mr')

In [50]:
BASE_DIR = 'D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles'
LABELS = ['business', 'entertainment', 'politics', 'sport', 'tech']
def create_data_set():
    with open('data.txt', 'w', encoding = 'utf8') as outfile:
        for label in LABELS:
            dir = '%s/%s' % (BASE_DIR, label)
            for filename in os.listdir(dir):
                fullfilename = '%s/%s' % (dir, filename)
                print(fullfilename)
                with open(fullfilename, 'rb') as file:
                    text = file.read().decode(errors='replace').replace('\n', '')
                    outfile.write('%s\t%s\t%s\n' % (label, filename, text))

In [60]:
def setup_docs():
    docs = []
    with open('data.txt', 'r', encoding='utf8') as datafile:
        for row in datafile:
            parts = row.split('\t')
            doc = (parts[0], parts[2].strip())
            docs.append(doc)
    return docs

In [79]:
def clean_text(text):
    #remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #convert into lower case
    text = text.lower()
    return text

In [91]:
def get_token(text):
    tokens =  word_tokenize(text)
    #remove stop words
    tokens = [t for t in tokens if not t in stop_words]
    return tokens

In [92]:
def print_frequency_dist(docs):
    tokens = defaultdict(list)

    for doc in docs:
        doc_label = doc[0]
        doc_text = clean_text(doc[1])
        doc_tokens = get_token(doc_text)
        tokens[doc_label].extend(doc_tokens)

    for category_label, category_tokens in tokens.items():
        print(category_label)
        fd = FreqDist(category_tokens)
        print(fd.most_common(20))

In [96]:
def get_splits(docs):
    random.shuffle(docs)
    x_train = []
    y_train = []
    x_dev = []
    y_dev = []
    x_test = []
    y_test = []
    pivot = int(0.80 * len(docs))
    for i in range(0, pivot):
        x_train.append(docs[i][1])
        y_train.append(docs[i][0])
        
    for i in range(pivot, len(docs)):
        x_test.append(docs[i][1])
        y_test.append(docs[i][0])
        
    return x_train, x_test, y_train, y_test

In [163]:
def evaluate_classifier(title, classifier, vectorizer, x_test, y_test):
    x_test_tfidf = vectorizer.transform(x_test)
    y_pred = classifier.predict(x_test_tfidf)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='weighted')
    recall = metrics.recall_score(y_test, y_pred, average='weighted')
    f1 = metrics.f1_score(y_test, y_pred, average='weighted')
    print("%s\t%f\t%f\t%f\t%f\n" % (title, accuracy, precision, recall, f1))

In [117]:
x_train, x_test, y_train, y_test = get_splits(docs)

In [133]:
# Import GridSearch from the sklearn library
from sklearn.model_selection import GridSearchCV
# The param_grid specifies one grid should be explored
param_grid = {'C': [1, 10, 100, 1000]}
# Fit on the dataset on all parameter combinations in param_grid
# Retain the best combination
grid_search = GridSearchCV(SVC(kernel = 'linear'), param_grid, cv = 5)

In [134]:
# Train model while tuning the parameters using grid search
grid_result = grid_search.fit(dtm, y_train)

In [135]:
grid_result

GridSearchCV(cv=5, estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 10, 100, 1000]})

# Cross Validation 

In [148]:
vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
dtm = vectorizer.fit_transform(x_train)
svmlinear = SVC(kernel = 'linear', C = 1)
# Use 5-fold cross validation to perform training and validation on the training set
# Parameter scoring = 'accuracy' will compute accuracy
accuracy = cross_val_score(svmlinear, dtm, y_train, cv = 5, scoring = 'accuracy')
precision = cross_val_score(svmlinear, dtm, y_train, cv = 5, scoring = 'precision_weighted')
recall = cross_val_score(svmlinear, dtm, y_train, cv = 5, scoring = 'recall_weighted')
f1 = cross_val_score(svmlinear, dtm, y_train, cv = 5, scoring = 'f1_weighted')
# Display the array containing accuracy from 10 folds or iterations
print('validation accuracy score (SVM) =', accuracy.mean())
print('validation precision score (SVM) =', precision.mean())
print('validation recall score (SVM) =', recall.mean())
print('validation f1 score (SVM) =', f1.mean())

validation accuracy score (SVM) = 0.9713483146067416
validation precision score (SVM) = 0.9720481469436534
validation recall score (SVM) = 0.9713483146067416
validation f1 score (SVM) = 0.9713320894054217


In [169]:
vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
dtm = vectorizer.fit_transform(x_train)
svmpoly = SVC(kernel = 'poly', C = 1)
# Use 5-fold cross validation to perform training and validation on the training set
# Parameter scoring = 'accuracy' will compute accuracy
accuracy = cross_val_score(svmpoly, dtm, y_train, cv = 5, scoring = 'accuracy')
precision = cross_val_score(svmpoly, dtm, y_train, cv = 5, scoring = 'precision_weighted')
recall = cross_val_score(svmpoly, dtm, y_train, cv = 5, scoring = 'recall_weighted')
f1 = cross_val_score(svmpoly, dtm, y_train, cv = 5, scoring = 'f1_weighted')
# Display the array containing accuracy from 10 folds or iterations
print('validation accuracy score (SVMP) =', accuracy.mean())
print('validation precision score (SVMP) =', precision.mean())
print('validation recall score (SVMP) =', recall.mean())
print('validation f1 score (SVMP) =', f1.mean())

validation accuracy score (SVMP) = 0.46292134831460674
validation precision score (SVMP) = 0.8176931047190156
validation recall score (SVMP) = 0.46292134831460674
validation f1 score (SVMP) = 0.45176648147274767


In [166]:
vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
dtm = vectorizer.fit_transform(x_train)
nb_classifier = MultinomialNB(alpha = 1.0)
# Use 5-fold cross validation to perform training and validation on the training set
# Parameter scoring = 'accuracy' will compute accuracy
accuracy = cross_val_score(nb_classifier, dtm, y_train, cv = 5, scoring = 'accuracy')
precision = cross_val_score(nb_classifier, dtm, y_train, cv = 5, scoring = 'precision_weighted')
recall = cross_val_score(nb_classifier, dtm, y_train, cv = 5, scoring = 'recall_weighted')
f1 = cross_val_score(nb_classifier, dtm, y_train, cv = 5, scoring = 'f1_weighted')
# Display the array containing accuracy from 10 folds or iterations
print('validation accuracy score (NB) =', accuracy.mean())
print('validation precision score (NB) =', precision.mean())
print('validation recall score (NB) =', recall.mean())
print('validation f1 score (NB) =', f1.mean())

validation accuracy score (NB) = 0.9792134831460674
validation precision score (NB) = 0.9794949487256588
validation recall score (NB) = 0.9792134831460674
validation f1 score (NB) = 0.9792370722404528


In [171]:
vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
dtm = vectorizer.fit_transform(x_train)
lg = LogisticRegression(penalty='none')
# Use 5-fold cross validation to perform training and validation on the training set
# Parameter scoring = 'accuracy' will compute accuracy
accuracy = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'accuracy')
precision = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'precision_weighted')
recall = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'recall_weighted')
f1 = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'f1_weighted')
# Display the array containing accuracy from 10 folds or iterations
print('validation accuracy score (LG) =', accuracy.mean())
print('validation precision score (LG) =', precision.mean())
print('validation recall score (LG) =', recall.mean())
print('validation f1 score (LG) =', f1.mean())

validation accuracy score (LG) = 0.9780898876404495
validation precision score (LG) = 0.978491957179523
validation recall score (LG) = 0.9780898876404495
validation f1 score (LG) = 0.9781030301984698


In [173]:
vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
dtm = vectorizer.fit_transform(x_train)
lg = LogisticRegression(penalty='l2')
# Use 5-fold cross validation to perform training and validation on the training set
# Parameter scoring = 'accuracy' will compute accuracy
accuracy = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'accuracy')
precision = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'precision_weighted')
recall = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'recall_weighted')
f1 = cross_val_score(lg, dtm, y_train, cv = 5, scoring = 'f1_weighted')
# Display the array containing accuracy from 10 folds or iterations
print('validation accuracy score (LG l2) =', accuracy.mean())
print('validation precision score (LG l2) =', precision.mean())
print('validation recall score (LG l2) =', recall.mean())
print('validation f1 score (LG l2) =', f1.mean())

validation accuracy score (LG l1) = 0.9747191011235955
validation precision score (LG l1) = 0.9753022322291571
validation recall score (LG l1) = 0.9747191011235955
validation f1 score (LG l1) = 0.9746939124636217


# Def for result generation 

In [None]:
def evaluate_classifier(title, classifier, vectorizer, x_test, y_test):
    x_test_tfidf = vectorizer.transform(x_test)
    y_pred = classifier.predict(x_test_tfidf)
    precision = metrics.precision_score(y_test, y_pred, average='macro')
    recall = metrics.recall_score(y_test, y_pred, average='macro')
    f1 = metrics.f1_score(y_test, y_pred, average='macro')
    print("%s\t%f\t%f\t%f\n" % (title, precision, recall, f1))

In [152]:
def nb_classifer(docs):
    x_train, x_test, y_train, y_test = get_splits(docs)
    vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
    dtm = vectorizer.fit_transform(x_train)
    nb_classifier = MultinomialNB().fit(dtm, y_train)
    evaluate_classifier("Naive Bayes\tTest\t", nb_classifier, vectorizer, x_test, y_test)

In [157]:
def SVM(docs):
    x_train, x_test, y_train, y_test = get_splits(docs)
    vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
    dtm = vectorizer.fit_transform(x_train)
    SVM = svmlinear.fit(dtm, y_train)
    evaluate_classifier("SVM\tTest\t", SVM, vectorizer, x_test, y_test)

In [174]:
def SVMP(docs):
    x_train, x_test, y_train, y_test = get_splits(docs)
    vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
    dtm = vectorizer.fit_transform(x_train)
    SVMP = svmpoly.fit(dtm, y_train)
    evaluate_classifier("SVMP\tTest\t", SVMP, vectorizer, x_test, y_test)

In [177]:
def LGN(docs):
    x_train, x_test, y_train, y_test = get_splits(docs)
    vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
    dtm = vectorizer.fit_transform(x_train)
    lg = LogisticRegression(penalty='none')
    lgn = lg.fit(dtm, y_train)
    evaluate_classifier("LG(none)\tTest\t", lgn, vectorizer, x_test, y_test)

In [180]:
def LG2(docs):
    x_train, x_test, y_train, y_test = get_splits(docs)
    vectorizer = CountVectorizer(stop_words = 'english',
                                ngram_range = (1, 3),
                                min_df = 3, analyzer = 'word')
    dtm = vectorizer.fit_transform(x_train)
    lg = LogisticRegression(penalty='l2')
    lg2 = lg.fit(dtm, y_train)
    evaluate_classifier("LG(2)\tTest\t", lg2, vectorizer, x_test, y_test)

In [61]:
create_data_set()

D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/001.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/002.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/003.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/004.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/005.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/006.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/007.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/008.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/business/009.txt
D:/USM/Data Science/CDS 522 

D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/148.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/149.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/150.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/151.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/152.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/153.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/154.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/entertainment/155.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/enter

D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/383.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/384.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/385.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/386.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/387.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/388.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/389.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/390.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/politics/391.txt
D:/USM/Data Science/CDS 522 

D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/102.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/103.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/104.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/105.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/106.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/107.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/108.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/109.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC News Summary/News Articles/tech/110.txt
D:/USM/Data Science/CDS 522 speech/Project/BBC News Summary/BBC 

In [62]:
docs = setup_docs()

Result before removing stopwords and punctuations

In [69]:
print_frequency_dist(docs)

business
[('the', 9057), (',', 7440), ('.', 6516), ('to', 5046), ('of', 4369), ('in', 4056), ('a', 3244), ('and', 3148), ("'s", 2023), ('said', 1623), ('``', 1621), ('is', 1614), ('for', 1567), ('that', 1559), ("''", 1417), ('on', 1302), ('%', 1290), ('has', 1255), ('The', 1232), ('it', 1137)]
entertainment
[('the', 6474), (',', 5927), ('.', 5046), ('and', 2939), ('of', 2927), ('to', 2864), ('in', 2664), ('a', 2518), ("'s", 1833), ('for', 1514), ('``', 1482), ('The', 1452), ("''", 1187), ('was', 1148), ('on', 1123), ('is', 1090), ('with', 918), ('at', 802), ('said', 793), ('that', 749)]
politics
[('the', 10599), ('.', 7158), (',', 6539), ('to', 5979), ('of', 4457), ('and', 3804), ('a', 3702), ('``', 3595), ('in', 3144), ("''", 2680), ('said', 2152), ('for', 1877), ('that', 1856), ('is', 1850), ('on', 1774), ("'s", 1726), ('be', 1672), ('was', 1589), ('he', 1557), ('Mr', 1532)]
sport
[('the', 8665), ('.', 7434), (',', 6942), ('to', 4678), ('a', 3683), ('in', 3546), ('and', 3472), ('``',

After remove punctuation

In [83]:
print_frequency_dist(docs)

business
[('the', 10245), ('to', 5063), ('of', 4377), ('in', 4264), ('a', 3356), ('and', 3197), ('said', 1618), ('is', 1608), ('for', 1600), ('that', 1582), ('it', 1373), ('on', 1344), ('has', 1252), ('its', 1150), ('by', 1090), ('as', 922), ('at', 917), ('was', 916), ('with', 895), ('be', 870)]
entertainment
[('the', 7902), ('and', 3005), ('of', 2965), ('to', 2892), ('in', 2776), ('a', 2672), ('for', 1540), ('on', 1160), ('was', 1140), ('is', 1080), ('with', 939), ('at', 833), ('it', 826), ('said', 789), ('he', 764), ('that', 741), ('film', 698), ('as', 697), ('has', 695), ('be', 679)]
politics
[('the', 11720), ('to', 5990), ('of', 4466), ('and', 3950), ('a', 3830), ('in', 3324), ('said', 2138), ('he', 2108), ('for', 1909), ('that', 1892), ('is', 1834), ('on', 1826), ('be', 1670), ('was', 1575), ('mr', 1505), ('it', 1391), ('not', 1131), ('as', 1078), ('have', 1052), ('would', 1051)]
sport
[('the', 9372), ('to', 4705), ('a', 3834), ('and', 3657), ('in', 3637), ('of', 2800), ('for', 17

Stop words removal

In [95]:
print_frequency_dist(docs)

business
[('us', 753), ('year', 571), ('would', 463), ('also', 439), ('new', 410), ('market', 400), ('growth', 363), ('company', 362), ('last', 356), ('economy', 327), ('firm', 313), ('could', 311), ('bank', 306), ('economic', 303), ('sales', 302), ('government', 294), ('oil', 287), ('2004', 282), ('years', 263), ('may', 246)]
entertainment
[('film', 698), ('best', 582), ('music', 413), ('also', 398), ('us', 348), ('one', 340), ('years', 326), ('new', 315), ('show', 287), ('first', 250), ('last', 248), ('awards', 246), ('year', 240), ('number', 227), ('award', 220), ('uk', 206), ('films', 203), ('two', 202), ('director', 201), ('tv', 197)]
politics
[('would', 1051), ('government', 635), ('labour', 587), ('people', 584), ('election', 517), ('blair', 495), ('party', 468), ('also', 450), ('new', 425), ('could', 384), ('minister', 381), ('told', 358), ('brown', 324), ('public', 314), ('plans', 289), ('howard', 286), ('uk', 286), ('one', 285), ('prime', 276), ('say', 266)]
sport
[('game', 4

# Result

In [165]:
nb_classifer(docs)

Naive Bayes	Test		0.979775	0.980277	0.979775	0.979776



In [164]:
SVM(docs)

SVM	Test		0.968539	0.968770	0.968539	0.968563



In [175]:
SVMP(docs)

SVMP	Test		0.456180	0.814683	0.456180	0.448532



In [179]:
LGN(docs)

LG(none)	Test		0.979775	0.979906	0.979775	0.979746



In [181]:
LG2(docs)

LG(2)	Test		0.984270	0.984373	0.984270	0.984297

