In [20]:
import os
import pickle
import random
import string
from collections import defaultdict

import pandas as pd
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [21]:
# Root directory of the training articles; create_data_set combines it with
# each label to locate the files to read.
BASE_DIR = "C:/Users/97455/Downloads/exercise_task/exercise_task/data/train"

# Category names handled by the notebook.
LABELS = [
    'business',
    'entertainment',
    'politics',
    'sport',
    'tech',
]


In [22]:
# NLTK's English stop-word list, extended with the extra word 'said'.
stop_words = {*stopwords.words('english'), 'said'}


In [23]:
# Create a tab-separated data file with category, filename and text columns

In [24]:
def create_data_set(base_dir=None, labels=None, out_path='data.txt'):
    """Flatten the per-category article files into one tab-separated file.

    For every label, each file found in ``<base_dir>/<label>`` is read and
    written to ``out_path`` as one ``label<TAB>filename<TAB>text`` line.

    Args:
        base_dir: Directory containing one sub-directory per label.
            Defaults to the module-level ``BASE_DIR``.
        labels: Iterable of category names. Defaults to the module-level
            ``LABELS``.
        out_path: Path of the file to write. Defaults to ``'data.txt'``.

    Raises:
        OSError: If a label directory or an article file cannot be read, or
            ``out_path`` cannot be written.
    """
    if base_dir is None:
        base_dir = BASE_DIR
    if labels is None:
        labels = LABELS

    with open(out_path, 'w', encoding='utf8') as outfile:
        for label in labels:
            # os.path.join inserts the path separator that plain string
            # concatenation of base_dir + label was missing.
            label_dir = os.path.join(base_dir, label)
            # sorted() makes the row order independent of the file system.
            for filename in sorted(os.listdir(label_dir)):
                fullfilename = os.path.join(label_dir, filename)
                print(fullfilename)
                with open(fullfilename, 'rb') as file:
                    raw = file.read().decode(errors='replace')
                # Collapse every run of whitespace (newlines, carriage returns,
                # tabs) to a single space: words on adjacent lines stay
                # separated and the text cannot break the one-row-per-document,
                # tab-separated layout.
                text = ' '.join(raw.split())
                outfile.write('%s\t%s\t%s\n' % (label, filename, text))

In [25]:
# Split each row of the data file into a (label, text) pair

In [26]:
def setup_docs(data_path='data.txt'):
    """Load ``(label, text)`` pairs from the tab-separated data file.

    Each row is expected to look like ``label<TAB>filename<TAB>text``; the
    filename column is ignored.

    Args:
        data_path: Path of the file to read. Defaults to ``'data.txt'``.

    Returns:
        list[tuple[str, str]]: One ``(label, text)`` tuple per usable row, with
        surrounding whitespace stripped from the text.
    """
    docs = []
    with open(data_path, 'r', encoding='utf8') as datafile:
        for row in datafile:
            # Split on the first two tabs only, so a tab inside the article
            # text does not truncate it.
            parts = row.split('\t', 2)
            if len(parts) < 3:
                # Blank or malformed row: there is no text column to use.
                continue
            docs.append((parts[0], parts[2].strip()))

    return docs

In [27]:
# Build the list of all words for each category

In [28]:
def clean_text(text):
    """Normalise text for token counting.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with every character in ``string.punctuation`` removed
        and the remainder converted to lowercase.
    """
    # Remove punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase.
    text = text.lower()
    return text

In [29]:
def print_frequency_dist(docs):
    """Print the 20 most common tokens of each category.

    Args:
        docs: Iterable of sequences whose first item is the category label and
            whose second item is the document text.
    """
    # Maps each category label to the list of all tokens seen for it.
    tokens = defaultdict(list)

    for doc in docs:
        doc_label = doc[0]
        # Clean the document's text before tokenising it.
        doc_text = clean_text(doc[1])

        doc_tokens = word_tokenize(doc_text)
        tokens[doc_label].extend(doc_tokens)

    for category_label, category_tokens in tokens.items():
        print(category_label)
        fd = FreqDist(category_tokens)
        print(fd.most_common(20))
        
        
    
    
    

In [30]:
def get_splits(docs):
    """Randomly split documents into an 80% training and 20% test set.

    Args:
        docs: Sequence of ``(label, text)`` pairs. It is not modified; a
            shuffled copy is split instead.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)`` where the ``x_*`` lists
        hold texts and the ``y_*`` lists hold the matching labels. The two
        sets are disjoint.
    """
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(docs)
    random.shuffle(shuffled)

    pivot = int(.80 * len(shuffled))
    train_docs = shuffled[:pivot]
    # The test set starts at the pivot; starting at 0 would put every
    # training document into the test set as well.
    test_docs = shuffled[pivot:]

    x_train = [doc[1] for doc in train_docs]
    y_train = [doc[0] for doc in train_docs]
    x_test = [doc[1] for doc in test_docs]
    y_test = [doc[0] for doc in test_docs]
    return x_train, x_test, y_train, y_test

In [31]:
def evaluate_classifier(title, classifier, vectorizer, x_test, y_test, average='weighted'):
    """Print precision, recall and F1 of a classifier on labelled texts.

    Args:
        title: Text printed in front of the three scores.
        classifier: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        x_test: Iterable of raw text documents.
        y_test: True labels for ``x_test``.
        average: Averaging mode forwarded to the scikit-learn score functions.
            Their default, ``'binary'``, is rejected for multiclass targets,
            so a multiclass-capable mode is used by default.
    """
    x_test_dtm = vectorizer.transform(x_test)
    y_pred = classifier.predict(x_test_dtm)

    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    f1 = metrics.f1_score(y_test, y_pred, average=average)

    # Format the string before printing; applying % to print()'s return value
    # (None) raises TypeError.
    print("%s\t%f\t%f\t%f\n" % (title, precision, recall, f1))

In [32]:
def train_classifier(docs):
    """Train a Naive Bayes classifier on ``docs`` and pickle it to disk.

    The documents are split with ``get_splits``, a bag-of-n-grams
    ``CountVectorizer`` is fitted on the training texts, and a
    ``MultinomialNB`` model is fitted on the resulting matrix. Scores for the
    training and test sets are printed, then the classifier and the
    vectorizer are written to ``naive_bayes_classifier.pk1`` and
    ``count_vectorizer.pk1`` in the current working directory.

    Args:
        docs: Sequence of ``(label, text)`` pairs.
    """
    x_train, x_test, y_train, y_test = get_splits(docs)
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3), min_df=3, analyzer='word')
    # Create the document-term matrix.
    dtm = vectorizer.fit_transform(x_train)
    # Train the Naive Bayes classifier.
    naive_bayes_classifier = MultinomialNB().fit(dtm, y_train)
    # Each evaluation needs the labels that match the texts it is given.
    evaluate_classifier("Naive Bayes\tTRAIN\t", naive_bayes_classifier, vectorizer, x_train, y_train)
    evaluate_classifier("Naive Bayes\tTEST\t", naive_bayes_classifier, vectorizer, x_test, y_test)
    # Store the classifier; the file name must match the one classify() loads.
    clf_filename = 'naive_bayes_classifier.pk1'
    with open(clf_filename, 'wb') as clf_file:
        pickle.dump(naive_bayes_classifier, clf_file)
    # Store the vectorizer so new data can be transformed the same way.
    vec_filename = 'count_vectorizer.pk1'
    with open(vec_filename, 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)
    

In [33]:
def classify(text):
    """Print the predicted category of a single text.

    Loads the pickled classifier and vectorizer from
    ``naive_bayes_classifier.pk1`` and ``count_vectorizer.pk1`` in the current
    working directory.

    Args:
        text: The raw document to classify, as one string.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that come from a trusted source.
    clf_filename = 'naive_bayes_classifier.pk1'
    with open(clf_filename, 'rb') as clf_file:
        nb_clf = pickle.load(clf_file)

    # Vectorize the new text with the stored vectorizer.
    vec_filename = 'count_vectorizer.pk1'
    with open(vec_filename, 'rb') as vec_file:
        vectorizer = pickle.load(vec_file)

    # transform() expects an iterable of documents, so the single string is
    # wrapped in a list instead of being passed bare.
    pred = nb_clf.predict(vectorizer.transform([text]))

    print(pred[0])