In [81]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import os,glob
import re
from nltk.corpus import stopwords

In [37]:
source = [(r'data/Train_data/Nonsports/', 'NONSPORTS'),
          (r'data/Train_data/Sports/', 'SPORTS')
          ]

In [38]:
def build_data_frame(fpath, label):
    rows = []
    for file_name in os.listdir(fpath):
        path = os.path.join(fpath,file_name)
        rows.append({'text': path, 'labels': label })
    data_frame = pd.DataFrame(rows)
    return(data_frame)

In [39]:
data = pd.DataFrame({'text': [], 'labels':[]})

In [40]:
for dpath, classification in source:
    data = data.append(build_data_frame(dpath, classification))
#print(data)
train = data['text'].values
targets = data['labels'].values


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [41]:
vectorizer = CountVectorizer(ngram_range=(1,1),token_pattern=r'\b\w+\b', min_df=2,lowercase=True, stop_words="english",input='filename',decode_error='ignore', strip_accents='unicode')

In [42]:
X = vectorizer.fit_transform((train))

In [43]:
vocab = vectorizer.get_feature_names()

In [33]:
features = pd.DataFrame(X.A, columns=vectorizer.get_feature_names())

# Using multinomial Naive bayes

In [34]:
clf = MultinomialNB()

In [35]:
clf = clf.fit(X,targets)

# Testing on new data

In [44]:
test_path = 'data/Test_data/'

In [45]:
text_data = []
for path in glob.glob(test_path+'\*.txt'):
    text_data.append(path)


In [50]:
text_data

['data/Test_data\\NeW_non_sport.txt',
 'data/Test_data\\Non_Sports16.txt',
 'data/Test_data\\Sports1.txt',
 'data/Test_data\\Sports2.txt',
 'data/Test_data\\Sports3.txt',
 'data/Test_data\\test1.txt',
 'data/Test_data\\test2.txt']

In [46]:
Y = vectorizer.transform(text_data).toarray()

In [47]:
predict = clf.predict(Y)

In [48]:
print(predict)

['NONSPORTS' 'NONSPORTS' 'SPORTS' 'SPORTS' 'SPORTS' 'SPORTS' 'NONSPORTS']


In [53]:
result = list(zip(text_data,predict))

In [55]:
result

[('data/Test_data\\NeW_non_sport.txt', 'NONSPORTS'),
 ('data/Test_data\\Non_Sports16.txt', 'NONSPORTS'),
 ('data/Test_data\\Sports1.txt', 'SPORTS'),
 ('data/Test_data\\Sports2.txt', 'SPORTS'),
 ('data/Test_data\\Sports3.txt', 'SPORTS'),
 ('data/Test_data\\test1.txt', 'SPORTS'),
 ('data/Test_data\\test2.txt', 'NONSPORTS')]

# using Tf-idf vectorizer

In [58]:
import codecs

In [79]:
def clean_data(data):
    # Remove punctuations, digits and stop words
    # Removal of repeated contents
    valid_text=[]
    stop = set(stopwords.words('english'))

    for text in data:
        words=re.findall(r'\b[a-zA-Z][a-z]{2,9}\b',text)
        valid_words=[]
        for word in words:
            if word not in stop:
                valid_words.append(word)
        valid_text.append(' '.join(valid_words))
    return valid_text

In [70]:
my_train = [codecs.open(file,encoding='utf-8',errors='ignore').read() for file in data['text'].values]

In [71]:
my_labels = data['labels'].values

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
tfidf_vectorizer = TfidfVectorizer()

In [82]:
my_clean_train = clean_data(my_train)

In [84]:
my_clean_train[0]

'The Indian economy appears absorbed twin shocks hasty Goods Services Tax course healthy growth medium term Economic Survey says listing quite parameters show economy healing The number indirect taxpayers per cent lakh new income tax filers coming forward year Besides large increase voluntary small per cent rise direct tax Therefore despite large scale disruption businesses economy general led greater wider tax net This surely good news Besides reversal decline exports revival meant gross domestic product estimates improved The Survey pegs current fiscal growth per cent ahead per cent Central Office estimated earlier And next fiscal growth set per cent India may become fastest growing economy planet Chief economic adviser Arvind taken incessant two numbers growth likely closer wisely There many much point sticking one neck precise'

In [85]:
fit_tfidf = tfidf_vectorizer.fit_transform((my_clean_train))

In [86]:
classifier_tfidf = MultinomialNB()

In [87]:
classifier_tfidf = classifier_tfidf.fit(fit_tfidf,my_labels)

In [93]:
# predicting on new test data


In [77]:
my_test = [codecs.open(file,encoding='utf-8',errors='ignore').read() for file in text_data]

In [88]:
my_test_clean = clean_data(my_test)

In [89]:
test_tfidf = tfidf_vectorizer.transform(my_test_clean).toarray()

In [90]:
predict_tfidf = classifier_tfidf.predict(test_tfidf)

In [91]:
print(predict_tfidf)

['NONSPORTS' 'NONSPORTS' 'SPORTS' 'SPORTS' 'SPORTS' 'SPORTS' 'NONSPORTS']


In [94]:
result_tfidf = list(zip(text_data,predict_tfidf))

In [96]:
result_tfidf

[('data/Test_data\\NeW_non_sport.txt', 'NONSPORTS'),
 ('data/Test_data\\Non_Sports16.txt', 'NONSPORTS'),
 ('data/Test_data\\Sports1.txt', 'SPORTS'),
 ('data/Test_data\\Sports2.txt', 'SPORTS'),
 ('data/Test_data\\Sports3.txt', 'SPORTS'),
 ('data/Test_data\\test1.txt', 'SPORTS'),
 ('data/Test_data\\test2.txt', 'NONSPORTS')]