## Imports

In [1]:
import urllib
import requests
from bs4 import BeautifulSoup
import re
import os
import pandas as pd

from sklearn.feature_extraction.text import  CountVectorizer, TfidfVectorizer

from sklearn import naive_bayes, metrics, model_selection, linear_model

### Init 'global' variables

In [2]:
#ignore_dir = ['IT_company']
ignore_dir = []

routes = os.listdir('./SW')
routes = list(set(routes)-set(ignore_dir))

categories = routes


label_cat_dict = dict(zip(routes, range(0, len(routes))))

### Create dataframe from webpage files

In [3]:
def create_df(route, label):
    pages_df = pd.DataFrame(columns=['document', 'category'])
    files = os.listdir('./SW/'+route)
    for file, i in zip(files, range(0, len(files))):
        f = open('./SW/'+route + "/" + file, "rt", errors='ignore')
        
        page = f.read()
        f.close()
        soup_page = BeautifulSoup(page, "html.parser")
        page_content = soup_page.get_text()
        cleaned_page_content = re.sub("\W"," ",page_content)
        cleaned_page_content = re.sub(r" +"," ",cleaned_page_content)
        pages_df.loc[i] = (cleaned_page_content, label)
    return pages_df

### Read the data from differenct directories

In [4]:
all_pages = pd.DataFrame(columns=['document', 'category', 'label'])

train_x = pd.Series()
valid_x = pd.Series()
train_y = pd.Series()
valid_y = pd.Series()

for route, category in zip(routes, categories):
    label = label_cat_dict[category]
    pages_df = create_df(route, category)
    pages_df['label'] =label
    all_pages = all_pages.append(pages_df, ignore_index=True)
    
    train_x_, valid_x_, train_y_, valid_y_ = model_selection.train_test_split(pages_df['document'], pages_df['label'], test_size =  0.15)
    
    train_x = train_x.append(train_x_, ignore_index=True)
    valid_x = valid_x.append(valid_x_, ignore_index=True)
    train_y = train_y.append(train_y_, ignore_index=True)
    valid_y = valid_y.append(valid_y_, ignore_index=True)
    print(category, "-ready")

IT_company -ready
Goats -ready
BioMedical -ready
Sheep -ready
Bands -ready


### Word Count vectorizer

In [12]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(all_pages['document'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [34]:
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

### TF-IDF vectorizer

In [35]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(all_pages['document'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

### Naiv Bayes - word count

In [41]:
classifier = naive_bayes.MultinomialNB()

classifier.fit(xtrain_count, train_y)

predictions = classifier.predict(xvalid_count)

metrics.accuracy_score(predictions, valid_y)

0.8648648648648649

### Naiv Bayes - TF-IDF

In [45]:
classifier = naive_bayes.MultinomialNB()

classifier.fit(xtrain_tfidf, train_y)

predictions = classifier.predict(xvalid_tfidf)

metrics.accuracy_score(predictions, valid_y)

0.8513513513513513

### Linear - word count

In [52]:
classifier = linear_model.LogisticRegression(solver='liblinear',multi_class='auto')

classifier.fit(xtrain_count, train_y)

predictions = classifier.predict(xvalid_count)

metrics.accuracy_score(predictions, valid_y)

0.9324324324324325

### Linear - TF-IDF

In [47]:
classifier = linear_model.LogisticRegression(solver='lbfgs')

classifier.fit(xtrain_tfidf, train_y)

predictions = classifier.predict(xvalid_tfidf)

metrics.accuracy_score(predictions, valid_y)

0.9459459459459459

### TLDR

In [65]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import time
import random

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
word_dict = {}
start = time.time()
for index, row in all_pages.iterrows():
    words = re.split(' ', row['document'])
    
    for word in words:
        word = word.lower()
        if word not in set(stopwords.words('english')) and len(word) > 2:
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] = word_dict[word] + 1
    if index % 10 == 0:
        now = time.time()
        percent_ready = (index / all_pages.shape[0]) + 0.0000001
        print(index,'expected finish:', time.ctime(start + ((now-start) / percent_ready)), end='\r')

470 expected finish: Sun May 17 01:58:47 2020

In [55]:
word_dict_list = sorted(word_dict.items(), key=lambda x: x[1], reverse = True)
words_shrinked = word_dict_list[0:6000]
word_list = [w for (w,n) in words_shrinked]

In [56]:
def document_features(document):
    document_words = set(re.split(' ',document))
    features = {}
    for word in word_list:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [59]:
train_set = []
for ind, row in all_pages.iterrows():
    feature = (document_features(row['document']), row['category'])
    train_set.append(feature)
random.shuffle(train_set)

In [68]:
classifier_nb = nltk.NaiveBayesClassifier.train(train_set[0:400])

In [69]:
nltk.classify.accuracy(classifier_nb, train_set[400:len(train_set)])

0.6986301369863014

In [70]:
classifier_nb.show_most_informative_features(100)

Most Informative Features
          contains(mins) = True            Bands : IT_com =     74.0 : 1.0
          contains(true) = True           IT_com : BioMed =     66.1 : 1.0
        contains(script) = True           IT_com : BioMed =     61.4 : 1.0
       contains(display) = True           IT_com : BioMed =     55.3 : 1.0
    contains(background) = True           IT_com : BioMed =     54.6 : 1.0
         contains(media) = True           IT_com : BioMed =     53.3 : 1.0
        contains(global) = True           IT_com : BioMed =     52.6 : 1.0
        contains(center) = True           IT_com : BioMed =     50.5 : 1.0
         contains(songs) = True            Bands : IT_com =     48.2 : 1.0
          contains(open) = True           IT_com : BioMed =     47.8 : 1.0
         contains(ready) = True           IT_com : BioMed =     47.1 : 1.0
          contains(head) = True           IT_com : BioMed =     46.5 : 1.0
          contains(song) = True            Bands : IT_com =     44.4 : 1.0