In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import warnings
import os 
import re

def getCorpus(link,tag):
    html = urlopen(link).read().decode('utf-8','ignore')
    soup = BeautifulSoup(html, features='lxml')
    all_href = soup.find_all(tag)
    
    corpus = []
    for i in all_href:
        text =  i.get_text()
        cleaned = re.sub(r'\n','',text)
        corpus.append(cleaned)
    
    return corpus
 

In [2]:
positivecorpus = getCorpus('https://storm.cis.fordham.edu/~yli/data/electronics/positive.review','review_text')
negativecorpus = getCorpus('https://storm.cis.fordham.edu/~yli/data/electronics/negative.review','review_text')

In [3]:
SPECIAL_WORDS = {'PADDING': '<PAD>'}
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer 
from collections import Counter
import string
ps = PorterStemmer() 
flatten = lambda l: [item for sublist in l for item in sublist]

class Dataset(object):
    def __init__(self,corpus,tokenizer,stemmer,
                 remove_stop = True, 
                 keep_pantuation = True):
        super(Dataset,self)
        self.token_lookup = self._lookup()
        self.vocab2idx = None
        self.idx2vocab = None
        self.word_counter = None
        self._flatten = None
        self.tokenizer = tokenizer
        self.tokens = None
        self.int_text = None
        self.stemmer = stemmer
        self.keep_pantuation = keep_pantuation
        self.remove_stop = remove_stop
        self.corpus = self._update(corpus)
        
        
        
    def _lookup(self):
        ''' lookup table to keep the puntuation !'''
        answer = {'.' : '||period||',
                  ',' : '||comma||',
                  '"' : '||quotation_mark||',
                  ';' : '||semicolon||',
                  '!' : '||exclamation_mark||',
                  '?' : '||question_mark||',
                  '(' : '||left_Parentheses||',
                  ')' : '||right_Parentheses||',
                  #'\n': '||return||',
                  '-' : '||dash||'}
        return answer
        
    def _update(self,text):
        #text = flatten(text)
        #print(text)
        if self.keep_pantuation:
            text = [self.preprocessing(t) for t in text]
        else:
            text = [re.sub(r"[{}]+".format(string.punctuation),'',t)  for t in text]
        #print(text)
            
        text = [self.stemmer.stem(t.lower()) for t in text]

        tokens = [self.tokenizer(t) for t in text]
        tokens = flatten(tokens)
        
        print(len(tokens))
        if self.remove_stop:
            tokens = self.remove(tokens)
        
        self.tokens = tokens
        #print(tokens)
        self.word_counter = Counter(self.tokens)
        self.vocab2idx, self.idx2vocab = self.create_lookup_tables(text + list(SPECIAL_WORDS.values()))
        self.int_text = [self.vocab2idx[word] for word in text]
        
        return text
        
    def create_lookup_tables(self,text):
        vocab_to_int = { v:i+2 for i,v in enumerate(set(text))}
        vocab_to_int['<START>'] = 0
        vocab_to_int['<end>'] = 1
        int_to_vocab = { v:k for k,v in vocab_to_int.items()}
        # return tuple
        return (vocab_to_int, int_to_vocab)
            
            
    def remove(self,text):
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in text if word not in stopwords.words('english')]
        
        return filtered_words
        
            
    def preprocessing(self,text):
        for key, token in self.token_lookup.items():
            text = text.replace(key, ' {} '.format(token))
        return text
        
        
    def getmostwords(self,k):
        
        return sorted(self.word_counter.most_common(k), key=lambda x: x[1])
    
    def getToken(self):
        return self.tokens
    
    def getTokenset(self):
        return set(self.tokens)
            
        
        
    #def _get_vocab(self)

In [4]:
p = Dataset(corpus=positivecorpus, tokenizer=word_tokenize, stemmer=ps,keep_pantuation= False)
n = Dataset(corpus=positivecorpus, tokenizer=word_tokenize, stemmer=ps,keep_pantuation= False)

107226
107226


In [5]:
p.getmostwords(20)

[('works', 184),
 ('ipod', 185),
 ('ive', 193),
 ('used', 202),
 ('phone', 203),
 ('bought', 206),
 ('price', 217),
 ('dont', 224),
 ('product', 230),
 ('well', 236),
 ('also', 240),
 ('get', 256),
 ('would', 258),
 ('quality', 274),
 ('like', 284),
 ('sound', 355),
 ('good', 399),
 ('great', 402),
 ('one', 429),
 ('use', 434)]

In [6]:
class Models(object):
    def __init__(self,X,y):
        self.X = X
        self.y = y