In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import warnings
import os 
import re
import pandas as pd

def getCorpus(link,tag):
    html = urlopen(link).read().decode('utf-8','ignore')
    soup = BeautifulSoup(html, features='lxml')
    all_href = soup.find_all(tag)
    
    corpus = []
    for i in all_href:
        text =  i.get_text()
        cleaned = re.sub(r'\n','',text)
        corpus.append(cleaned)
    
    return corpus
 

In [2]:
positivecorpus = getCorpus('https://storm.cis.fordham.edu/~yli/data/electronics/positive.review','review_text')
negativecorpus = getCorpus('https://storm.cis.fordham.edu/~yli/data/electronics/negative.review','review_text')

In [3]:
SPECIAL_WORDS = {'PADDING': '<PAD>'}
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer 
from collections import Counter
import string
import pickle
ps = PorterStemmer() 
flatten = lambda l: [item for sublist in l for item in sublist]

class Dataset(object):
    def __init__(self,corpus,tokenizer,stemmer,
                 remove_stop = True, 
                 keep_pantuation = True):
        super(Dataset,self)
        self.token_lookup = self._lookup()
        self.vocab2idx = None
        self.idx2vocab = None
        self.word_counter = None
        self._flatten = None
        self.tokenizer = tokenizer
        self.tokens = None
        self.int_text = None
        self.stemmer = stemmer
        self.keep_pantuation = keep_pantuation
        self.remove_stop = remove_stop
        self.X = None
        self.corpus = self._update(corpus)
        
        
        
    def _lookup(self):
        ''' lookup table to keep the puntuation !'''
        answer = {'.' : '||period||',
                  ',' : '||comma||',
                  '"' : '||quotation_mark||',
                  ';' : '||semicolon||',
                  '!' : '||exclamation_mark||',
                  '?' : '||question_mark||',
                  '(' : '||left_Parentheses||',
                  ')' : '||right_Parentheses||',
                  #'\n': '||return||',
                  '-' : '||dash||'}
        return answer
        
    def _update(self,text):
        #text = flatten(text)
        #print(text)
        if self.keep_pantuation:
            text = [self.preprocessing(t) for t in text]
        else:
            text = [re.sub(r"[{}]+".format(string.punctuation),'',t)  for t in text]
        #print(text)
            
        text = [self.stemmer.stem(t.lower()) for t in text]

        tokens = [self.tokenizer(t) for t in text]
        
        
        if self.remove_stop:
            tokens = tokens = [self.remove(t) for t in tokens]
        self.X = tokens
        tokens = flatten(tokens)
        #print(len(tokens))
        self.tokens = tokens
        #print(tokens)
        self.word_counter = Counter(self.tokens)
        self.vocab2idx, self.idx2vocab = self.create_lookup_tables(text + list(SPECIAL_WORDS.values()))
        self.int_text = [self.vocab2idx[word] for word in text]
        
        return text
        
    def create_lookup_tables(self,text):
        vocab_to_int = { v:i+2 for i,v in enumerate(set(text))}
        vocab_to_int['<START>'] = 0
        vocab_to_int['<end>'] = 1
        int_to_vocab = { v:k for k,v in vocab_to_int.items()}
        # return tuple
        return (vocab_to_int, int_to_vocab)
            
            
    def remove(self,text):
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in text if word not in stopwords.words('english')]
        
        return filtered_words
        
            
    def preprocessing(self,text):
        for key, token in self.token_lookup.items():
            text = text.replace(key, ' {} '.format(token))
        return text
        
        
    def getmostwords(self,k):
        
        return sorted(self.word_counter.most_common(k), key=lambda x: x[1])
    
    def getToken(self):
        return self.tokens
    
    def getTokenset(self):
        return set(self.tokens) 
        
    def getdata(self,X,y):
        df = pd.DataFrame()
        df['review'] = X
        df['sentiment'] = y
        return df

In [4]:
p = Dataset(corpus=positivecorpus, tokenizer=word_tokenize, stemmer=ps,keep_pantuation= False)
n = Dataset(corpus=negativecorpus, tokenizer=word_tokenize, stemmer=ps,keep_pantuation= False)

In [5]:
p.getmostwords(20)

[('works', 184),
 ('ipod', 185),
 ('ive', 193),
 ('used', 202),
 ('phone', 203),
 ('bought', 206),
 ('price', 217),
 ('dont', 224),
 ('product', 230),
 ('well', 236),
 ('also', 240),
 ('get', 256),
 ('would', 258),
 ('quality', 274),
 ('like', 284),
 ('sound', 355),
 ('good', 399),
 ('great', 402),
 ('one', 429),
 ('use', 434)]

In [7]:
df_positive = p.getdata(p.X,y = 1)
df_negative = n.getdata(p.X,y = 0)
df = pd.concat([df_positive,df_negative],axis=0)
df = df.sample(frac=1).reset_index(drop=True)
df['review'] = df['review'].apply(lambda x : " ".join(x))

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,use pink circles buff cddvd leaves circular ma...,1
1,unit works great much faster transfers using u...,1
2,little bouncy running convenient product fits ...,0
3,fast shipping happy garmin tech support goodth...,1
4,headset works great comfortable wish better so...,0


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [10]:
x_train,x_test,y_train,y_test = train_test_split(df.review,df.sentiment,test_size = 0.3)
vectorizer = CountVectorizer()
v_x_train = vectorizer.fit_transform(x_train)
v_x_test = vectorizer.transform(x_test)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from datetime import datetime
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')

class Models(object):
    def __init__(self,X,y,kfolds):
        self.X = X
        self.y = y
        self.kfolds = kfolds
        
    def nb(self):
        nb = MultinomialNB()
        nb.fit(self.X,self.y)
        return nb
        
        
    def logistic(self):
        lr = make_pipeline(LogisticRegressionCV(cv=self.kfolds))
        lr.fit(self.X,self.y)

        return lr
    
    def main(self):
        print('START Fit')

        print(datetime.now(), 'NB')
        nb = self.nb()
        print('Training done !!')

        print(datetime.now(), 'logistics')
        logistic = self.logistic()
        print('Training done !!')
        
        return logistic,nb
        
    def save_model(self, model, path):
        with open(path, 'wb') as clf:
            pickle.dump(model, clf) 
              
              

In [12]:
from sklearn.model_selection import KFold, cross_val_score
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
models = Models(v_x_train,y_train,kfolds)
log,nb = models.main()

START Fit
2019-09-28 19:39:27.997897 NB
Training done !!
2019-09-28 19:39:28.003397 logistics
Training done !!
