## Pipeline Functions

In [None]:
# Load dataset
def load_data(path="./data/reviews-dataset.csv"):
    """Load the reviews dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to
            ``./data/reviews-dataset.csv`` (the path that used to be
            hard-coded), so callers that pass no argument are unaffected.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.
    """
    return pd.read_csv(path)

In [None]:
# get preprocessed data
def get_preprocessed_data():
    """Load the reviews dataset and return it after preprocessing."""
    return preprocess_data(load_data())

In [None]:
# Category wise Reviews
def category_reviews():
    """Return a dict mapping each product category to its number of reviews."""
    reviews = load_data()
    counts_by_category = reviews.product_category.value_counts()
    return counts_by_category.to_dict()

In [None]:
def get_total_reviews():
    """Return the number of rows in the loaded reviews dataset."""
    row_count, _ = load_data().shape
    return row_count

In [None]:
# Remove stopwords
def remove_stopwords(tokens):
    """Drop English stopwords from a sequence of tokens.

    Args:
        tokens: Iterable of word tokens. The extra stopwords added below
            are written without apostrophes, matching tokens produced
            after punctuation has been stripped.

    Returns:
        list: The tokens that are not stopwords, in their original order.
    """
    # NLTK corpus file ids are lower-case; "English" cannot be resolved on
    # case-sensitive file systems.
    en_stopwords = set(stopwords.words("english"))
    en_stopwords.update(['im', 'its', 'youre', 'thing', 'cant', 'dont', 'doesnt'])

    # A set gives constant-time membership tests for every token.
    return [t for t in tokens if t not in en_stopwords]

In [None]:
# Extract adjectives
def extract_adj(tokens):
    """Return the words whose part-of-speech tag marks them as adjectives.

    Args:
        tokens: Sequence of ``(word, tag)`` pairs.

    Returns:
        list: The words tagged ``JJ``, ``JJR`` or ``JJS``, in input order.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return [pair[0] for pair in tokens if pair[1] in adjective_tags]

In [None]:
# Preprocessing steps
def preprocess_data(data=None):
    """Clean the review text and derive token, POS-tag and adjective columns.

    Args:
        data: DataFrame with a ``product_review`` text column. When omitted,
            the dataset is obtained from ``load_data()``.

    Returns:
        The same DataFrame, modified in place: ``product_review`` is
        lower-cased with punctuation removed, and the columns
        ``product_review_tokenized``, ``cleaned_tokens``,
        ``product_review_cleaned``, ``POS_tokens`` and ``adjectives``
        are added.
    """
    # A call expression is not a valid parameter; fall back to the loader
    # only when the caller did not supply a frame.
    if data is None:
        data = load_data()

    # Convert the reviews to lower case
    data.product_review = data.product_review.str.lower()

    # Remove punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    data.product_review = data.product_review.str.translate(punctuation_table)

    # Tokenize words
    data['product_review_tokenized'] = data.product_review.apply(nltk.word_tokenize)

    # Remove stopwords
    data['cleaned_tokens'] = data.product_review_tokenized.apply(remove_stopwords)

    # Stringify cleaned tokens
    data['product_review_cleaned'] = data.cleaned_tokens.apply(' '.join)

    # POS tagging (on the full token list, stopwords included)
    data['POS_tokens'] = data.product_review_tokenized.apply(nltk.pos_tag)

    # Extract adjectives
    data['adjectives'] = data.POS_tokens.apply(extract_adj)

    return data

In [None]:
# Get word cloud
def get_wordcloud(category):
    """Display a word cloud of the adjectives found in one category's reviews.

    Args:
        category: Value matched against the ``product_category`` column.
    """
    reviews = get_preprocessed_data()
    in_category = reviews[reviews.product_category == category]

    # Each row contributes its adjectives followed by a single space.
    text = ''.join(' '.join(words) + ' ' for words in in_category.adjectives)

    cloud = WordCloud(width=800, height=600, background_color='white')
    rendered = cloud.generate(text)
    plt.imshow(rendered)
    plt.axis('off')
    plt.show()

In [None]:
#get sentiments
def polarity_score(review):
    """Classify a review as ``"positive"``, ``"negative"`` or ``"neutral"``.

    Args:
        review: Review text to score.

    Returns:
        str: ``"positive"`` when the compound score is above 0.05,
        ``"negative"`` when it is below -0.05, otherwise ``"neutral"``.
    """
    # Reuse one analyzer across calls: this function is applied once per
    # review, and a fresh analyzer is not needed for each of them.
    analyzer = getattr(polarity_score, "_analyzer", None)
    if analyzer is None:
        analyzer = polarity_score._analyzer = SentimentIntensityAnalyzer()

    # Extract the compound score from the sentiment polarity scores
    compound = analyzer.polarity_scores(review)['compound']

    # Symmetric thresholds around zero; the previous negative cut-off of
    # -0.5 labelled most mildly negative reviews as neutral.
    if compound > 0.05:
        return "positive"
    if compound < -0.05:
        return "negative"
    return "neutral"

In [None]:
def get_sent_analysis():
    """Count reviews per product category and sentiment label.

    Returns:
        dict: ``DataFrame.to_dict()`` of a frame with the columns
        ``product_category``, ``sentiment`` and ``counts``.
    """
    # The helper is named get_preprocessed_data; get_processed_data does
    # not exist and raised NameError.
    data = get_preprocessed_data()

    # Label sentiments
    data['sentiment'] = data.product_review_cleaned.apply(polarity_score)

    # Get results
    counts = data.groupby(["product_category", "sentiment"]).size().reset_index(name="counts")
    return counts.to_dict()

In [None]:
class pipeline:
    """Object-oriented variant of the review preprocessing steps.

    The dataset is read once when the object is created and kept on
    ``self.data``; the methods operate on that frame.
    """

    def __init__(self, path="./data/reviews-dataset.csv"):
        """Read the reviews CSV into ``self.data``.

        Args:
            path: Location of the CSV file. Defaults to the path that used
                to be hard-coded, so ``pipeline()`` behaves as before.
        """
        self.data = pd.read_csv(path)

    def load_data(self):
        """Return the DataFrame held by this pipeline."""
        return self.data

    def category_reviews(self):
        """Return a dict mapping each product category to its review count."""
        return self.data.product_category.value_counts().to_dict()

    @staticmethod
    def remove_stopwords(tokens):
        """Return ``tokens`` without English stopwords, order preserved."""
        # NLTK corpus file ids are lower-case; "English" cannot be resolved
        # on case-sensitive file systems.
        en_stopwords = set(stopwords.words("english"))
        en_stopwords.update(['im', 'its', 'youre', 'thing', 'cant', 'dont', 'doesnt'])

        return [t for t in tokens if t not in en_stopwords]

    def preprocess_data(self):
        """Clean ``product_review`` in place and add the derived columns.

        Adds ``product_review_tokenized``, ``cleaned_tokens`` and
        ``product_review_cleaned`` to ``self.data``.

        Returns:
            pipeline: ``self``, so calls can be chained.
        """
        # Convert the reviews to lower case
        self.data.product_review = self.data.product_review.str.lower()

        # Remove punctuation
        punctuation_table = str.maketrans('', '', string.punctuation)
        self.data.product_review = self.data.product_review.str.translate(punctuation_table)

        # Tokenize words
        self.data['product_review_tokenized'] = self.data.product_review.apply(nltk.word_tokenize)

        # Remove stopwords
        self.data['cleaned_tokens'] = self.data.product_review_tokenized.apply(self.remove_stopwords)

        # Stringify cleaned tokens
        self.data['product_review_cleaned'] = self.data.cleaned_tokens.apply(' '.join)

        return self