In [7]:
import os
# Switch the kernel's working directory so later relative paths resolve inside this folder.
# NOTE(review): this is an absolute, machine-specific path; os.chdir raises if the folder
# does not exist, so the notebook will not run unchanged on another machine.
os.chdir('C:/Users/Ronny/Documents/CAP/Functions')

### Reading data

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split


def read_data(filename, review_column, rating_column, test_size=0.33, random_state=42):
    """
        **Details of the <read_data> function**
        Read a csv file into a dataframe and split the review-text column into a
        sample part and a test part, stratified on the rating column so both parts
        keep the rating distribution of the full file.

        Inputs :
                    1. filename      : filename or path of the csv file
                    2. review_column : column name containing the review text data
                    3. rating_column : column name containing the review rating data
                                       (used only for stratification)
                    4. test_size     : forwarded to train_test_split; defaults to 0.33,
                                       the value that used to be hard-coded
                    5. random_state  : seed forwarded to train_test_split; defaults to 42,
                                       the value that used to be hard-coded

        Output : It will return two dataframes as output, each holding only <review_column>.
                    1. sample dataframe
                    2. test dataframe

        Raises : KeyError if <review_column> or <rating_column> is not in the csv file.
    """

    review_df = pd.read_csv(filename)

    # Fail early with a message that names the missing column(s); the column
    # selection below would raise KeyError as well, only less readably.
    missing = [col for col in (review_column, rating_column) if col not in review_df.columns]
    if missing:
        raise KeyError("Column(s) not found in %s: %s" % (filename, missing))

    # Separating the Review text and the review rating columns
    review_data = review_df[[review_column]]
    rating_data = review_df[[rating_column]]

    # Only the review text is split; the ratings just drive the stratification,
    # so there is no need to split them too and throw the result away.
    sample, test = train_test_split(
        review_data,
        test_size=test_size,
        random_state=random_state,
        stratify=rating_data,
    )

    return sample, test



In [21]:
# Load the reviews CSV and split its 'Review Text' column into sample/test parts, stratified on 'Review Rating'.
# NOTE(review): absolute, machine-specific input path.
sample, test = read_data('C:/Users/Ronny/Documents/CAP/data/reviews.csv','Review Text', 'Review Rating')

In [22]:
# Preview the first 10 rows of the sample partition.
sample.head(10)

Unnamed: 0,Review Text
8640,I like so much .....
8803,It's very nice
8998,Don't check the reviews just go for it
9959,Flagship Phone is here... Superb...
7667,Excellent purchase.
4372,Camera\nSound\nBattery\nSpeed\nAnd descent bod...
4291,Best picture quality and sound quality with th...
2677,Hang too much.. Not worth expectation.. Could ...
2297,Great
7518,It is a beast. And I got this at 27k + added b...


### Data Cleaning, Noun and Adjective extraction

In [63]:
import spacy
import pandas as pd
# Shared spaCy pipeline used by noun_lemma below.
# NOTE(review): the 'en' shortcut name looks like a spaCy 2.x-era API — confirm the pinned spaCy version.
nlp = spacy.load('en')
import inflect
# From here on the name `inflect` refers to the engine instance, not the module
# (singular() calls inflect.singular_noun on it).
inflect = inflect.engine()

# NOTE(review): spacy.lemmatizer / LEMMA_INDEX / LEMMA_EXC / LEMMA_RULES also look like
# spaCy 2.x-era imports — confirm they exist in the installed version.
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

# Stand-alone rule-based lemmatizer used by lemmatize() below.
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

# Part-of-speech lookup for a token, with a fixed list of words that are always treated as nouns
def get_pos(tok):
    """Return the part-of-speech tag to use for ``tok``.

    Tokens whose text is exactly one of the listed words are reported as
    "NOUN" regardless of how they were tagged; every other token reports its
    own ``pos_`` attribute. The comparison is case-sensitive.
    """
    forced_nouns = ("rear", "front", "back", "sound", "backup", "mobile")
    if str(tok) in forced_nouns:
        return "NOUN"
    return tok.pos_

# Reduce a noun to its singular form using the inflect engine
def singular(noun):
    """Return the singular form of ``noun`` as a string.

    ``noun`` is converted with ``str`` first. When the engine's
    ``singular_noun`` reports ``False`` (no singular form produced), the
    converted text is returned unchanged.
    """
    text = str(noun)
    singular_form = inflect.singular_noun(text)
    if singular_form is False:
        return text
    return singular_form

def lemmatize(word):
    """Return the first lemma the module-level ``lemmatizer`` yields for ``word``.

    The word is always looked up with the 'VERB' part-of-speech tag.
    NOTE(review): noun_lemma passes nouns through this function — confirm
    that lemmatising them as verbs is intended.
    """
    lemmas = lemmatizer(word, u'VERB')
    return lemmas[0]

def noun_lemma(review):
    """Extract the distinct, normalised nouns of one review as a comma-separated string.

    Steps:
        1. Lower-case the review and split it into sentences with ``nlp``.
        2. Re-parse every sentence on its own and keep the tokens that
           ``get_pos`` reports as "NOUN", that are purely alphabetic and whose
           fine-grained tag is not "WP".
        3. Pass each distinct noun through ``lemmatize`` and then ``singular``.
        4. Drop results of two characters or fewer and remove duplicates.

    Inputs : review - the review text. Anything that is not a string (for
             example the NaN pandas uses for a missing cell) is treated as a
             review without nouns.

    Output : the nouns joined with ',' in alphabetical order, or '' when there
             are none. The order is sorted so that the same review always
             produces the same string; it previously followed set iteration
             order, which can change from one interpreter run to the next.
    """
    # Calling .lower() on NaN/None would raise AttributeError.
    if not isinstance(review, str):
        return ''

    nouns = set()
    for sent in nlp(review.lower()).sents:
        # Each sentence is parsed again in isolation, exactly as before, so the
        # tags used here come from the sentence-level parse.
        for tok in nlp(str(sent)):
            if get_pos(tok) == "NOUN" and tok.is_alpha and tok.tag_ != "WP":
                nouns.add(tok.lower_)

    normalised = {singular(lemmatize(noun)) for noun in nouns}
    return ','.join(sorted(noun for noun in normalised if len(noun) > 2))


def create_transaction(review_text_data, review_column="Review Text", output_path="transaction_doc.csv"):
    """Build the review/noun "transaction" table and optionally save it as csv.

    Inputs :
                1. review_text_data : dataframe containing the review text
                2. review_column    : name of the column holding the review text;
                                      defaults to "Review Text", the name that used
                                      to be hard-coded
                3. output_path      : csv file the table is written to; defaults to
                                      "transaction_doc.csv" (relative to the current
                                      working directory). Pass None to skip writing.

    Output : dataframe indexed like the input with two columns:
                1. Review_text : the original review text
                2. Nouns       : result of noun_lemma for that review
    """
    review_text = review_text_data.loc[:, review_column]

    transaction_doc = review_text.to_frame("Review_text")
    transaction_doc["Nouns"] = transaction_doc["Review_text"].apply(noun_lemma)

    if output_path is not None:
        transaction_doc.to_csv(output_path)
    return transaction_doc

    

In [64]:
# Extract the nouns of every review in the sample partition (create_transaction also writes transaction_doc.csv).
transaction_data = create_transaction(sample)

In [65]:
# Importing python packages

import pandas as pd

# Importing mlxtend packages for sparse matrix & apriori

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


def apriori_func(noun_list, max_len, min_support=0.01, output_path="sorted_apriori.csv"):
    """

    ** Details of the Apriori function **

    The purpose of this function is to read the noun list and calculate the support of the
    frequent noun itemsets of up to <max_len> nouns.

    Inputs:
            1. noun_list   : dataframe with a 'Nouns' column holding comma-separated nouns.
                             It is only read, never modified.
            2. max_len     : maximum number of nouns in an itemset (forwarded to apriori).
            3. min_support : minimum support forwarded to apriori; defaults to 0.01, the
                             value that used to be hard-coded.
            4. output_path : csv file the result is written to; defaults to
                             "sorted_apriori.csv". Pass None to skip writing.

    Outputs: It returns one output dataframe.
            1. frequent itemsets sorted by descending support; 'itemsets' holds each
               itemset as an alphabetically sorted list of nouns.

    """
    # Keep only the rows that actually have nouns. Missing values and empty strings are
    # both removed (the earlier if/elif handled only one of the two), and the filter works
    # on a selection so the caller's dataframe and its index stay untouched.
    nouns = noun_list['Nouns']
    has_nouns = nouns.notna() & (nouns != '')

    # Preparing the transactions as list of list for the sparse matrix
    transactions = [str(entry).split(',') for entry in nouns[has_nouns]]

    encoder = TransactionEncoder()
    encoded = encoder.fit(transactions).transform(transactions)
    basket = pd.DataFrame(encoded, columns=encoder.columns_)

    frequent = apriori(basket, min_support=min_support, use_colnames=True, max_len=max_len)
    sorted_apr = frequent.sort_values(['support'], ascending=False)

    # Convert every itemset to a list row by row, so each itemset stays next to its own
    # support value. Routing the column through set(...) gave no ordering guarantee.
    sorted_apr['itemsets'] = sorted_apr['itemsets'].map(sorted)

    if output_path is not None:
        sorted_apr.to_csv(output_path)

    return sorted_apr


In [66]:
# Frequent itemsets limited to a single noun (max_len=1).
app = apriori_func(transaction_data,1)

In [67]:
# Number of itemsets returned for max_len=1.
len(app)

79

In [49]:
# Frequent itemsets of up to two nouns (max_len=2).
# NOTE(review): this cell's execution count (49) is lower than the cells above it (63-67),
# so its saved output may come from an earlier kernel state — re-run top to bottom to confirm.
app1 = apriori_func(transaction_data,2)

In [50]:
# Number of itemsets returned for max_len=2.
len(app1)

193