In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import joblib

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

In [2]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides pandas
# FutureWarning / SettingWithCopyWarning messages raised by the cells below —
# consider narrowing it to specific categories.
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Classifier used by get_sentiment() to predict a sentiment label per review.
# NOTE(review): the variable is called `linear_model` but the artifact is
# 'RF.joblib' — presumably a random forest; confirm and consider renaming.
# joblib artifacts are pickles: only load files from a trusted source.
linear_model = joblib.load('final_model/RF.joblib')

# Shared WordNet lemmatizer instance used by lemmatize().
lemmatizer = WordNetLemmatizer()

In [4]:
# function to remove HTML elements from HTML
def removeHTML(raw_text):
    """Strip HTML markup from ``raw_text`` and return the remaining text.

    This is a best-effort cleaner: if parsing fails for any reason the input
    is returned unchanged, so one malformed review does not abort the
    pipeline.

    Args:
        raw_text: review text that may contain HTML tags.

    Returns:
        The text content extracted by BeautifulSoup, or ``raw_text`` itself
        when it could not be parsed.
    """
    try:
        # No parser is named on purpose: BeautifulSoup keeps picking its
        # default parser, so the cleaned text is unchanged from before.
        return BeautifulSoup(raw_text).get_text()
    except Exception:
        # `except Exception` (not a bare `except:`) keeps the fallback but
        # lets KeyboardInterrupt / SystemExit propagate.
        return raw_text
    
# function to remove special characters and numbers from the reviews
def remove_special_char(raw_text):
    """Replace every character that is not an ASCII letter with one space.

    Digits, punctuation, whitespace and non-ASCII letters are each replaced
    by a single space, so the result has the same length as the input.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", raw_text)

# function to convert all reviews into lower case
def toLowerCase(raw_text):
    """Lower-case ``raw_text`` and collapse whitespace runs to single spaces.

    Leading and trailing whitespace is removed as a side effect of the
    split/join round trip.
    """
    return " ".join(raw_text.lower().split())

def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag falls back to
    wordnet.NOUN.
    """
    from nltk.corpus import wordnet

    # Checked in order; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

# function that converts english words to its base form (lemmatize)
def lemmatize(sentance):
    """Lemmatize a sequence of tokens using their POS tags.

    Falsy tokens (e.g. empty strings) are dropped first. The remaining tokens
    are tagged with ``pos_tag`` and each one is lemmatized with the WordNet
    part of speech that ``get_wordnet_pos`` derives from its tag.

    Args:
        sentance: iterable of word tokens.

    Returns:
        List of lemmas, in the order of the surviving tokens.
    """
    non_empty_tokens = [token for token in sentance if token]
    return [
        lemmatizer.lemmatize(word, pos = get_wordnet_pos(tag))
        for word, tag in pos_tag(non_empty_tokens)
    ]

# function to remove stop words from the reviews
def remove_stop_words(raw_text):
    """Return the tokens of ``raw_text`` that are not English stop words.

    Args:
        raw_text: iterable of tokens (despite the name, not a raw string).

    Returns:
        List of the tokens absent from NLTK's English stop-word list, in
        their original order.
    """
    english_stops = set(stopwords.words("english"))
    return list(filter(lambda token: token not in english_stops, raw_text))

# function which combines all above operations to a single function
def customTokenizer(text):
    """Turn one raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order: strip HTML, replace non-letters with spaces, lower-case
    and collapse whitespace, tokenize, drop English stop words, lemmatize.
    """
    cleaned = toLowerCase(remove_special_char(removeHTML(text)))
    content_words = remove_stop_words(word_tokenize(cleaned))
    return lemmatize(content_words)

def get_sentiment_score(df:pd.DataFrame):
    """Return a likes-weighted sentiment score for a set of reviews.

    Every review gets a weight of ``likes + 1`` (a missing ``likes`` value
    counts as 0). Reviews labelled 2 add their weight, reviews labelled 0
    subtract it, and reviews labelled 1 only contribute to the total:

        score = (weight of label-2 rows - weight of label-0 rows) / total weight

    The input frame is not modified, and rows are matched by position, so
    the frame's index does not have to be 0..n-1.

    Args:
        df: frame with a 'likes' column and a 'sentiment' column whose values
            are 0, 1 or 2.

    Returns:
        The score as a float (within [-1, 1] when all likes are
        non-negative); NaN when the total weight is zero, e.g. for an empty
        frame.

    Raises:
        KeyError: if 'sentiment' holds a value other than 0, 1 or 2.
    """
    sentiments = df['sentiment']

    # Reject unknown labels with the exception type callers already saw.
    unexpected = sentiments[~sentiments.isin([0, 1, 2])]
    if not unexpected.empty:
        raise KeyError(unexpected.iloc[0])

    # Computed on a derived Series so the caller's 'likes' column keeps its NaNs.
    weights = df['likes'].fillna(0) + 1
    total_weight = weights.sum()
    if total_weight == 0:
        return float('nan')

    # Plain boolean arrays select by position, independent of the index labels.
    positive_weight = weights[(sentiments == 2).to_numpy()].sum()
    negative_weight = weights[(sentiments == 0).to_numpy()].sum()

    return float((positive_weight - negative_weight) / total_weight)

In [5]:
# Vectorizer used by get_sentiment() to turn review text into model features.
# NOTE(review): presumably this artifact was pickled with customTokenizer as
# its tokenizer, which would be why it is loaded only after the tokenizer
# functions above are defined — confirm before reordering cells or renaming
# those functions. joblib artifacts are pickles: load trusted files only.
tf_idf_vectorizer = joblib.load('final_model/TVEC.joblib')

In [6]:
def get_sentiment(df:pd.DataFrame):
    """Predict a sentiment label for every review and store it on ``df``.

    The 'content' column is vectorized with the module-level
    ``tf_idf_vectorizer`` and classified with the module-level
    ``linear_model``. The predictions are written to a 'sentiment' column on
    the frame that was passed in, and that same frame is returned.

    Args:
        df: frame with a 'content' column holding the review text.

    Returns:
        The input frame, now carrying a 'sentiment' column.
    """
    features = tf_idf_vectorizer.transform(df['content'])
    predicted_labels = linear_model.predict(features)
    df['sentiment'] = predicted_labels
    return df

In [24]:
# function to get weekwise sentiment scores for a dataframe containing multiple products
# all reviews of a product must have the same id
# input: DataFrame with columns 'id', 'date', 'price', 'source', 'likes', 'sentiment'
# output: DataFrame with one sentiment score per product per week of reviews

def get_weekwise_sentiment_scores(df: pd.DataFrame):
    """Compute one sentiment score per product per 7-day window of reviews.

    For every product (rows sharing an 'id') the reviews are bucketed into
    consecutive 7-day windows that start at the product's earliest 'date'.
    Each non-empty window is scored with ``get_sentiment_score``. Windows are
    numbered 1, 2, 3, ... in chronological order, counting only windows that
    contain at least one review.

    Every review is placed in a window, including those after the last full
    7-day boundary, so a product whose reviews span less than a week still
    gets one row.

    The product price is the 'price' of its earliest review; when that is 0,
    the price of the earliest review whose 'source' is "flipkart" is used
    instead (if there is one).

    Args:
        df: frame with columns 'id', 'date' (datetime), 'price', plus the
            columns ``get_sentiment_score`` reads ('likes', 'sentiment').
            'source' is only consulted when a product's first price is 0.

    Returns:
        DataFrame with columns 'id', 'week', 'score', 'price'; empty (with
        those columns) when no product has a dated review.
    """
    one_week = pd.Timedelta(days=7)
    records = []

    for _, product in df.groupby("id"):
        # drop=True keeps the old index from leaking in as an 'index' column.
        product = product.sort_values("date").reset_index(drop=True)

        price = product.loc[0, "price"]
        if price == 0:
            flipkart_prices = product.loc[product["source"] == "flipkart", "price"]
            if not flipkart_prices.empty:
                price = flipkart_prices.iloc[0]

        product_id = int(product.loc[0, "id"])

        # Zero-based index of the 7-day window each review falls into. Unlike
        # slicing between date_range boundaries, this also keeps the reviews
        # after the last full boundary (including the newest one).
        week_offsets = (
            (product["date"] - product["date"].min()) // one_week
        ).rename("week_offset")

        # groupby yields only non-empty windows, in ascending order.
        for week, (_, group) in enumerate(product.groupby(week_offsets), start=1):
            # Fresh 0..n-1 index and a separate frame object for the scorer.
            score = get_sentiment_score(group.reset_index(drop=True))
            records.append(
                {"id": product_id, "week": week, "score": score, "price": price}
            )

    # Build the result once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=["id", "week", "score", "price"])

In [55]:
# Load the scraped reviews and discard every row that has a missing value in
# any column.
# NOTE(review): because dropna() is unrestricted, rows with a missing 'likes'
# value are removed here, so the fillna(0) on 'likes' in
# get_sentiment_score() can never take effect for this data — confirm whether
# dropna(subset=[...]) was intended.
df = pd.read_csv('scraped_dataset/phone5.csv').dropna()
# Parse the 'time' strings as day-first dates into the 'date' column that
# get_weekwise_sentiment_scores() reads.
df['date'] = pd.to_datetime(df['time'], dayfirst=True)

In [56]:
# Add the predicted 'sentiment' column (get_sentiment writes it onto df and
# returns the same frame).
df = get_sentiment(df)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


In [57]:
# Aggregate to one row per product per week.
# NOTE(review): this rebinds `df` from the per-review frame to the per-week
# summary, so re-running this cell alone fails (the 'date' column is gone) —
# consider a separate name such as `weekly_scores`.
df = get_weekwise_sentiment_scores(df)

In [58]:
# Sanity check: list any product-weeks whose price is still 0 after the
# flipkart fallback (the recorded output below shows none).
df[df['price'] == 0]

Unnamed: 0,id,week,score,price


In [59]:
# Persist the weekly scores without the DataFrame index column.
df.to_csv("scraped_dataset/phone5_processed.csv", index=False)