In [None]:
import re
import emoji
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from unicodedata import normalize
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gaborro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/gaborro/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gaborro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def preprocess_text(text):
        text = re.sub(r'http\S+|www\S+|@\w+|#', '', text)
        text = normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        text = text.lower()
        text = emoji.replace_emoji(text, replace='')
        text = re.sub(r'[^a-z\s]', '', text)
        text = text.strip()
        return text

In [4]:
data = pd.read_csv("train.csv", header=None)
data

Unnamed: 0,0,1,2
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


In [5]:
data.rename(columns={0:"polarity", 1:"title", 2:"review"}, inplace=True)
data

Unnamed: 0,polarity,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


In [6]:
data.dropna(subset=["title"], inplace=True)

In [7]:
data = data.astype({"title":str, "review":str})

In [8]:
data

Unnamed: 0,polarity,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


In [9]:
data["clean_title"] = data["title"].apply(preprocess_text)

In [10]:
data.drop(columns=["title"], inplace=True)

In [11]:
data

Unnamed: 0,polarity,review,clean_title
0,2,This sound track was beautiful! It paints the ...,stuning even for the nongamer
1,2,I'm reading a lot of reviews saying that this ...,the best soundtrack ever to anything
2,2,This soundtrack is my favorite music of all ti...,amazing
3,2,I truly like this soundtrack and I enjoy video...,excellent soundtrack
4,2,"If you've played the game, you know how divine...",remember pull your jaw off the floor after hea...
...,...,...,...
3599995,1,The high chair looks great when it first comes...,dont do it
3599996,1,I have used this highchair for 2 kids now and ...,looks nice low functionality
3599997,1,"We have a small house, and really wanted two o...",compact but hard to clean
3599998,1,not sure what this book is supposed to be. It ...,what is it saying


In [12]:
data.to_csv("clean_titles.csv")

------------------------------

In [2]:
data = pd.read_csv("clean_titles.csv")
data

Unnamed: 0.1,Unnamed: 0,polarity,review,clean_title
0,0,2,This sound track was beautiful! It paints the ...,stuning even for the nongamer
1,1,2,I'm reading a lot of reviews saying that this ...,the best soundtrack ever to anything
2,2,2,This soundtrack is my favorite music of all ti...,amazing
3,3,2,I truly like this soundtrack and I enjoy video...,excellent soundtrack
4,4,2,"If you've played the game, you know how divine...",remember pull your jaw off the floor after hea...
...,...,...,...,...
3599788,3599995,1,The high chair looks great when it first comes...,dont do it
3599789,3599996,1,I have used this highchair for 2 kids now and ...,looks nice low functionality
3599790,3599997,1,"We have a small house, and really wanted two o...",compact but hard to clean
3599791,3599998,1,not sure what this book is supposed to be. It ...,what is it saying


In [3]:
data.dropna(subset=["clean_title"], inplace=True)

In [4]:
data = data.astype({"clean_title":str})

In [5]:
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

In [8]:
def get_antonym(word):
    antonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.extend(lemma.antonyms())
    if antonyms:
        return antonyms[0].name()  # Take the first antonym
    return word  # Return original if no antonym is found

In [9]:
def get_word_polarity(word):
    return sia.polarity_scores(word)["compound"]

In [11]:
def invert_review_polarity(review, review_polarity, polarity_threshold=0.1):
    words = review.split()  # Split review into individual words
    inverted_words = []

    for word in words:
        word_polarity = get_word_polarity(word)

        # Check if the word's polarity aligns with the review's overall polarity
        if (review_polarity == 2 and word_polarity > polarity_threshold) or \
           (review_polarity == 1 and word_polarity < -polarity_threshold):
            # Replace word with its antonym if polarity matches and threshold is exceeded
            antonym = get_antonym(word)
            if antonym:
                inverted_words.append(antonym)
            else:
                inverted_words.append(word)  # Keep the original word if no antonym exists
        else:
            inverted_words.append(word)  # Keep neutral or opposite-polarity words unchanged

    return " ".join(inverted_words)  # Reconstruct the inverted review

In [12]:
data["inverse_title"] = data.apply(lambda row: invert_review_polarity(row["clean_title"], row["polarity"]), axis=1)
data

Unnamed: 0.1,Unnamed: 0,polarity,review,clean_title,inverse_title
0,0,2,This sound track was beautiful! It paints the ...,stuning even for the nongamer,stuning even for the nongamer
1,1,2,I'm reading a lot of reviews saying that this ...,the best soundtrack ever to anything,the worst soundtrack ever to anything
2,2,2,This soundtrack is my favorite music of all ti...,amazing,amazing
3,3,2,I truly like this soundtrack and I enjoy video...,excellent soundtrack,excellent soundtrack
4,4,2,"If you've played the game, you know how divine...",remember pull your jaw off the floor after hea...,remember pull your jaw off the floor after hea...
...,...,...,...,...,...
3599788,3599995,1,The high chair looks great when it first comes...,dont do it,dont do it
3599789,3599996,1,I have used this highchair for 2 kids now and ...,looks nice low functionality,looks nice high functionality
3599790,3599997,1,"We have a small house, and really wanted two o...",compact but hard to clean,compact but easy to clean
3599791,3599998,1,not sure what this book is supposed to be. It ...,what is it saying,what is it saying


In [13]:
data.drop(columns=["Unnamed: 0"], inplace=True)
data

Unnamed: 0,polarity,review,clean_title,inverse_title
0,2,This sound track was beautiful! It paints the ...,stuning even for the nongamer,stuning even for the nongamer
1,2,I'm reading a lot of reviews saying that this ...,the best soundtrack ever to anything,the worst soundtrack ever to anything
2,2,This soundtrack is my favorite music of all ti...,amazing,amazing
3,2,I truly like this soundtrack and I enjoy video...,excellent soundtrack,excellent soundtrack
4,2,"If you've played the game, you know how divine...",remember pull your jaw off the floor after hea...,remember pull your jaw off the floor after hea...
...,...,...,...,...
3599788,1,The high chair looks great when it first comes...,dont do it,dont do it
3599789,1,I have used this highchair for 2 kids now and ...,looks nice low functionality,looks nice high functionality
3599790,1,"We have a small house, and really wanted two o...",compact but hard to clean,compact but easy to clean
3599791,1,not sure what this book is supposed to be. It ...,what is it saying,what is it saying


In [14]:
data.drop(columns=["review"], inplace=True)
data

Unnamed: 0,polarity,clean_title,inverse_title
0,2,stuning even for the nongamer,stuning even for the nongamer
1,2,the best soundtrack ever to anything,the worst soundtrack ever to anything
2,2,amazing,amazing
3,2,excellent soundtrack,excellent soundtrack
4,2,remember pull your jaw off the floor after hea...,remember pull your jaw off the floor after hea...
...,...,...,...
3599788,1,dont do it,dont do it
3599789,1,looks nice low functionality,looks nice high functionality
3599790,1,compact but hard to clean,compact but easy to clean
3599791,1,what is it saying,what is it saying


In [15]:
data_relevant = data[~(data['clean_title'] == data['inverse_title'])]

In [16]:
data_relevant

Unnamed: 0,polarity,clean_title,inverse_title
1,2,the best soundtrack ever to anything,the worst soundtrack ever to anything
7,2,glorious story,inglorious story
10,1,the worst,the best
14,1,awful beyond belief,nice beyond belief
16,2,a romantic zen baseball comedy,a classicist zen baseball tragedy
...,...,...,...
3599772,2,a sweet scent,a sour scent
3599778,2,useful for everything boat related,useless for everything boat related
3599781,2,we love tyler,we hate tyler
3599789,1,looks nice low functionality,looks nice high functionality


In [17]:
data_relevant.drop(columns=["polarity"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant.drop(columns=["polarity"], inplace=True)


In [18]:
data_relevant

Unnamed: 0,clean_title,inverse_title
1,the best soundtrack ever to anything,the worst soundtrack ever to anything
7,glorious story,inglorious story
10,the worst,the best
14,awful beyond belief,nice beyond belief
16,a romantic zen baseball comedy,a classicist zen baseball tragedy
...,...,...
3599772,a sweet scent,a sour scent
3599778,useful for everything boat related,useless for everything boat related
3599781,we love tyler,we hate tyler
3599789,looks nice low functionality,looks nice high functionality


In [19]:
data_relevant.rename(columns={"clean_title":"input", "inverse_title":"output"})

Unnamed: 0,input,output
1,the best soundtrack ever to anything,the worst soundtrack ever to anything
7,glorious story,inglorious story
10,the worst,the best
14,awful beyond belief,nice beyond belief
16,a romantic zen baseball comedy,a classicist zen baseball tragedy
...,...,...
3599772,a sweet scent,a sour scent
3599778,useful for everything boat related,useless for everything boat related
3599781,we love tyler,we hate tyler
3599789,looks nice low functionality,looks nice high functionality


In [20]:
data_relevant.to_csv("training_data.csv")