# Case 1: Predicting with texts


# Lexicon-based sentiment analysis


## Data importing and cleaning


In [2]:
import os
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import warnings
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from afinn import Afinn
# pip install afinn


warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
pd.set_option('display.max_colwidth', 50)
print("Current working directory: ", os.getcwd())
wd = os.getcwd()
# If current working directory is incorrect, use os.chdir("path")
# Edit the name of the data file from "Movie Review" into "MovieReview"
# and should be contained in the current working directory

Current working directory:  /Users/swislar/Desktop/IT1244/Project


In [42]:
def filterWords(TEXT):
    '''
    TEXT: String of the sentence
    RETURN: List of clean words
    '''
    # Remove HTML Tags and stop words
    stop_words = set(stopwords.words("english"))
    clean = BeautifulSoup(TEXT, "html.parser").get_text()
    words = [word.lower() for word in word_tokenize(
        clean) if word.isalnum() and word.lower() not in stop_words]
    return words


def loadAllData(WD=os.getcwd()):
    '''
    WD: Current working directory containing datasets- os.getcwd() 
    RETURN: Pandas Dataframe
    '''
    # Initialise the dataframe
    columns = ["id", "text"]
    rows = []
    posWD = os.path.join(WD, "MovieReview/data/pos")
    negWD = os.path.join(WD, "MovieReview/data/neg")
    # Read positively classified text
    for filename in tqdm(os.listdir(posWD), desc='Processing Positive Reviews'):
        with open(os.path.join(posWD, filename), "r") as file:
            posText = filterWords(file.read())
        # Strip .txt from filename and split ID and Rating as an integer
        posDetails = filename.rstrip(".txt").split("_")
        posID = int(posDetails[0])
        # Add data into the Dataframe
        row = {"id": posID, "text": posText}
        rows.append(row)
    # Repeat for negative text
    for filename in tqdm(os.listdir(negWD), desc='Processing Negative Reviews'):
        with open(os.path.join(negWD, filename), "r") as file:
            negText = filterWords(file.read())
        negDetails = filename.rstrip(".txt").split("_")
        # Offset id by 25000 to prevent overlaps
        negID = int(negDetails[0]) + 25000
        row = {"id": negID, "text": negText}
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)

In [43]:
dataframe = loadAllData()

Processing Positive Reviews:   0%|          | 0/25000 [00:00<?, ?it/s]

Processing Positive Reviews: 100%|██████████| 25000/25000 [00:26<00:00, 938.87it/s]
Processing Negative Reviews: 100%|██████████| 25000/25000 [00:27<00:00, 919.86it/s]

<class 'list'>





## Main


In [48]:
# Lexicon-based sentiment analysis
afinn = Afinn()

# Takes approximately 1min to load
dataframe["sentiment_score"] = dataframe["text"].apply(
    lambda row: sum(afinn.score(word) for word in row))

# Classify sentiment based on the total score
dataframe["sentiment"] = dataframe["sentiment_score"].apply(
    lambda res: 1 if (res > 0) else 0)

0    [watched, return, lonesome, dove, good, seen, ...
1    [movie, looked, like, classic, cheesy, 80s, sl...
2    [jay, chou, plays, orphan, raised, kung, fu, s...
3    [ooverall, movie, fairly, good, good, action, ...
4    [movie, fun, watch, liked, dave, kevin, klein,...
Name: text, dtype: object
0    1
1    0
2    1
3    1
4    0
Name: sentiment, dtype: int64


In [68]:
dataframe["actual_sentiment"] = dataframe["id"].apply(
    lambda id: 1 if id <= 25000 else 0)
dataframe.head(5)

Unnamed: 0,id,text,sentiment_score,sentiment,actual_sentiment
0,20935,"[watched, return, lonesome, dove, good, seen, ...",10.0,1,1
1,12390,"[movie, looked, like, classic, cheesy, 80s, sl...",-19.0,0,1
2,9820,"[jay, chou, plays, orphan, raised, kung, fu, s...",30.0,1,1
3,883,"[ooverall, movie, fairly, good, good, action, ...",2.0,1,1
4,9063,"[movie, fun, watch, liked, dave, kevin, klein,...",0.0,0,1


In [67]:
acc = sum(dataframe["sentiment"] ==
          dataframe["actual_sentiment"])/dataframe.shape[0]
print(f'The accuracy of using the Lexicon-based sentiment analysis is {acc}')

The accuracy of using the Lexicon-based sentiment analysis is 0.70176


# Kmeans


## Data importing and cleaning


In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
pd.set_option('display.max_colwidth', 50)
print("Current working directory: ", os.getcwd())
wd = os.getcwd()
# If current working directory is incorrect, use os.chdir("path")
# Edit the name of the data file from "Movie Review" into "MovieReview"
# and should be contained in the current working directory

Current working directory:  /Users/swislar/Desktop/IT1244/Project


In [2]:
def filterWords(TEXT):
    '''
    TEXT: String of the sentence
    RETURN: String of clean sentence
    '''
    # Remove HTML Tags and stop words
    clean = BeautifulSoup(TEXT, "html.parser").get_text()
    return str(clean)


def loadAllData(WD=os.getcwd()):
    '''
    WD: Current working directory containing datasets- os.getcwd() 
    RETURN: Pandas Dataframe
    '''
    # Initialise the dataframe
    columns = ["id", "text"]
    rows = []
    posWD = os.path.join(WD, "MovieReview/data/pos")
    negWD = os.path.join(WD, "MovieReview/data/neg")
    # Read positively classified text
    for filename in tqdm(os.listdir(posWD), desc='Processing Positive Reviews'):
        with open(os.path.join(posWD, filename), "r") as file:
            posText = filterWords(file.read())
        # Strip .txt from filename and split ID and Rating as an integer
        posDetails = filename.rstrip(".txt").split("_")
        posID = int(posDetails[0])
        # Add data into the Dataframe
        row = {"id": posID, "text": posText}
        rows.append(row)
    # Repeat for negative text
    for filename in tqdm(os.listdir(negWD), desc='Processing Negative Reviews'):
        with open(os.path.join(negWD, filename), "r") as file:
            negText = filterWords(file.read())
        negDetails = filename.rstrip(".txt").split("_")
        # Offset id by 25000 to prevent overlaps
        negID = int(negDetails[0]) + 25000
        row = {"id": negID, "text": negText}
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)

In [3]:
dataframe = loadAllData()

Processing Positive Reviews:   0%|          | 0/25000 [00:00<?, ?it/s]

Processing Positive Reviews: 100%|██████████| 25000/25000 [00:02<00:00, 11137.04it/s]
Processing Negative Reviews: 100%|██████████| 25000/25000 [00:02<00:00, 10630.62it/s]


# References


1. Unsupervised Sentiment Analysis: Extracting Insights From Unlabeled Data </br>
   <a>https://www.zonkafeedback.com/blog/unsupervised-sentiment-analysis#:~:text=Unsupervised%20sentiment%20analysis%20helps%20you,reviews)%20to%20train%20a%20model.</a>
2. Lexicon-based sentiment analysis: What it is & how to conduct one </br>
   https://www.knime.com/blog/lexicon-based-sentiment-analysis
