In [2]:
import pandas as pd
import unidecode as un
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/leo/nltk_data...
[nltk_data] Downloading package stopwords to /home/leo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
df = pd.read_csv('IMDB_Dataset.csv')
df_old = df

### Preprocessing
We also need to make sure that the sentiments are only positive or negative (in case of faults in data)

We also need to remove HTML tags, due to the reviews being scraped from IMDB

We start by removing special letters etc. with unicode. This will change é to e á to a etc. 

After this we remove all special characters and make the comments clean. 

Removing stopwords

Lemmatizing the data

Finally we also remove unnecessary spaces

In [4]:
# Remove if sentiment is not positive or negative
mask = df['sentiment'].isin(['positive', 'negative'])
df = df[mask]

# Lablenize sentiment
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Remove html tags
df["review"] = df["review"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())

# Fix decode and allowed_chars
allowed_chars = " abcdefghijklmnopqrstuvwxyz0123456789"
df["review"] = df["review"].apply(lambda x: un.unidecode(x).lower())
df["review"] = df["review"].apply(lambda x: ''.join([i for i in x if i in allowed_chars]))

# Remove stopwords
stop = stopwords.words('english')
df["review"] = df["review"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Lemmatize
lemmatizer = WordNetLemmatizer()
df["review"] = df["review"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))


# Strip unnecessary spaces
df["review"] = df["review"].apply(lambda x: x.strip())


  df["review"] = df["review"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())


### Split data
The data will be split into training and testing data as well as labels and data

In [20]:
train_size = 40000
train_data, train_labels = df['review'][:train_size], df['sentiment'][:train_size]
test_data, test_labels = list(df['review'][train_size:]), list(df['sentiment'][train_size:])

### Ngram
Create ngram representation of each word in the review

In [6]:
# ngram
c = CountVectorizer(min_df=0.0, max_df=1.0, binary=False, ngram_range=(1,3))
cv_train_reviews = c.fit_transform(train_data)
cv_test_reviews = c.transform(test_data)

print(f"{cv_train_reviews.shape = }")
print(f"{cv_test_reviews.shape = }")

cv_train_reviews.shape = (40000, 7144506)
cv_test_reviews.shape = (10000, 7144506)


### Training the model
We start by using a regression model, this is imported from sklear, and does not need very much work or knowledge.


In [8]:
lr = LogisticRegression(penalty="l2", max_iter=500, C=1, random_state=42)
lr_bow = lr.fit(cv_train_reviews, train_labels)

In [34]:
lr_Bow_predict = lr.predict(cv_test_reviews)

In [33]:
def accuracy_score(answer:list , predicted:list):
    """
    Compare each of the values in answer with predicted.
    Returns the accuracy
    """
    correct = 0
    for i in range(len(predicted)):
        if answer[i] == predicted[i]:
            correct += 1
    return correct/len(predicted)


In [35]:
lr_bow_score = accuracy_score(test_labels, list(lr_Bow_predict))
print(f"Score: {lr_bow_score}")

Score: 0.8998
