# Setup

In [8]:
import re 
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from functools import cache
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import download as nltk_download

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer


MOVIE_DATA_LOCATION = Path('../data/movie_data.csv')

if not nltk_download('stopwords', quiet=True):
    raise Exception("Unable to download stopwords")

nltk_stop = stopwords.words('english')

tqdm.pandas(leave=False)

warnings.simplefilter('ignore')

## Data Preprocessing

In [2]:
if not MOVIE_DATA_LOCATION.is_file():
    ''' load and preprocess data if it hasn't been done already '''
    
    labels = {
        'pos' : 1,
        'neg' : 0
    }

    review_directories = ['test', 'train']

    df = []
    base_path = Path('../data/aclImdb/')
    with tqdm(total=50_000, desc = 'Progress', leave = False, position = 0) as bar:

        for review_directory in review_directories:
            for sentiment_directory in labels.keys():
                
                path = base_path / review_directory / sentiment_directory
                sentiment = labels[sentiment_directory]
                
                for file in sorted(path.iterdir()):
                    
                    with open(file, 'r', encoding='utf-8') as infile:
                        review = infile.read()
                        
                    df.append(
                        {'review' : review, 'sentiment' : sentiment}
                    )
                    
                    bar.update()
                    
    df = pd.json_normalize(df)

    ''' 
        Shuffle our movie data so class labels aren't sorted, so we can directly 
        stream the data from disk later on and have an easier time splitting
        our data into test/train sets
    '''
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))

    ''' save preprocessed data locally '''
    df.to_csv(MOVIE_DATA_LOCATION, index=False, encoding='utf-8')

df = pd.read_csv(MOVIE_DATA_LOCATION)

assert df.shape == (50_000, 2) # sanity check to make sure our data is the proper size

# IMDB Movie Review Sentiment Analysis

## Data Cleaning

In [3]:
def preprocessor(text : str) -> str :
    
    def _replace_non_alphanumeric(text : str) -> str :
        non_alnum_pattern = '[\W]+'
        return re.sub(non_alnum_pattern, ' ', text)
    
    def _replace_html_tags(text : str) -> str :
        html_tag_pattern = r'<[^>]*>'
        return re.sub(html_tag_pattern, '', text)
    
    def _rip_emoticons(text : str) -> tuple[str, list[str]] :  
        ''' extract then remove emoticons, as a :D or :P without removal creates extra characters '''
        emoticon_pattern = '(?::|;|=)(?:-)?(?:\)|\(|d|p)'
        emoticons = re.findall(emoticon_pattern, text, flags=re.IGNORECASE) # save emoticons as those can be useful for sentiment
        cleaned_text = re.sub(emoticon_pattern, '', text)
        emoticons = ' '.join(emoticons).replace('-', '') # remove nose character from emoticons for consistency and join them together
        return cleaned_text, emoticons
        
    text = text.lower()
    text = _replace_html_tags(text) # remove html markup tags
    text, emoticons = _rip_emoticons(text) # must extract emoticons before removing non alnums
    text = _replace_non_alphanumeric(text) # remove non words from our text after making it lower case
    
    '''
        since we are using a unigram approach, word order doesn't matter, so we can simply postpend our emoticons to the string.
        In a 1+n gram model, word order WOULD be important
    '''
    return text + emoticons # cleaned string

porter = PorterStemmer()

@cache
def _stem(text : str) -> str :
    return porter.stem(text)

def porter_tokenizer(text : str) -> list[str] :
    return [_stem(x) for x in text.split()]

def filter_stopwords(text : list[str]) -> list[str] :
    return [x for x in text if x not in nltk_stop]

def tokenizer(text : str) -> list[str] :
    return text.split()

df['preprocessed_review'] = df['review'].progress_map(preprocessor)

                                                        

## Creating train/test data

In [4]:
X_train = df.loc[:25_000, 'preprocessed_review'].values
y_train = df.loc[:25_000, 'sentiment'].values
X_test = df.loc[25_000:, 'preprocessed_review'].values
y_test = df.loc[25_000:, 'sentiment'].values

## Creating bag of words model

In [5]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

small_param_grid = [
    {
        'vect__ngram_range' :[(1,1)],
        'vect__stop_words'   : [None],
        'vect__tokenizer'    : [tokenizer, porter_tokenizer],
        'clf__penalty'       : ['l2'],
        'clf__C'              : [1.0, 10.0]
    },
    {
        'vect__ngram_range' :[(1,1)],
        'vect__stop_words'   : [nltk_stop, None],
        'vect__tokenizer'    : [tokenizer],
        'vect__use_idf'      : [False],
        'vect__norm'         : [None],
        'clf__penalty'       : ['l2'],
        'clf__C'              : [1.0, 10.0]
    }
]

lr_tfidf = Pipeline(
    [
        ('vect', tfidf),
        ('clf', LogisticRegression(solver='liblinear'))
    ]
)

gs_lr_tfidf = GridSearchCV(
    lr_tfidf, 
    small_param_grid, 
    scoring='accuracy', 
    cv=5, 
    verbose=1, 
    n_jobs=1
)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [6]:
print(f'Best parameter set: ', gs_lr_tfidf.best_params_)
print('Cross-fold Validation Accuracy: ', gs_lr_tfidf.best_score_)
print('Test Accuracy : ', gs_lr_tfidf.best_estimator_.score(X_test, y_test))

Best parameter set:  {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x0000017CA9AB3D80>}
Cross-fold Validation Accuracy:  0.8972442391521696
Test Accuracy :  0.89872


## Alternative for large data sets using file streaming

In [7]:
def stream_docs(path : Path) -> tuple[str, int] :
    ''' Reads one document at a time '''
    with open(path, 'r', encoding='utf-8') as infile:
        next(infile)
        
        for line in infile :
            review, sentiment = line[:-3], int(line[-2])
            yield review, sentiment

def get_minibatch(doc_stream, size : int) -> list[tuple[str, int]] :
    reviews, sentiments = [], []
    
    try:
        for _ in range(size):
            review, sentiment = next(doc_stream)
            reviews.append(review)
            sentiments.append(sentiment)
    except StopIteration:
        return None, None
    
    return reviews, sentiments

vectorizer = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer= lambda x : filter_stopwords(tokenizer(x))
)

clf = SGDClassifier(loss='log_loss', random_state=1)

doc_stream = stream_docs(MOVIE_DATA_LOCATION)

classes = np.array([0, 1])

with tqdm(total=45, desc='Progress', leave=False, position=0) as bar:
    
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, 1000)
        if not X_train:
            break
        
        X_train = vectorizer.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        
        bar.update()
        
X_test, y_test = get_minibatch(doc_stream, 5000)
X_test = vectorizer.transform(X_test)

print(f'Accuracy :', clf.score(X_test, y_test))

clf.partial_fit(X_test, y_test)

                                                         

Accuracy : 0.8532


# Topic Modeling with Latent Dirichlet Allocation (LDA)

In [10]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)

X = count_vectorizer.fit_transform(df['preprocessed_review'].values)

lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')

X_topics = lda.fit_transform(X)

lda.components_.shape

(10, 5000)

In [19]:
n_top_words = 5
feature_names = count_vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    
    top_feature_names = [feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
    
    print(f'Topic {topic_idx + 1}: ')
    for idx, top_feature in enumerate(top_feature_names, 1):
        print(f'  {idx}. {top_feature}')

Topic 1: 
  1. worst
  2. comedy
  3. script
  4. awful
  5. minutes
Topic 2: 
  1. women
  2. school
  3. sex
  4. girl
  5. girls
Topic 3: 
  1. music
  2. song
  3. songs
  4. musical
  5. dvd
Topic 4: 
  1. family
  2. father
  3. wife
  4. mother
  5. woman
Topic 5: 
  1. book
  2. read
  3. version
  4. original
  5. different
Topic 6: 
  1. series
  2. episode
  3. episodes
  4. tv
  5. season
Topic 7: 
  1. western
  2. john
  3. robert
  4. murder
  5. tom
Topic 8: 
  1. role
  2. performance
  3. actor
  4. performances
  5. john
Topic 9: 
  1. horror
  2. budget
  3. effects
  4. killer
  5. gore
Topic 10: 
  1. action
  2. game
  3. fight
  4. war
  5. hero
