In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/Meiji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Meiji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Meiji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Data

In [None]:
wiki_movie_plots = pd.read_csv('wiki_movie_plots_deduped.csv')
wiki_movie_plots.head()

In [None]:
all_genres = list(wiki_movie_plots['Genre'].unique())
print('>> Number of raw genres:', len(all_genres))
print('>> Number of raw rows:', wiki_movie_plots.shape[0])

# Preprocessing genres

In [None]:
ALLOWED_GENRES = {'action', 'adventure', 'animated', 'biographical', 'comedy', 'crime',
                  'drama', 'fantasy', 'history', 'horror', 'music', 'mystery',
                  'romance', 'scifi', 'sport', 'thriller', 'war', 'western', 'documentary'}

In [None]:
def preprocess_genres(genre_string):
    
    """
    Processes a single genre string by first splitting it on non-hyphen non-alphanumeric characters, applying
    predefined replacements to standardize genre names, splitting again by hyphens, and then filtering genres
    against a predefined list. Any genre not in the allowed list is classified as "other".

    Parameters:
    - genre_string (str): A string containing multiple genre descriptions which may include separators like commas or slashes.

    Returns:
    - str: A comma-separated sorted list of unique genre names standardized according to a predefined list of allowed genres.
            Unrecognized genres are labeled as "other".

    This function ensures that genre names are consistent and categorized, facilitating easier analysis and usage in
    data processing tasks.

    Example:
    genre_string = "sci-fi, romantic-comedy, epic-war, unknown style, crime-drama"
    result = preprocess_genres(genre_string)
    print(result)  # Output might include 'scifi', 'romance', 'war', 'other', etc., based on the input and settings.
    """
    
    replacements = {
        'biodrama': 'biographical-drama',
        'docudrama': 'documentary-drama',
        'melodrama': 'drama',
        'sci-fi': 'scifi',
        'science-fiction': 'scifi',
        'science fiction': 'scifi',
        'rom com': 'romance-comedy',
        'romcom': 'romance-comedy',
        'rom-com': 'romance-comedy',
        'romantic comedy': 'romance-comedy',
        'romantic': 'romance',
        'rom-comedy': 'romance-comedy',
        'bio': 'biographical',
        'biographic': 'biographical',
        'biography': 'biographical',
        'anime': 'animated',
        'animation': 'animated'
    }

    # Convert to lowercase and perform initial splitting
    genres = re.split(r'[^a-zA-Z0-9\-]+', genre_string.lower())

    processed_genres = set()
    for genre in genres:
        # Apply replacements
        for old, new in replacements.items():
            genre = re.sub(r'\b{}\b'.format(re.escape(old)), new, genre)
        
        # Split by hyphens and validate genres
        subgenres = genre.split('-')
        for subgenre in subgenres:
            clean_subgenre = subgenre.strip()
            if clean_subgenre in ALLOWED_GENRES:
                processed_genres.add(clean_subgenre)
            elif clean_subgenre:
                processed_genres.add('other')

    return ','.join(sorted(processed_genres))

In [None]:
# Apply the preprocessing
genres_processed = pd.Series(wiki_movie_plots['Genre']).apply(preprocess_genres)
print(genres_processed)

# Restructure Data

In [None]:
# Create a 0-1 binary column for each genre in the list, and append back to the dataframe
wiki_movie_plots_processed = pd.concat([wiki_movie_plots, genres_processed.str.get_dummies(sep=',')], axis=1)
wiki_movie_plots_processed.head()

In [None]:
# Filter rows with at least 1 specific (non-other) genre, and remove the other column.
# Filtering logic: not(other==1 and row_sum(all genre columns)==1)

# rows_with_other_genre_only = genres[(genres['other'] == 1) & (genres.sum(axis=1)==1)]
wiki_movie_plots_processed = wiki_movie_plots_processed[~((wiki_movie_plots_processed['other'] == 1) & (wiki_movie_plots_processed.iloc[:,8:].sum(axis=1)==1))].drop('other', axis=1)

In [None]:
print('>> Number of raw rows:', wiki_movie_plots.shape[0])
print('>> Number of processed rows:', wiki_movie_plots_processed.shape[0])
print('>> Processed rows %:', wiki_movie_plots_processed.shape[0]/wiki_movie_plots.shape[0])

## Save as 'data_processed_genres.csv'

In [11]:
wiki_movie_plots_processed.to_csv('data_processed_genres.csv', index=False)

# Check multi-labels

In [13]:
# Number of n-label rows
label_count = wiki_movie_plots_processed.iloc[:,8:].sum(axis=1).value_counts()
label_count

1    20602
2     4694
3      536
4      111
0       31
5       17
6        1
7        1
Name: count, dtype: int64

In [None]:
print('>> % multi-label:', 1-((label_count[1]+label_count[0])/sum(label_count)))

# Preprocessing plots

In [None]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Load stopwords
stop_words = set(stopwords.words('english'))

def preprocess_plot(plot):
    if isinstance(plot, float):
        # Handle NaN or non-string plot descriptions
        return ""
    
    # Tokenize the plot
    tokens = word_tokenize(plot)
    
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into a single string
    processed_plot = ' '.join(tokens)
    
    return processed_plot

movie_data = wiki_movie_plots_processed

# Apply preprocessing to the 'Plot' column
movie_data['Processed_Plot'] = movie_data['Plot'].apply(preprocess_plot)

# Display the first few rows of the dataset to confirm preprocessing
print(movie_data[['Plot', 'Processed_Plot']].head())

## Save as 'preprocessed_movie_plots.xlsx'

In [None]:
# Save the preprocessed data to a new file
output_file_path = 'preprocessed_movie_plots.xlsx'
movie_data.to_excel(output_file_path, index=False)
print(f"Preprocessed data saved to {output_file_path}")