In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhenglongluo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhenglongluo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhenglongluo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Data

In [5]:
wiki_movie_plots = pd.read_csv('wiki_movie_plots_deduped.csv')
wiki_movie_plots.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [6]:
all_genres = list(wiki_movie_plots['Genre'].unique())
print('>> Number of raw genres:', len(all_genres))
print('>> Number of raw rows:', wiki_movie_plots.shape[0])

>> Number of raw genres: 2265
>> Number of raw rows: 34886


# Preprocessing genres

In [7]:
ALLOWED_GENRES = {'action', 'adventure', 'animated', 'biographical', 'comedy', 'crime',
                  'drama', 'fantasy', 'history', 'horror', 'music', 'mystery',
                  'romance', 'scifi', 'sport', 'thriller', 'war', 'western', 'documentary'}

In [8]:
def preprocess_genres(genre_string):
    
    """
    Processes a single genre string by first splitting it on non-hyphen non-alphanumeric characters, applying
    predefined replacements to standardize genre names, splitting again by hyphens, and then filtering genres
    against a predefined list. Any genre not in the allowed list is classified as "other".

    Parameters:
    - genre_string (str): A string containing multiple genre descriptions which may include separators like commas or slashes.

    Returns:
    - str: A comma-separated sorted list of unique genre names standardized according to a predefined list of allowed genres.
            Unrecognized genres are labeled as "other".

    This function ensures that genre names are consistent and categorized, facilitating easier analysis and usage in
    data processing tasks.

    Example:
    genre_string = "sci-fi, romantic-comedy, epic-war, unknown style, crime-drama"
    result = preprocess_genres(genre_string)
    print(result)  # Output might include 'scifi', 'romance', 'war', 'other', etc., based on the input and settings.
    """
    
    replacements = {
        'biodrama': 'biographical-drama',
        'docudrama': 'documentary-drama',
        'melodrama': 'drama',
        'sci-fi': 'scifi',
        'science-fiction': 'scifi',
        'science fiction': 'scifi',
        'rom com': 'romance-comedy',
        'romcom': 'romance-comedy',
        'rom-com': 'romance-comedy',
        'romantic comedy': 'romance-comedy',
        'romantic': 'romance',
        'rom-comedy': 'romance-comedy',
        'bio': 'biographical',
        'biographic': 'biographical',
        'biography': 'biographical',
        'anime': 'animated',
        'animation': 'animated'
    }

    # Convert to lowercase and perform initial splitting
    genres = re.split(r'[^a-zA-Z0-9\-]+', genre_string.lower())

    processed_genres = set()
    for genre in genres:
        # Apply replacements
        for old, new in replacements.items():
            genre = re.sub(r'\b{}\b'.format(re.escape(old)), new, genre)
        
        # Split by hyphens and validate genres
        subgenres = genre.split('-')
        for subgenre in subgenres:
            clean_subgenre = subgenre.strip()
            if clean_subgenre in ALLOWED_GENRES:
                processed_genres.add(clean_subgenre)
            elif clean_subgenre:
                processed_genres.add('other')

    return ','.join(sorted(processed_genres))

In [9]:
# Apply the preprocessing
genres_processed = pd.Series(wiki_movie_plots['Genre']).apply(preprocess_genres)
print(genres_processed)

0                 other
1                 other
2                 other
3                 other
4                 other
              ...      
34881             other
34882            comedy
34883            comedy
34884    comedy,romance
34885           romance
Name: Genre, Length: 34886, dtype: object


In [10]:
# Create a 0-1 binary column for each genre in the list, and append back to the dataframe
wiki_movie_plots_processed = pd.concat([wiki_movie_plots, genres_processed.str.get_dummies(sep=',')], axis=1)
wiki_movie_plots_processed.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,action,adventure,...,horror,music,mystery,other,romance,scifi,sport,thriller,war,western
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",0,0,...,0,0,0,1,0,0,0,0,0,0
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",0,0,...,0,0,0,1,0,0,0,0,0,0
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",0,0,...,0,0,0,1,0,0,0,0,0,0
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,0,0,...,0,0,0,1,0,0,0,0,0,0


In [11]:
# Filter rows with at least 1 specific (non-other) genre, and remove the other column.

# Filtering logic: not(other==1 and row_sum(all genre columns)==1)

# rows_with_other_genre_only = genres[(genres['other'] == 1) & (genres.sum(axis=1)==1)]
wiki_movie_plots_processed = wiki_movie_plots_processed[~((wiki_movie_plots_processed['other'] == 1) & (wiki_movie_plots_processed.iloc[:,8:].sum(axis=1)==1))].drop('other', axis=1)

In [12]:
print('>> Number of raw rows:', wiki_movie_plots.shape[0])
print('>> Number of processed rows:', wiki_movie_plots_processed.shape[0])

print(25993/34886)

>> Number of raw rows: 34886
>> Number of processed rows: 25993
0.7450839878461274


In [13]:
# Save
wiki_movie_plots_processed.to_csv('data_processed_genres.csv', index=False)

In [14]:
wiki_movie_plots = pd.read_csv('data_processed_genres.csv')
wiki_movie_plots_head = wiki_movie_plots.head(35000)
wiki_movie_plots_head 


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,action,adventure,...,history,horror,music,mystery,romance,scifi,sport,thriller,war,western
0,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...,1,0,...,0,0,0,0,0,0,0,0,0,1
3,1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(19...,Boone's daughter befriends an Indian maiden as...,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25988,2013,Particle (film),Turkish,Erdem Tepegöz,"Jale Arıkan, Rüçhan Caliskur, Özay Fecht, Remz...",drama film,https://en.wikipedia.org/wiki/Particle_(film),"Zeynep lost her job at weaving factory, and he...",0,0,...,0,0,0,0,0,0,0,0,0,0
25989,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the...",0,0,...,0,0,0,0,0,0,0,0,0,0
25990,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i...",0,0,...,0,0,0,0,0,0,0,0,0,0
25991,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...,0,0,...,0,0,0,0,1,0,0,0,0,0


Second process

In [15]:
def preprocess_text(text):
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

wiki_movie_plots_head['Processed_Plot'] = wiki_movie_plots_head['Plot'].apply(preprocess_text)

print(wiki_movie_plots_head[['Plot', 'Processed_Plot']].head())

# Save the updated dataframe to a new CSV file
output_path = '/Users/zhenglongluo/Dropbox/MBS/Module 3/Text Analytics for Business/Project/Data/wiki_movie_plots_processed.csv'
wiki_movie_plots_head.to_csv(output_path, index=False)

# Verify the saved file
print(wiki_movie_plots_head[['Plot', 'Processed_Plot']].head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wiki_movie_plots_head['Processed_Plot'] = wiki_movie_plots_head['Plot'].apply(preprocess_text)


                                                Plot  \
0  The film opens with two bandits breaking into ...   
1  The film is about a family who move to the sub...   
2  The film features a train traveling through th...   
3  Boone's daughter befriends an Indian maiden as...   
4  Before heading out to a baseball game at a nea...   

                                      Processed_Plot  
0  film open two bandit breaking railroad telegra...  
1  film family move suburb hoping quiet life thin...  
2  film feature train traveling rockies hold crea...  
3  boone daughter befriends indian maiden boone c...  
4  heading baseball game nearby ballpark sport fa...  
                                                Plot  \
0  The film opens with two bandits breaking into ...   
1  The film is about a family who move to the sub...   
2  The film features a train traveling through th...   
3  Boone's daughter befriends an Indian maiden as...   
4  Before heading out to a baseball game at a nea... 

Convert Text to Numerical Features using TF-IDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

new_data = pd.read_csv('wiki_movie_plots_processed.csv')

# Filter for American movies
new_data = new_data[new_data['Origin/Ethnicity'] == 'American']

# Identify the top 10 most frequent genres
top_10_genres = new_data['Genre'].value_counts().nlargest(10).index

new_data = new_data[new_data['Genre'].isin(top_10_genres)]

output_path = '/Users/zhenglongluo/Dropbox/MBS/Module 3/Text Analytics for Business/Project/Data/wiki_movie_plots_short.csv'
new_data.to_csv(output_path, index=False)

X = new_data['Processed_Plot']
y = new_data['Genre']


tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=30)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report_result)

Accuracy: 0.5399750156152405
Classification Report:
                 precision    recall  f1-score   support

         action       0.85      0.07      0.12       166
      adventure       0.00      0.00      0.00       127
         comedy       0.63      0.55      0.59       957
          crime       0.00      0.00      0.00        82
    crime drama       0.33      0.01      0.02       114
          drama       0.46      0.86      0.60      1054
         horror       0.71      0.53      0.61       219
romantic comedy       0.00      0.00      0.00        93
       thriller       0.00      0.00      0.00       137
        western       0.85      0.68      0.75       253

       accuracy                           0.54      3202
      macro avg       0.38      0.27      0.27      3202
   weighted avg       0.51      0.54      0.48      3202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
