In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load data
def load_data(path):
    # read data from .txt file and assign names to columns
    data = pd.read_csv(path, sep=":::", names=["ID", "Title", "Genre", "Plot"], engine="python")
    # We aren't gonna need ID, so drop this column
    data.drop(columns=["ID"], inplace=True)
    return data

In [4]:
df = load_data("train_data.txt")

In [5]:
df.head()


Unnamed: 0,Title,Genre,Plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [6]:
vectorizer = TfidfVectorizer()

In [7]:
vectorizer.get_stop_words()

In [8]:
# Let's try seeing the tokens fetting only few plots 

plot = df['Plot'][0:2] # get first plot
type(plot)

tokens = vectorizer.fit_transform(plot)

In [9]:

# Let's see the stopwords vailable in nltk.corpus
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
def clean_plot(plot):
    # first lowercase all words
    plot = plot.lower()
    # omit numbers, symbols, and all, except alphabets
    plot = re.sub(r'[^a-zA-Z\s]+', '', plot).strip()
    # remove the stopwords
    stop_words = set(stopwords.words('english'))  # You need to define stopwords
    plot = [word for word in plot.split() if word.lower() not in stop_words]
    plot = ' '.join(plot)
    # return clean plot
    return plot
    

In [11]:

# Clean all the plots and the clean plot to the data frame
df['Clean Plot'] = df['Plot'].apply(clean_plot)
df.head()

Unnamed: 0,Title,Genre,Plot,Clean Plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening conversation doctor parents yearold ...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,bus empties students field trip museum natural...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,help unemployed father make ends meet edith tw...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,films title refers unrecovered bodies ground z...


In [12]:

# Let's again start with TF-IDF vectorizer
vectorizer = TfidfVectorizer()

In [13]:

# Learn vocabulary and idf, return document-term matrix
X_train = vectorizer.fit_transform(df['Clean Plot']) 

In [23]:
y_train = df['Genre']
print(y_train.shape)
print(y_train)

(54214,)
0               drama 
1            thriller 
2               adult 
3               drama 
4               drama 
             ...      
54209          comedy 
54210          horror 
54211     documentary 
54212          comedy 
54213         history 
Name: Genre, Length: 54214, dtype: object


In [16]:
test_data = load_data("test_data_solution.txt")

In [17]:
test_data.head()

Unnamed: 0,Title,Genre,Plot
0,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [18]:
test_data['Clean Plot'] = test_data['Plot'].apply(clean_plot)
test_data.head()

Unnamed: 0,Title,Genre,Plot,Clean Plot
0,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar...",lr brane loves life car apartment job especial...
1,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch...",spain march quico naughty child three belongin...
2,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...,one year life albin family shepherds north tra...
3,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi...",father died hasnt spoken brother years serious...
4,Er nu zhai (1955),drama,Before he was known internationally as a mart...,known internationally martial arts superstar b...


In [19]:

# Transform the test data
X_test = vectorizer.transform(test_data['Clean Plot'])
print(X_test.shape)

(54200, 146144)


In [20]:

y_test = test_data['Genre']

In [21]:
model = MultinomialNB()

In [24]:
model.fit(X_train, y_train)

In [26]:
# Let's test the model with test data 
y_pred = model.predict(X_test)

In [27]:

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy = {(accuracy*100):.2f}%")

Accuracy = 44.36%


In [28]:
print(classification_report(y_test, y_pred,  zero_division=1))

               precision    recall  f1-score   support

      action        1.00      0.00      0.00      1314
       adult        1.00      0.00      0.00       590
   adventure        1.00      0.00      0.00       775
   animation        1.00      0.00      0.00       498
   biography        1.00      0.00      0.00       264
      comedy        0.73      0.04      0.07      7446
       crime        1.00      0.00      0.00       505
 documentary        0.53      0.90      0.67     13096
       drama        0.38      0.88      0.53     13612
      family        1.00      0.00      0.00       783
     fantasy        1.00      0.00      0.00       322
   game-show        1.00      0.00      0.00       193
     history        1.00      0.00      0.00       243
      horror        1.00      0.00      0.00      2204
       music        1.00      0.00      0.00       731
     musical        1.00      0.00      0.00       276
     mystery        1.00      0.00      0.00       318
        n

# Conclusion:
Multinomial Naive Bayes predicts the Genre of movie with 44.36% accuracy.

When I ommitted the code which cleans the stop words, teh accuracy was found to be 44.27% which is not a very large chage.