# Problam Statement

# Create a machine learning model that can predict the genre of a movie based on its plot summary or other textual information. You can use techniques like TF-IDF or word embeddings with classifiers such as Naive Bayes, Logistic Regression, or Support VectorMachines.

In [1]:
# Lets start the code by importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

In [2]:
# read the files

In [3]:
genre_list = [ 'action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war', 'western' ]

In [4]:
genre_list

['action',
 'adult',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'family',
 'fantasy',
 'game-show',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'news',
 'reality-tv',
 'romance',
 'sci-fi',
 'short',
 'sport',
 'talk-show',
 'thriller',
 'war',
 'western']

In [5]:
# Define a fallback genre for movies which the model finds very hard to predict
fallback_genre = 'Unknown'

In [6]:
fallback_genre

'Unknown'

In [7]:
# Load the Training dataset from train_data.txt

In [8]:
train_data = pd.read_csv('train_data.txt', sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')

In [9]:
train_data

Unnamed: 0,SerialNumber,MOVIE_NAME,GENRE,MOVIE_PLOT
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   SerialNumber  54214 non-null  int64 
 1   MOVIE_NAME    54214 non-null  object
 2   GENRE         54214 non-null  object
 3   MOVIE_PLOT    54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [11]:
train_data.isnull()


Unnamed: 0,SerialNumber,MOVIE_NAME,GENRE,MOVIE_PLOT
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
54209,False,False,False,False
54210,False,False,False,False
54211,False,False,False,False
54212,False,False,False,False


In [12]:
train_data.shape

(54214, 4)

In [13]:
train_data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
54209    False
54210    False
54211    False
54212    False
54213    False
Length: 54214, dtype: bool

In [14]:
train_data.iloc[:].GENRE


0               drama 
1            thriller 
2               adult 
3               drama 
4               drama 
             ...      
54209          comedy 
54210          horror 
54211     documentary 
54212          comedy 
54213         history 
Name: GENRE, Length: 54214, dtype: object

In [15]:
# Data preprocessing for training data

In [16]:
X_train = train_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())
genre_labels = [genre.split(', ') for genre in train_data['GENRE']]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

In [17]:
X_train

0         listening in to a conversation between his do...
1         a brother and sister with a past incestuous r...
2         as the bus empties the students for their fie...
3         to help their unemployed father make ends mee...
4         the film's title refers not only to the un-re...
                               ...                        
54209     this short-lived nbc live sitcom centered on ...
54210     the next generation of exploitation. the sist...
54211     ze bestaan echt, is a stand-up comedy about g...
54212     walter and vivian live in the country and hav...
54213     on labor day weekend, 1935, the most intense ...
Name: MOVIE_PLOT, Length: 54214, dtype: object

In [18]:
genre_labels

[[' drama '],
 [' thriller '],
 [' adult '],
 [' drama '],
 [' drama '],
 [' documentary '],
 [' comedy '],
 [' crime '],
 [' reality-tv '],
 [' horror '],
 [' documentary '],
 [' drama '],
 [' documentary '],
 [' thriller '],
 [' drama '],
 [' drama '],
 [' comedy '],
 [' documentary '],
 [' sport '],
 [' animation '],
 [' drama '],
 [' comedy '],
 [' comedy '],
 [' drama '],
 [' action '],
 [' fantasy '],
 [' short '],
 [' sci-fi '],
 [' thriller '],
 [' documentary '],
 [' horror '],
 [' documentary '],
 [' action '],
 [' documentary '],
 [' music '],
 [' comedy '],
 [' drama '],
 [' drama '],
 [' comedy '],
 [' comedy '],
 [' documentary '],
 [' comedy '],
 [' short '],
 [' drama '],
 [' adventure '],
 [' thriller '],
 [' documentary '],
 [' short '],
 [' drama '],
 [' thriller '],
 [' documentary '],
 [' documentary '],
 [' comedy '],
 [' documentary '],
 [' talk-show '],
 [' horror '],
 [' documentary '],
 [' horror '],
 [' comedy '],
 [' documentary '],
 [' drama '],
 [' comedy 

In [19]:
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features


In [22]:
tfidf_vectorizer

In [23]:
# Fit and transform the training data with progress bar
with tqdm(total=50, desc="Vectorizing Training Data") as pbar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    pbar.update(50)

Vectorizing Training Data: 100%|██████████| 50/50 [00:06<00:00,  7.36it/s]


In [24]:
X_train

0         listening in to a conversation between his do...
1         a brother and sister with a past incestuous r...
2         as the bus empties the students for their fie...
3         to help their unemployed father make ends mee...
4         the film's title refers not only to the un-re...
                               ...                        
54209     this short-lived nbc live sitcom centered on ...
54210     the next generation of exploitation. the sist...
54211     ze bestaan echt, is a stand-up comedy about g...
54212     walter and vivian live in the country and hav...
54213     on labor day weekend, 1935, the most intense ...
Name: MOVIE_PLOT, Length: 54214, dtype: object

In [25]:
X_train_tfidf

<54214x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3073173 stored elements in Compressed Sparse Row format>

In [26]:
# Train a MultiOutput Naive Bayes classifier using the training data
with tqdm(total=50, desc="Training Model") as pbar:
    naive_bayes = MultinomialNB()
    multi_output_classifier = MultiOutputClassifier(naive_bayes)
    multi_output_classifier.fit(X_train_tfidf, y_train)
    pbar.update(50)


Training Model: 100%|██████████| 50/50 [00:00<00:00, 54.60it/s]


In [27]:
multi_output_classifier

In [28]:
# Load your test dataset from test_data.txt

In [29]:
test_data = pd.read_csv('test_data.txt', sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')

In [30]:
test_data 

Unnamed: 0,SerialNumber,MOVIE_NAME,GENRE,MOVIE_PLOT
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...",
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi...",
4,5,Er nu zhai (1955),Before he was known internationally as a mart...,
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da...",
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...,
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...,
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard...",


In [31]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SerialNumber  54200 non-null  int64  
 1   MOVIE_NAME    54200 non-null  object 
 2   GENRE         54200 non-null  object 
 3   MOVIE_PLOT    0 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.7+ MB


In [32]:
test_data.isnull()

Unnamed: 0,SerialNumber,MOVIE_NAME,GENRE,MOVIE_PLOT
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
...,...,...,...,...
54195,False,False,False,True
54196,False,False,False,True
54197,False,False,False,True
54198,False,False,False,True


In [33]:
test_data.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
54195    False
54196    False
54197    False
54198    False
54199    False
Length: 54200, dtype: bool

In [34]:
# Data preprocessing for test data
X_test = test_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())


In [35]:
X_test

0        nan
1        nan
2        nan
3        nan
4        nan
        ... 
54195    nan
54196    nan
54197    nan
54198    nan
54199    nan
Name: MOVIE_PLOT, Length: 54200, dtype: object

In [36]:
# Transform the test data with progress bar
with tqdm(total=50, desc="Vectorizing Test Data") as pbar:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    pbar.update(50)

Vectorizing Test Data: 100%|██████████| 50/50 [00:00<00:00, 271.79it/s]


In [37]:
tfidf_vectorizer

In [38]:
# Predict genres on the test data
with tqdm(total=50, desc="Predicting on Test Data") as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(50)


Predicting on Test Data: 100%|██████████| 50/50 [00:00<00:00, 219.39it/s]


In [39]:
multi_output_classifier

In [40]:
X_test_tfidf

<54200x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [41]:
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [42]:
# Create a DataFrame for test data with movie names and predicted genres
test_movie_names = test_data['MOVIE_NAME']
predicted_genres = mlb.inverse_transform(y_pred)
test_results = pd.DataFrame({'MOVIE_NAME': test_movie_names, 'PREDICTED_GENRES': predicted_genres})


In [43]:
test_movie_names

0                   Edgar's Lunch (1998) 
1               La guerra de papá (1977) 
2            Off the Beaten Track (2010) 
3                 Meu Amigo Hindu (2015) 
4                      Er nu zhai (1955) 
                       ...               
54195     "Tales of Light & Dark" (2013) 
54196        Der letzte Mohikaner (1965) 
54197                Oliver Twink (2007) 
54198                  Slipstream (1973) 
54199          Curitiba Zero Grau (2010) 
Name: MOVIE_NAME, Length: 54200, dtype: object

In [44]:
test_results

Unnamed: 0,MOVIE_NAME,PREDICTED_GENRES
0,Edgar's Lunch (1998),()
1,La guerra de papá (1977),()
2,Off the Beaten Track (2010),()
3,Meu Amigo Hindu (2015),()
4,Er nu zhai (1955),()
...,...,...
54195,"""Tales of Light & Dark"" (2013)",()
54196,Der letzte Mohikaner (1965),()
54197,Oliver Twink (2007),()
54198,Slipstream (1973),()


In [45]:
# Replace empty unpredicted genres with the fallback genre
test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(lambda genres: [fallback_genre] if len(genres) == 0 else genres)


In [46]:
test_results

Unnamed: 0,MOVIE_NAME,PREDICTED_GENRES
0,Edgar's Lunch (1998),[Unknown]
1,La guerra de papá (1977),[Unknown]
2,Off the Beaten Track (2010),[Unknown]
3,Meu Amigo Hindu (2015),[Unknown]
4,Er nu zhai (1955),[Unknown]
...,...,...
54195,"""Tales of Light & Dark"" (2013)",[Unknown]
54196,Der letzte Mohikaner (1965),[Unknown]
54197,Oliver Twink (2007),[Unknown]
54198,Slipstream (1973),[Unknown]


In [47]:
# Write the results to an output text file with proper formatting
with open("model_evaluation.txt", "w", encoding="utf-8") as output_file:
    for _, row in test_results.iterrows():
        movie_name = row['MOVIE_NAME']
        genre_str = ', '.join(row['PREDICTED_GENRES'])
        output_file.write(f"{movie_name} ::: {genre_str}\n")


In [48]:
movie_name

' Curitiba Zero Grau (2010) '

In [49]:
# Calculate evaluation metrics using training labels (as a proxy)
y_train_pred = multi_output_classifier.predict(X_train_tfidf)

In [50]:
y_train_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [51]:
# Calculate evaluation metrics
from sklearn.metrics import confusion_matrix
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, average='micro')
recall = recall_score(y_train, y_train_pred, average='micro')
f1 = f1_score(y_train, y_train_pred, average='micro')


In [52]:
accuracy

0.2773453351532814

In [53]:
precision

0.7218294823741519

In [54]:
recall

0.28062861991367544

In [55]:
f1

0.4041386088642733

In [56]:
# Append the evaluation metrics to the output file
with open("model_evaluation.txt", "a", encoding="utf-8") as output_file:
    output_file.write("\n\nModel Evaluation Metrics:\n")
    output_file.write(f"Accuracy: {accuracy * 100:.2f}%\n")
    output_file.write(f"Precision: {precision:.2f}\n")
    output_file.write(f"Recall: {recall:.2f}\n")
    output_file.write(f"F1-score: {f1:.2f}\n")

In [57]:
print("Model evaluation results and metrics have been saved to 'model_evaluation.txt'.")

Model evaluation results and metrics have been saved to 'model_evaluation.txt'.


In [58]:
# Note in this movie genre i obsrve that a large the data,the more accuracy of the model increses.