In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd

# Replace the file ID with your own
file_id = '1GIc7WrNMMSwJ4T-INWztqZu9QQt06_6q'
link = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(link)

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from spacy.lang.de.stop_words import STOP_WORDS



# Load the CSV dataset into a pandas DataFrame


# Preprocess the text data

tfidf = TfidfVectorizer(stop_words=list(STOP_WORDS))
X = df['Plot']
y = df['Genre']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Fit a TF-IDF vectorizer on the training set and transform both the training and testing sets
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train a logistic regression model on the TF-IDF transformed training set
model = LogisticRegression(max_iter = 5000)
model.fit(X_train_tfidf, y_train)

# Evaluate the accuracy of the model on the TF-IDF transformed testing set
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Use the trained model to predict the genre of new movies
new_movie = ["A man discovers that his wife is secretly a spy and gets caught up in a dangerous international conspiracy."]
new_movie_tfidf = tfidf.transform(new_movie)
predicted_genre = model.predict(new_movie_tfidf)
print("Predicted genre:", predicted_genre)


Accuracy: 0.569883290634785
Predicted genre: ['comedy']


In [12]:
from sklearn.metrics import confusion_matrix

# Get the predicted labels for the test set
y_pred = model.predict(X_test_tfidf)

# Create the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[  56    2   27    0  122    3    0    3    4    2    1]
 [   3   16   19    0   61    1    0    0    2    0    3]
 [   4    0  595    0  257    8    0    6    3    1    2]
 [   7    0   28    9   65    2    0    0    0    2    1]
 [   9    1  156    2 1003    5    0    5    4    5    3]
 [   1    0   24    0   63  136    0    0    6    2    1]
 [   0    0   51    0   36    1    2    1    0    0    2]
 [   2    0   33    0  128    0    0   22    0    0    0]
 [   3    0   20    0   34   20    0    0   51    0    0]
 [   5    0   38    1  124   11    0    1    0   13    0]
 [   1    0   17    0   55    1    0    0    0    0   99]]


In [13]:
from sklearn.metrics import classification_report

# Get the predicted labels for the test set
y_pred = model.predict(X_test_tfidf)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(report)

Classification Report:
                 precision    recall  f1-score   support

         action       0.62      0.25      0.36       220
      adventure       0.84      0.15      0.26       105
         comedy       0.59      0.68      0.63       876
          crime       0.75      0.08      0.14       114
          drama       0.51      0.84      0.64      1193
         horror       0.72      0.58      0.65       233
        musical       1.00      0.02      0.04        93
        romance       0.58      0.12      0.20       185
science fiction       0.73      0.40      0.52       128
       thriller       0.52      0.07      0.12       193
        western       0.88      0.57      0.69       173

       accuracy                           0.57      3513
      macro avg       0.70      0.34      0.39      3513
   weighted avg       0.61      0.57      0.52      3513



In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Train a Multinomial Naive Bayes classifier on the TF-IDF features
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Use the classifier to predict the genre labels for the test set
y_pred_nb = nb_classifier.predict(X_test_tfidf)

# Compute the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred_nb)
print("Accuracy:", accuracy)

Accuracy: 0.36151437517791063


In [15]:
from sklearn.metrics import classification_report

# Get the predicted labels for the test set
y_pred_nb = nb_classifier.predict(X_test_tfidf)

# Generate the classification report
report = classification_report(y_test, y_pred_nb)

# Print the classification report
print("Classification Report:")
print(report)

Classification Report:
                 precision    recall  f1-score   support

         action       0.00      0.00      0.00       220
      adventure       0.00      0.00      0.00       105
         comedy       0.81      0.09      0.17       876
          crime       0.00      0.00      0.00       114
          drama       0.35      1.00      0.52      1193
         horror       0.00      0.00      0.00       233
        musical       0.00      0.00      0.00        93
        romance       0.00      0.00      0.00       185
science fiction       0.00      0.00      0.00       128
       thriller       0.00      0.00      0.00       193
        western       0.00      0.00      0.00       173

       accuracy                           0.36      3513
      macro avg       0.11      0.10      0.06      3513
   weighted avg       0.32      0.36      0.22      3513



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [17]:
# Train an SVM classifier on the TF-IDF features
svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, y_train)

In [18]:
# Use the classifier to predict the genre labels for the test set
y_pred_svc = svm_classifier.predict(X_test_tfidf)

In [19]:
# Compute the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred_svc)
print("Accuracy:", accuracy)

Accuracy: 0.5482493595217762


In [20]:
from sklearn.metrics import classification_report

# Get the predicted labels for the test set
y_pred_svc = svm_classifier.predict(X_test_tfidf)

# Generate the classification report
report = classification_report(y_test, y_pred_svc)

# Print the classification report
print("Classification Report:")
print(report)

Classification Report:
                 precision    recall  f1-score   support

         action       0.75      0.14      0.23       220
      adventure       0.91      0.10      0.17       105
         comedy       0.62      0.63      0.62       876
          crime       0.50      0.01      0.02       114
          drama       0.47      0.87      0.61      1193
         horror       0.75      0.55      0.64       233
        musical       0.00      0.00      0.00        93
        romance       0.59      0.07      0.13       185
science fiction       0.82      0.40      0.54       128
       thriller       1.00      0.03      0.05       193
        western       0.89      0.56      0.69       173

       accuracy                           0.55      3513
      macro avg       0.66      0.30      0.34      3513
   weighted avg       0.61      0.55      0.49      3513



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
import pickle
pickle.dump(model, open('Logistic_NLP.sav', 'wb'))
pickle.dump(nb_classifier, open('Naive_NLP.sav', 'wb'))
pickle.dump(svm_classifier, open('SVC_NLP.sav', 'wb'))