In [1]:
import pandas as pd

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [5]:
# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text.lower())
    # Removing punctuation and stop words
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lekhanaal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lekhanaal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
   
# Load training data
train_df = pd.read_csv('train.csv')

# Load test data
test_df = pd.read_csv('test.csv')

# Load solution data (for evaluation purposes)
solution_df = pd.read_csv('solution.csv')

# Preprocess the descriptions in the training data
train_df['DESCRIPTION'] = train_df['DESCRIPTION'].apply(preprocess_text)

# Preprocess the descriptions in the test data
test_df['DESCRIPTION'] = test_df['DESCRIPTION'].apply(preprocess_text)


In [9]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_df['DESCRIPTION'])
X_test = tfidf_vectorizer.transform(test_df['DESCRIPTION'])

In [11]:
# Extract labels
y_train = train_df['GENRE']

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib

In [15]:
# Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
joblib.dump(nb_model, 'naive_bayes_model.pkl')

['naive_bayes_model.pkl']

In [17]:
# Train Logistic Regression classifier
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
joblib.dump(lr_model, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']

In [19]:
# Train SVM classifier
svm_model = SVC()
svm_model.fit(X_train, y_train)
joblib.dump(svm_model, 'svm_model.pkl')

['svm_model.pkl']

In [21]:
# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [25]:
# Function to evaluate the model
def evaluate_model(model, X_test, y_true):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1

In [27]:
# Load the actual genres for the test data
y_test = solution_df['GENRE']

In [35]:
# Function to predict genre
def predict_genre(model, plot_summary):
    plot_summary_preprocessed = preprocess_text(plot_summary)
    plot_summary_tfidf = tfidf_vectorizer.transform([plot_summary_preprocessed])
    predicted_genre = model.predict(plot_summary_tfidf)
    return predicted_genre[0]

In [37]:
# Example usage
new_plot_summary = "A young boy discovers he has magical powers and attends a school for wizards."
predicted_genre_nb = predict_genre(nb_model, new_plot_summary)
predicted_genre_lr = predict_genre(lr_model, new_plot_summary)
predicted_genre_svm = predict_genre(svm_model, new_plot_summary)

print(f"Predicted genre (Naive Bayes): {predicted_genre_nb}")
print(f"Predicted genre (Logistic Regression): {predicted_genre_lr}")
print(f"Predicted genre (SVM): {predicted_genre_svm}")

Predicted genre (Naive Bayes): drama
Predicted genre (Logistic Regression): drama
Predicted genre (SVM): drama


In [39]:
# Evaluate Naive Bayes
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(nb_model, X_test, y_test)

# Evaluate Logistic Regression
lr_accuracy, lr_precision, lr_recall, lr_f1 = evaluate_model(lr_model, X_test, y_test)

# Evaluate SVM
svm_accuracy, svm_precision, svm_recall, svm_f1 = evaluate_model(svm_model, X_test, y_test)

print(f"Naive Bayes - Accuracy: {nb_accuracy}, Precision: {nb_precision}, Recall: {nb_recall}, F1-score: {nb_f1}")
print(f"Logistic Regression - Accuracy: {lr_accuracy}, Precision: {lr_precision}, Recall: {lr_recall}, F1-score: {lr_f1}")
print(f"SVM - Accuracy: {svm_accuracy}, Precision: {svm_precision}, Recall: {svm_recall}, F1-score: {svm_f1}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Naive Bayes - Accuracy: 0.5222509225092251, Precision: 0.5155476428118921, Recall: 0.5222509225092251, F1-score: 0.44524651031760154
Logistic Regression - Accuracy: 0.5843726937269372, Precision: 0.5638711604519377, Recall: 0.5843726937269372, F1-score: 0.5453500540039219
SVM - Accuracy: 0.5810147601476015, Precision: 0.565162579416963, Recall: 0.5810147601476015, F1-score: 0.5328358105306702


In [43]:
print(test_df.head())

   ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                         DESCRIPTION  
0  brane loves life car apartment job especially ...  
1  spain march 1964 quico naughty child three bel...  
2  one year life albin family shepherds north tra...  
3  father died spoken brother 10 years serious ca...  
4  known internationally martial arts superstar b...  


In [45]:
print(train_df.head())

   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  listening conversation doctor parents oscar le...  
1  brother sister past incestuous relationship cu...  
2  bus empties students field trip museum natural...  
3  help unemployed father make ends meet edith tw...  
4  film title refers bodies ground zero also stat...  
