In [None]:
import pandas as pd

# Read the three CSV inputs in order: training data, test data, and the
# solution file (used later for evaluation purposes).
train_df, test_df, solution_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'solution.csv')
)


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Fetch the NLTK resources this notebook relies on (no-op when they are
# already present locally):
#   - 'stopwords' backs the stop-word set built below;
#   - 'punkt' is the tokenizer data used by older NLTK releases;
#   - 'punkt_tab' is what newer NLTK releases look up when word_tokenize runs.
# An NLTK version that does not know one of these names reports a failed
# download (and may print a "not found" message) instead of raising, so
# requesting both tokenizer resources is safe across versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# A set gives constant-time membership checks during token filtering.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a description into a space-separated string of tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; a token is
    kept only if it is alphanumeric (``str.isalnum``) and is not in the
    module-level ``stop_words`` set.

    Args:
        text: The raw description. Any non-string value (for example ``None``
            or the float ``NaN`` pandas produces for an empty CSV cell) is
            treated as an empty description.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when the
        input is not a string or no token survives the filter.
    """
    # Non-string inputs have no .lower(); map them to an empty document
    # instead of letting one missing value abort the whole .apply().
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.lower())
    # isalnum() rejects pure-punctuation tokens as well as any token that
    # contains a non-alphanumeric character.
    return ' '.join(
        token for token in tokens
        if token.isalnum() and token not in stop_words
    )

# Clean the DESCRIPTION column of the training frame and then the test frame,
# overwriting the raw text in place with its preprocessed form.
for frame in (train_df, test_df):
    frame['DESCRIPTION'] = frame['DESCRIPTION'].apply(preprocess_text)


In [None]:
# Labels: the genre column of the training frame
y_train = train_df['GENRE']

# Text columns that feed the vectorizer
train_descriptions = train_df['DESCRIPTION']
test_descriptions = test_df['DESCRIPTION']

# TF-IDF features: the vectorizer (capped at 5000 features) is fitted on the
# training descriptions only, then reused to encode the test descriptions
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_descriptions)
X_test = tfidf_vectorizer.transform(test_descriptions)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Fit the three classifiers on the same TF-IDF features and labels.
# Each estimator's fit() returns the estimator itself, so construction and
# training are chained into a single expression.

# Naive Bayes
nb_model = MultinomialNB().fit(X_train, y_train)

# Logistic Regression (iteration cap raised to 1000)
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Support Vector Machine with default settings
svm_model = SVC().fit(X_train, y_train)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate the model
def evaluate_model(model, X_test, y_true):
    """Score a fitted classifier against known labels.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Feature matrix passed unchanged to ``model.predict``.
        y_true: Ground-truth labels the predictions are compared against.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``. Precision, recall and
        F1 are computed with ``average='weighted'``.
    """
    y_pred = model.predict(X_test)
    # zero_division=0 yields the same 0 score scikit-learn reports by default
    # for a class with an undefined metric (e.g. a class that is never
    # predicted), but without raising an UndefinedMetricWarning for each one.
    # NOTE(review): the zero_division parameter requires scikit-learn >= 0.22.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **weighted),
        recall_score(y_true, y_pred, **weighted),
        f1_score(y_true, y_pred, **weighted),
    )

# Ground-truth genres for the test set come from the solution file
y_test = solution_df['GENRE']

# Score each classifier on the test features: Naive Bayes, then Logistic
# Regression, then SVM
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(
    model=nb_model, X_test=X_test, y_true=y_test
)
lr_accuracy, lr_precision, lr_recall, lr_f1 = evaluate_model(
    model=lr_model, X_test=X_test, y_true=y_test
)
svm_accuracy, svm_precision, svm_recall, svm_f1 = evaluate_model(
    model=svm_model, X_test=X_test, y_true=y_test
)

# Report one line of metrics per classifier
print(f"Naive Bayes - Accuracy: {nb_accuracy}, Precision: {nb_precision}, Recall: {nb_recall}, F1-score: {nb_f1}")
print(f"Logistic Regression - Accuracy: {lr_accuracy}, Precision: {lr_precision}, Recall: {lr_recall}, F1-score: {lr_f1}")
print(f"SVM - Accuracy: {svm_accuracy}, Precision: {svm_precision}, Recall: {svm_recall}, F1-score: {svm_f1}")
# Function to predict genre
def predict_genre(model, plot_summary):
    """Return the genre that ``model`` predicts for a single plot summary.

    The summary is cleaned with ``preprocess_text`` and encoded with the
    module-level ``tfidf_vectorizer`` before being passed to
    ``model.predict``.

    Args:
        model: Object exposing ``predict``.
        plot_summary: Raw plot text to classify.

    Returns:
        The first (and only) element of the model's prediction.
    """
    cleaned = preprocess_text(plot_summary)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = tfidf_vectorizer.transform([cleaned])
    return model.predict(features)[0]

# Demo: classify one unseen plot summary with each trained model
new_plot_summary = "A young boy discovers he has magical powers and attends a school for wizards."

predicted_genre_nb = predict_genre(model=nb_model, plot_summary=new_plot_summary)
predicted_genre_lr = predict_genre(model=lr_model, plot_summary=new_plot_summary)
predicted_genre_svm = predict_genre(model=svm_model, plot_summary=new_plot_summary)

# Show the three predictions side by side
print(f"Predicted genre (Naive Bayes): {predicted_genre_nb}")
print(f"Predicted genre (Logistic Regression): {predicted_genre_lr}")
print(f"Predicted genre (SVM): {predicted_genre_svm}")

