In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re

In [2]:
# Function to preprocess text
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip()  # Remove leading/trailing spaces
    return text

In [3]:
# Load Dataset
train_raw = 'train_data.csv'
test_raw = 'test_data.csv'
test_solution_raw = 'test_data_solution.csv'

In [4]:
# Parse the raw data into custom formatted data
train_data = pd.read_csv(train_raw, delimiter = ':::', engine = 'python', names=['ID', 'Title', 'genre', 'plot'])
test_data = pd.read_csv(test_raw, delimiter = ':::', engine = 'python', names=['ID', 'Title', 'plot'])
test_solution =  pd.read_csv(test_solution_raw, delimiter = ':::', engine = 'python', names=['ID', 'genre'])

In [5]:
# Print column names to verify
print("Train Data Columns:", train_data.columns)
print("Test Data Columns:", test_data.columns)
print("Test Solution Data Columns:", test_solution.columns)

Train Data Columns: Index(['ID', 'Title', 'genre', 'plot'], dtype='object')
Test Data Columns: Index(['ID', 'Title', 'plot'], dtype='object')
Test Solution Data Columns: Index(['ID', 'genre'], dtype='object')


In [6]:
# Preprocess the text data
train_data['plot'] = train_data['plot'].apply(preprocess_text)
test_data['plot'] = test_data['plot'].apply(preprocess_text)

In [7]:
# Extracting the true labels for test data
y_test = test_solution['genre'].astype('category')

In [8]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_data['plot'])
X_test_tfidf = vectorizer.transform(test_data['plot'])

In [9]:
# Extracting the labels for training data
y_train = test_solution['genre'].astype('category')

In [10]:
# Train and evaluate Logistic Regression
lr_clf = LogisticRegression(max_iter=200)
lr_clf.fit(X_train_tfidf, y_train)
y_pred_lr = lr_clf.predict(X_test_tfidf)

In [11]:
print("Dataset - Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Dataset - Logistic Regression:
Accuracy: 0.0006666666666666666


In [12]:
# Train and evaluate Naive Bayes
nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train)
y_pred_nb = nb_clf.predict(X_test_tfidf)

In [13]:
print("Dataset - Naive Bayes:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))

Dataset - Naive Bayes:
Accuracy: 0.0006666666666666666
