In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
train_data_path = 'train_data.csv'
test_data_path = 'test_data.csv'
test_solution_path = 'test_data_solution.csv'

# Read train data
train_data = pd.read_csv(train_data_path, delimiter=' ::: ', engine='python', names=['ID', 'Title', 'Genre', 'Description'])
# Read test data
test_data = pd.read_csv(test_data_path, delimiter=' ::: ', engine='python', names=['ID', 'Title', 'Description'])
# Read test solution
test_solution = pd.read_csv(test_solution_path, delimiter=' ::: ', engine='python', names=['ID', 'Genre'])

# Preprocess the data
X = train_data['Description']
y = train_data['Genre']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Extract features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Choose a classifier
classifier_type = 'naive_bayes'  # Change to 'naive_bayes' or 'svm' as needed

if classifier_type == 'logistic_regression':
    classifier = LogisticRegression(max_iter=500, penalty='l2')  # Increased max_iter and added L2 regularization
elif classifier_type == 'naive_bayes':
    classifier = MultinomialNB()
elif classifier_type == 'svm':
    classifier = SVC()

# Train the classifier
classifier.fit(X_train_tfidf, y_train)

# Predict on validation set
y_pred = classifier.predict(X_val_tfidf)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
print('Classification Report:')
print(classification_report(y_val, y_pred))

# Predict on test data
X_test = test_data['Description']
X_test_tfidf = tfidf_vectorizer.transform(X_test)
test_data['Predicted_Genre'] = classifier.predict(X_test_tfidf)

# Compare with test solution
merged_test_data = test_data.merge(test_solution, on='ID')
print(f'Accuracy on test data: {accuracy_score(merged_test_data["Genre"], merged_test_data["Predicted_Genre"])}')
print('Test Data Classification Report:')
# Extract unique genres as target names if they exist
target_names = merged_test_data["Genre"].unique()
if len(target_names) > 0:
    print(classification_report(merged_test_data["Genre"], merged_test_data["Predicted_Genre"], target_names=target_names))
else:
    print("No target names found.")


# Display the test data with predictions
print(test_data.head())


Accuracy: 0.5231946878170248
Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.58      0.08      0.14       263
       adult       0.88      0.06      0.12       112
   adventure       0.29      0.03      0.05       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.44      0.47      1443
       crime       0.00      0.00      0.00       107
 documentary       0.58      0.88      0.70      2659
       drama       0.46      0.83      0.59      2697
      family       1.00      0.01      0.01       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.15      0.26        40
     history       0.00      0.00      0.00        45
      horror       0.73      0.36      0.48       431
       music       0.77      0.12      0.20       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0.00        56
        news       0.00    

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
