<a href="https://colab.research.google.com/github/Manya123-max/CodSoftML/blob/main/Movie_Gener.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Mount Google Drive at /content/drive so the dataset folder can be read.
# This cell only works inside Google Colab, which provides google.colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path to the dataset directory on the mounted Drive. Later cells build both
# their input paths and their output paths (CSV, report) from this value.
data_dir = '/content/drive/MyDrive/Genre Classification Dataset'

In [None]:
# List every entry in the dataset directory (files and sub-directories alike)
# and print the names so the expected inputs can be checked by eye.
files = os.listdir(data_dir)
print("Files in the directory:", files)

Files in the directory: ['test_data.txt', 'train_data.txt', 'description.txt', 'test_data_solution.txt', '.ipynb_checkpoints', 'classification_report.txt', 'test_predictions.csv']


In [None]:
# Read every regular file of the dataset directory into memory, keyed by its
# file name. Entries that are not regular files (for example a checkpoint
# folder) are skipped.
file_contents = {}
for file_name in files:
    file_path = os.path.join(data_dir, file_name)
    if not os.path.isfile(file_path):
        # Nothing to read for directories and other non-file entries
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents[file_name] = file.read()

In [None]:
# Pull the raw text of each dataset file out of the loaded contents.
# A missing file yields an empty string instead of raising a KeyError.
test_data_text = file_contents.get('test_data.txt', '')
train_data_text = file_contents.get('train_data.txt', '')
# The labelled test file is named 'test_data_solution.txt' (see the directory
# listing printed above). The previous key 'train_data_solution.txt' matched
# no file, so this variable was always the empty string.
test_data_solution_text = file_contents.get('test_data_solution.txt', '')

In [None]:
# Function to parse training dataset
def parse_train_data(data_text):
    """Parse the raw training file text into a DataFrame.

    Each usable line has the form ``ID ::: TITLE ::: GENRE ::: DESCRIPTION``.
    Lines that do not split into exactly four fields are skipped.

    Args:
        data_text: Full text of the training file, one record per line.

    Returns:
        pandas.DataFrame with the columns ``id``, ``title``, ``genre`` and
        ``plot``. The columns are present even when no line could be parsed,
        so downstream column access does not fail on empty input.
    """
    columns = ["id", "title", "genre", "plot"]
    records = []
    for line in data_text.strip().split("\n"):
        # Strip each field so stray whitespace (e.g. a trailing "\r" from
        # Windows line endings) cannot leak into the values or create
        # near-duplicate genre labels.
        parts = [part.strip() for part in line.split(" ::: ")]
        if len(parts) == 4:  # exactly ID, Title, Genre, Description
            records.append(dict(zip(columns, parts)))
    return pd.DataFrame(records, columns=columns)

In [None]:
# Function to parse test dataset
def parse_test_data(data_text):
    """Parse the raw (unlabelled) test file text into a DataFrame.

    Each usable line has the form ``ID ::: TITLE ::: DESCRIPTION``.
    Lines that do not split into exactly three fields are skipped.

    Args:
        data_text: Full text of the test file, one record per line.

    Returns:
        pandas.DataFrame with the columns ``id``, ``title`` and ``plot``.
        The columns are present even when no line could be parsed, so
        downstream column access does not fail on empty input.
    """
    columns = ["id", "title", "plot"]
    records = []
    for line in data_text.strip().split("\n"):
        # Strip each field so stray whitespace (e.g. a trailing "\r" from
        # Windows line endings) does not leak into the stored values.
        parts = [part.strip() for part in line.split(" ::: ")]
        if len(parts) == 3:  # exactly ID, Title, Description
            records.append(dict(zip(columns, parts)))
    return pd.DataFrame(records, columns=columns)

In [None]:
# Parse train and test data into DataFrames. train_df carries a genre column;
# test_df has only id, title and plot.
train_df = parse_train_data(train_data_text)
test_df = parse_test_data(test_data_text)

In [None]:
# Text preprocessing function
def preprocess_text(text):
    """Normalise a plot description before vectorisation.

    The text is lower-cased, every character that is neither ``a``-``z`` nor
    whitespace is deleted (digits, punctuation and accented letters
    included), and each run of whitespace is collapsed to a single space with
    no leading or trailing space.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

In [None]:
# Add a cleaned "processed_plot" column to both frames by running every plot
# description through preprocess_text.
for frame in (train_df, test_df):
    frame["processed_plot"] = frame["plot"].apply(preprocess_text)

In [None]:
# Drop rows whose plot is empty after preprocessing. The boolean-mask
# selection is followed by .copy() so each name refers to an independent
# DataFrame: later cells add columns to these frames, and assigning into a
# filtered selection is what triggers pandas' SettingWithCopyWarning.
train_df = train_df[train_df["processed_plot"].str.strip() != ""].copy()
test_df = test_df[test_df["processed_plot"].str.strip() != ""].copy()

In [None]:
# Report the (rows, columns) shape of both frames after filtering. This is a
# visual sanity check only: nothing here stops execution if a frame is empty.
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Train data shape: (173777, 5)
Test data shape: (54200, 4)


In [None]:
# Vectorize text data using TF-IDF.
# The vectorizer is fitted on the training plots only; the test plots are
# transformed with that same fitted vectorizer so both matrices share columns.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
X_train = tfidf_vectorizer.fit_transform(train_df["processed_plot"])
X_test = tfidf_vectorizer.transform(test_df["processed_plot"])

In [None]:
# Encode genres as integer labels: each distinct genre gets the next integer,
# numbered in the order the genres are returned by unique().
unique_genres = train_df["genre"].unique()
label_mapping = dict(zip(unique_genres, range(len(unique_genres))))
train_df["genre_label"] = train_df["genre"].map(label_mapping)
y_train = train_df["genre_label"]

In [None]:
# Train a Multinomial Naive Bayes classifier (default settings) on the full
# TF-IDF training matrix and the integer genre labels.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [None]:
# Predict an integer genre label for every row of the test matrix
y_pred = nb_model.predict(X_test)

In [None]:
# Translate the predicted integer labels back into genre names and store them
# as a new column on the test DataFrame.
inverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))
predicted_names = [inverse_label_mapping[code] for code in y_pred]
test_df["predicted_genre"] = predicted_names

In [None]:
# Display predictions: only the id, title and predicted genre columns are
# printed (pandas abbreviates a long frame to its first and last rows).
print("Test Data with Predicted Genres:")
print(test_df[["id", "title", "predicted_genre"]])

Test Data with Predicted Genres:
          id                           title predicted_genre
0          1            Edgar's Lunch (1998)           drama
1          2        La guerra de papá (1977)           drama
2          3     Off the Beaten Track (2010)     documentary
3          4          Meu Amigo Hindu (2015)           drama
4          5               Er nu zhai (1955)           drama
...      ...                             ...             ...
54195  54196  "Tales of Light & Dark" (2013)           drama
54196  54197     Der letzte Mohikaner (1965)           drama
54197  54198             Oliver Twink (2007)          comedy
54198  54199               Slipstream (1973)          comedy
54199  54200       Curitiba Zero Grau (2010)     documentary

[54200 rows x 3 columns]


In [None]:
# Optionally, save the test data with predictions to a file.
# NOTE: the CSV is written into data_dir, the same folder the loading cell
# reads every file from, so it is read back in as an input on the next run.
test_df.to_csv(os.path.join(data_dir, "test_predictions.csv"), index=False)

In [None]:
# Hold-out evaluation: split the labelled data 80/20, fit a fresh classifier
# on the 80% and score it on the remaining 20%.
# train_test_split, accuracy_score and classification_report are provided by
# the import cell at the top of the notebook, so nothing is imported here.
X = X_train  # TF-IDF transformed text
y = train_df["genre_label"]  # Genre labels

# NOTE(review): X_train comes from a vectorizer fitted on all training rows,
# so the validation rows already influenced the vocabulary and IDF weights;
# the score below may therefore be slightly optimistic.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42  # fixed seed keeps the split reproducible
)

# Initialize and train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_split, y_train_split)

# Make predictions on the validation set
y_val_pred = classifier.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.5606226263091265


In [23]:
# Build the per-genre classification report once, then print it and save it.
# `labels` is passed explicitly so the report rows stay aligned with
# `target_names` even when a rare genre is absent from both y_val and
# y_val_pred; without it scikit-learn infers the label set from the data and
# the name list can end up with a different length.
report_labels = list(label_mapping.values())
report_names = list(label_mapping.keys())
report_text = classification_report(
    y_val,
    y_val_pred,
    labels=report_labels,
    target_names=report_names,
    zero_division=0,
)

# Print classification report
print("Classification Report:")
print(report_text)

# Save classification report to file. The encoding is explicit because the
# loading cell reads every file in this folder back as UTF-8.
with open(os.path.join(data_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(f"Validation Accuracy: {accuracy:.4f}\n")
    f.write(report_text)

Classification Report:
              precision    recall  f1-score   support

       drama       0.49      0.84      0.61      8648
    thriller       0.68      0.07      0.13      1029
       adult       0.64      0.18      0.28       388
 documentary       0.59      0.89      0.71      8559
      comedy       0.58      0.46      0.51      4791
       crime       0.80      0.03      0.05       299
  reality-tv       0.68      0.09      0.16       569
      horror       0.77      0.48      0.59      1427
       sport       0.85      0.23      0.36       317
   animation       1.00      0.03      0.05       310
      action       0.65      0.20      0.30       818
     fantasy       0.00      0.00      0.00       193
       short       0.63      0.14      0.23      3184
      sci-fi       0.81      0.17      0.28       423
       music       0.76      0.40      0.52       484
   adventure       0.79      0.11      0.19       499
   talk-show       0.83      0.08      0.15       238
    