<a href="https://colab.research.google.com/github/NasrunSR/CODSOFT-/blob/main/Movie_Genre_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Genre Classification

1. Importing required libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score

2. Data Loading and Preprocessing

In [5]:
# Every data file separates its fields with " ::: ". pandas treats a separator
# longer than one character as a regular expression, which only the (slower)
# 'python' parsing engine supports.
FIELD_SEPARATOR = ' ::: '


def _read_genre_file(path, columns):
    """Read one ' ::: '-delimited text file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the text file to read.
    columns : list of str
        Column names to assign to the parsed fields, in file order.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the given column names.
    """
    return pd.read_csv(path, sep=FIELD_SEPARATOR, engine='python', names=columns)


# Load train data
train_path = '/content/train_data.txt'
train_data = _read_genre_file(train_path, ['ID', 'Title', 'Genre', 'Description'])

# Load test data (this file has no Genre column)
test_path = '/content/test_data.txt'
test_data = _read_genre_file(test_path, ['ID', 'Title', 'Description'])

# Load test data solution (same layout as the train data, including Genre)
test_solution_path = '/content/test_data_solution.txt'
test_data_solution = _read_genre_file(test_solution_path, ['ID', 'Title', 'Genre', 'Description'])

# X_test and y_test come from two different files and are paired purely by row
# position, so fail fast if the files do not list the same IDs in the same order.
# Without this check a mismatch would silently corrupt the accuracy figure.
assert len(test_data) == len(test_data_solution), (
    "test_data and test_data_solution have different numbers of rows"
)
assert (test_data['ID'].to_numpy() == test_data_solution['ID'].to_numpy()).all(), (
    "test_data and test_data_solution IDs are not aligned row by row"
)

# Split the data for testing and training according to features
X_train = train_data['Description']
y_train = train_data['Genre']
X_test = test_data['Description']
y_test = test_data_solution['Genre']

In [6]:
# TF-IDF vectorization
# max_features=None places no cap on the vocabulary size: every term found in
# the training descriptions is kept as a feature.
tfidf_vectorizer = TfidfVectorizer(max_features=None)
# Fit on the training descriptions only, then reuse the fitted vectorizer for
# the test descriptions so both matrices share the same feature columns.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Oversampling to handle class imbalance
# Only the training matrix is resampled; the test matrix is left untouched.
# random_state is fixed so the resampled training set is reproducible.
oversampler = RandomOverSampler(random_state=42)
X_train_tfidf_resampled, y_train_resampled = oversampler.fit_resample(X_train_tfidf, y_train)

3. Training Machine Learning Algorithm
- Naive Bayes algorithm

In [7]:
# Fit a Multinomial Naive Bayes classifier on the oversampled TF-IDF features.
# alpha is the additive smoothing parameter passed to MultinomialNB.
nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(
    X=X_train_tfidf_resampled,
    y=y_train_resampled,
)

4. Predict
- By passing the test data through the trained model

In [8]:
# Predict a genre label for every TF-IDF-encoded test description
predictions = nb_model.predict(X=X_test_tfidf)


5. Output Predicted Data

In [9]:
# Pair each test-set ID with the genre predicted for it
predicted_columns = {
    'ID': test_data['ID'],
    'Predicted_Genre': predictions,
}
output_df = pd.DataFrame(data=predicted_columns)

# Save the table as CSV, leaving out the DataFrame index column
output_df.to_csv('predicted_genres.csv', index=False)

# Show the ID / predicted-genre table
print(output_df)


          ID Predicted_Genre
0          1          comedy
1          2           drama
2          3     documentary
3          4           drama
4          5           drama
...      ...             ...
54195  54196          horror
54196  54197         western
54197  54198           drama
54198  54199          horror
54199  54200     documentary

[54200 rows x 2 columns]


6. Evaluate model

In [10]:
# Fraction of test rows whose predicted genre equals the genre in the solution file
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.5567158671586716
