<a href="https://colab.research.google.com/github/Mehvish-33/Encryptix/blob/main/movie_genre_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the training data
train_data = pd.read_csv("/train_data.txt", delimiter="\t", header=None, names=['DATA'])
print(train_data.head())

# Extract relevant parts from the 'DATA' column
train_data[['ID', 'TITLE', 'GENRE', 'DESCRIPTION']] = train_data['DATA'].str.split(' ::: ', expand=True)
train_data.drop(columns=['DATA'], inplace=True)

# Print the cleaned train_data dataframe
print(train_data.head())

# Encode the genre labels
encoder = LabelEncoder()
train_data["GENRE"] = encoder.fit_transform(train_data["GENRE"])

# Split the data into features (X) and labels (Y)
X = train_data["DESCRIPTION"]
Y = train_data["GENRE"]

# Define a pipeline with TF-IDF vectorizer and MultinomialNB classifier
clf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('nb', MultinomialNB())
])

# Train the model
clf.fit(X, Y)

# Load the test data
test_data = pd.read_csv("/test_data_solution.txt", delimiter="\t", header=None, names=['DATA'])
print(test_data.head())

# Extract relevant parts from the 'DATA' column in test data
test_data[['ID', 'TITLE', 'GENRE', 'DESCRIPTION']] = test_data['DATA'].str.split(' ::: ', expand=True)
test_data.drop(columns=['DATA'], inplace=True)

# Encode the genre labels in test data
test_data["GENRE"] = encoder.transform(test_data["GENRE"])

# Split the test data into features (X_test) and labels (Y_test)
X_test = test_data["DESCRIPTION"]
Y_test = test_data["GENRE"]

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(Y_test, y_pred, target_names=encoder.classes_))


                                                DATA
0  1 ::: Oscar et la dame rose (2009) ::: drama :...
1  2 ::: Cupid (1997) ::: thriller ::: A brother ...
2  3 ::: Young, Wild and Wonderful (1980) ::: adu...
3  4 ::: The Secret Sin (1915) ::: drama ::: To h...
4  5 ::: The Unrecovered (2007) ::: drama ::: The...
  ID                             TITLE     GENRE  \
0  1      Oscar et la dame rose (2009)     drama   
1  2                      Cupid (1997)  thriller   
2  3  Young, Wild and Wonderful (1980)     adult   
3  4             The Secret Sin (1915)     drama   
4  5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
                                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
