In [1]:
import pandas as pd
import re
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
def get_sentence_embedding(sentence, word2vec_model):
    words = re.findall(r'\w+', sentence.lower())
    vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * 100

In [3]:
def train_model():
    train_data = pd.read_csv("train_data.txt", sep=':::', names=['Title', 'Genre', 'Description'], engine='python')
    test_data = pd.read_csv("test_data.txt", sep=':::', names=['Title', 'Description'], engine='python')
    combined_data = pd.concat([train_data, test_data], ignore_index=True)
    data = combined_data.copy()
    data.dropna(inplace=True)
    corpus = data["Description"].apply(lambda x: re.findall(r'\w+', x.lower()))
    word2vec_model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

    data["embeddings"] = data["Description"].apply(lambda x: get_sentence_embedding(x, word2vec_model))
    
    label_encoder = LabelEncoder()
    data["genre_encoded"] = label_encoder.fit_transform(data["Genre"])
    
    y = data["genre_encoded"]
    X = pd.DataFrame(data["embeddings"].to_list())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    return model, label_encoder, word2vec_model

In [4]:
def predict_genre(model, label_encoder, word2vec_model, new_descriptions):
    new_embeddings = [get_sentence_embedding(desc, word2vec_model) for desc in new_descriptions]
    new_X = pd.DataFrame(new_embeddings)
    new_predictions = model.predict(new_X)
    predicted_genres = label_encoder.inverse_transform(new_predictions)

    return predicted_genres

In [5]:
trained_model, genre_label_encoder, trained_word2vec_model = train_model()

In [15]:
new_descriptions = ["When a privileged student filmmaker teams up with an acclaimed documentary filmmaker they both learn a valuable life lesson about the human condition.",
                    "It is a modern-day adaptation of William Shakespeare's Othello. The film's title is derived from a Malay idiom meaning web of deceit or conspiracy, which is a major theme in the plot of the film. The plot remains faithful to the source material, with the cast all taking their Shakespearian counterparts either in name, character or both.",
                    "A tight team of three young law student Abby, dropout daredevil Jay and wheelchair bound tech whiz Milton - hire themselves out to people who want to get into places they're not supposed to be in."]

predicted_genres = predict_genre(trained_model, genre_label_encoder, trained_word2vec_model, new_descriptions)

for desc, genre in zip(new_descriptions, predicted_genres):
    print(f"Description: {desc}")
    print(f"Predicted Genre: {genre}\n")

Description: When a privileged student filmmaker teams up with an acclaimed documentary filmmaker they both learn a valuable life lesson about the human condition.
Predicted Genre:  documentary 

Description: It is a modern-day adaptation of William Shakespeare's Othello. The film's title is derived from a Malay idiom meaning web of deceit or conspiracy, which is a major theme in the plot of the film. The plot remains faithful to the source material, with the cast all taking their Shakespearian counterparts either in name, character or both.
Predicted Genre:  short 

Description: A tight team of three young law student Abby, dropout daredevil Jay and wheelchair bound tech whiz Milton - hire themselves out to people who want to get into places they're not supposed to be in.
Predicted Genre:  comedy 

