In [32]:
import re
import csv

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [40]:
# File locations used by the text-to-CSV conversion below.
output_file = 'output.csv'   # CSV written from the parsed records
input_file = 'dataset.txt'   # raw text source, read line by line

# Function to parse each line of text and extract details
def parse_line(line):
    """Split one ' ::: '-delimited record into its four fields.

    The line must start with a run of digits, followed by three more
    fields separated by ' ::: '. The last field absorbs the rest of the
    line, including any further ' ::: ' sequences.

    Args:
        line: A single line of text (callers strip it beforehand).

    Returns:
        A 4-tuple of strings taken from the four capture groups, or
        None when the line does not match the layout.
    """
    found = re.match(r'(\d+) ::: (.+?) ::: (.+?) ::: (.+)', line)
    return found.groups() if found else None

# Read the text file and parse each line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the input file is UTF-8 encoded — confirm against the data source.
data = []
skipped = 0  # non-blank lines that did not match the record layout
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        parsed = parse_line(stripped)
        if parsed:
            data.append(parsed)
        elif stripped:
            skipped += 1

# Surface dropped records instead of discarding them silently.
if skipped:
    print(f"Skipped {skipped} line(s) that did not match the expected record layout")

# Write parsed data to CSV.
# UTF-8 is explicit because the file is read back with pandas.read_csv,
# which decodes as UTF-8 by default.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Write header
    csvwriter.writerow(['ID', 'Title', 'Genre', 'Description'])
    # Write data rows
    csvwriter.writerows(data)

In [44]:
# Load the CSV produced by the conversion step
data = pd.read_csv("output.csv")

# Preprocess the data: drop rows with a missing genre or description,
# then encode the genre names as integer labels
data = data.dropna(subset=['Genre', 'Description'])
label_encoder = LabelEncoder()
data['Genre_encoded'] = label_encoder.fit_transform(data['Genre'])

# Prepare the text descriptions and labels
X = data['Description']
y = data['Genre_encoded']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization: the vocabulary is fitted on the training split only
# and then applied unchanged to the test split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model Training
logistic_classifier = LogisticRegression(random_state=42, max_iter=1000)
logistic_classifier.fit(X_train_tfidf, y_train)

# Model Evaluation on the held-out split, so the model's quality is known
# before it is used for predictions
y_pred = logistic_classifier.predict(X_test_tfidf)
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(
    y_test,
    y_pred,
    # Pass every encoded label explicitly so the report rows stay aligned with
    # the genre names even if a genre is absent from the test split
    labels=list(range(len(label_encoder.classes_))),
    target_names=[str(genre) for genre in label_encoder.classes_],
    zero_division=0,
))

In [46]:
# Function to predict genre and its probability for a new description
def predict_genre(description):
    """Predict the genre of a description and the model's probability for it.

    Uses the module-level ``tfidf_vectorizer``, ``logistic_classifier`` and
    ``label_encoder`` fitted in the training cell.

    Args:
        description: Text to classify.

    Returns:
        A ``(predicted_genre, predicted_probability)`` tuple: the genre name
        from ``label_encoder.inverse_transform`` and the probability the
        classifier assigns to that genre.
    """
    # Transform the new description with the already-fitted TF-IDF vectorizer
    description_tfidf = tfidf_vectorizer.transform([description])

    # Class probabilities for the single input row
    probabilities = logistic_classifier.predict_proba(description_tfidf)[0]

    # Columns of predict_proba follow logistic_classifier.classes_, which holds
    # only the labels seen during fit. Indexing by the encoded label itself is
    # wrong whenever a genre is missing from the training split, so pick the
    # winning column and map it through classes_.
    best_column = probabilities.argmax()
    predicted_label = logistic_classifier.classes_[best_column]

    # Convert the encoded label back to its original genre name
    predicted_genre = label_encoder.inverse_transform([predicted_label])[0]

    # Probability of the predicted genre
    predicted_probability = probabilities[best_column]

    return predicted_genre, predicted_probability

# Ask for a description and report the predicted genre.
# Surrounding whitespace is stripped; an empty entry is reported instead of
# being classified, because it carries no words for the model to use.
new_description = input("Enter the description: ").strip()
if new_description:
    predicted_genre, predicted_probability = predict_genre(new_description)
    print(f"\nPredicted Genre: {predicted_genre}")
    print(f"Probability: {predicted_probability:.2f}")
else:
    print("\nNo description entered; nothing to classify.")

Enter the description: Piero arrives in Paris from Luino after having won a pool tournament with friends. In the train he meets the famous place Ramazzini and has so many incidents that culminated with his arrest and the confiscation by the commissioner Juvet. Released, find randomly hospitality at the madame Lenormand and know the great painter Valentine. Both women are tied up, as a wife and lover, Maurice, in prison for robbery. A day Piero wears a mistake and the coat of astrakhan, Maurice, and then...

Predicted Genre: drama
Probability: 0.58
