**Movie genre**


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
train_path= "train_data.txt"
train_data = pd.read_csv(train_path, sep=":::", names=["TITLE", "GENRE", "DESCRIPTION"], engine="python")
train_data

Unnamed: 0,TITLE,GENRE,DESCRIPTION
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),drama,Before he was known internationally as a mart...
...,...,...,...
17713,Volunteers? (2009),documentary,The film portrays all sides involved in the i...
17714,Doolittle Raiders (2006),documentary,"""Doolittle Raiders"" is an animated documentar..."
17715,Sabbath (2008),horror,When the last soul enters heaven and the firs...
17716,Kadaikkan (2010),drama,KADAIKKAN is a Tamil language drama about a y...


In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17717 entries, 1 to 17717
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   TITLE        17717 non-null  object
 1   GENRE        17717 non-null  object
 2   DESCRIPTION  17717 non-null  object
dtypes: object(3)
memory usage: 553.7+ KB


In [None]:
train_data.describe()

Unnamed: 0,TITLE,GENRE,DESCRIPTION
count,17717,17717,17717
unique,17717,27,17695
top,Edgar's Lunch (1998),drama,A retired professor has returned to his estat...
freq,1,4497,6


In [None]:
train_data.isnull().sum()

TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64

In [None]:
#Access Movie Summaries:
# Extract the "DESCRIPTION" column containing movie summaries
movie_summaries = train_data["DESCRIPTION"]


In [None]:
#Convert to Strings (if necessary):
# If summaries are not already strings, convert them
if not pd.api.types.is_string_dtype(movie_summaries):
    movie_summaries = movie_summaries.astype(str)


In [None]:
# Lowercase:
# Convert all text to lowercase
movie_summaries = movie_summaries.str.lower()


In [None]:
#Remove Punctuation:
import re

# Define a pattern to match non-word characters
punctuation_pattern = r"[^\w\s]"

# Remove punctuation from each summary
movie_summaries = movie_summaries.apply(lambda text: re.sub(punctuation_pattern, "", text))


In [None]:
#Remove Stop Words (optional):
import nltk

# Download stop words if not already available
nltk.download('stopwords')

# Get a list of English stop words
stop_words = nltk.corpus.stopwords.words('english')

# Remove stop words from each summary
movie_summaries = movie_summaries.apply(lambda text: " ".join([word for word in text.split() if word not in stop_words]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Verify Preprocessing:
# Print a few examples of the preprocessed summaries
print(movie_summaries.head())


1    lr brane loves life car apartment job especial...
2    spain march 1964 quico naughty child three bel...
3    one year life albin family shepherds north tra...
4    father died hasnt spoken brother 10 years seri...
5    known internationally martial arts superstar b...
Name: DESCRIPTION, dtype: object


In [None]:
# Feature Extraction:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=5, stop_words="english")
vectorizer.fit(movie_summaries)
features = vectorizer.transform(movie_summaries)


In [None]:
# Assuming `movie_summaries` contains preprocessed text

# Feature extraction with initial parameters
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=5, stop_words="english")
features = vectorizer.fit_transform(movie_summaries)

# Experiment with different parameters
vectorizer_2 = TfidfVectorizer(max_features=10000, ngram_range=(1, 3), min_df=10, stop_words=None)
features_2 = vectorizer_2.fit_transform(movie_summaries)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Assuming you have loaded your data into a pandas DataFrame named `train_data`
genres = train_data["GENRE"]  # Extract genre labels

# Now proceed with splitting the data
X_train, X_test, y_train, y_test = train_test_split(features, genres, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)  # Increase iterations, adjust as needed
model.fit(X_train, y_train)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model.fit(X_train_scaled, y_train)

model = LogisticRegression(solver='liblinear')  # Or try 'sag'
model.fit(X_train, y_train)


In [None]:

y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-score: {f1:.4f}")



Accuracy: 0.5240
F1-score: 0.4533


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

with open("test_data.txt", "r") as file:
    new_descriptions = file.readlines()

def tokenize_text(text):
    return text.split()  # Simple tokenization, adjust as needed

tokenized_descriptions = [tokenize_text(desc) for desc in new_descriptions]

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

filtered_descriptions = []
for desc in tokenized_descriptions:
    filtered_desc = [word for word in desc if word not in stop_words]
    filtered_descriptions.append(filtered_desc)

from nltk.stem import PorterStemmer  # Or choose a different stemmer/lemmatizer

stemmer = PorterStemmer()

stemmed_descriptions = []
for desc in filtered_descriptions:
    stemmed_desc = [stemmer.stem(word) for word in desc]
    stemmed_descriptions.append(stemmed_desc)



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the training descriptions
with open("train_data.txt", "r") as file:
    training_descriptions = file.readlines()

# (Same preprocessing steps as before: tokenization, stop word removal, stemming/lemmatization)

# Fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words="english")  # Adjust parameters as needed
vectorizer.fit(training_descriptions)


In [None]:
# Load the new descriptions
with open("test_data.txt", "r") as file:
    new_descriptions = file.readlines()

# Apply the same preprocessing steps used for training data on the new descriptions

# Transform the new descriptions into TF-IDF features
new_features = vectorizer.transform(new_descriptions)
new_features.shape

(52344, 181332)

In [None]:

# Scale the features
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)

# Re-fit vectorizer on training data (ensure consistency)
vectorizer.fit(movie_summaries)  # Ensure this is the same data used for model training and scaling

# Transform both training and new data using re-fitted vectorizer
features = vectorizer.transform(movie_summaries)
new_features = vectorizer.transform(new_descriptions)

# Verify feature dimensions
print(features.shape)  # Should be (n_samples_training, 5000)
print(new_features.shape)  # Should also be (n_samples_new, 5000)

# If dimensions match, proceed with scaling and prediction
if features.shape[1] == new_features.shape[1] == 5000:
    new_features_scaled = scaler.transform(new_features)  # Use existing scaler
    predicted_genres = model.predict(new_features_scaled)

    # ... (proceed with prediction analysis and results)

else:
    print("Dimensionality mismatch still exists. Investigate further.")



(17717, 78156)
(52344, 78156)
Dimensionality mismatch still exists. Investigate further.


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset
# Make sure 'test_data.txt' is in the same directory as your script or notebook
# Adjust the delimiter and column names as needed
test_data = pd.read_csv('test_data.txt', delimiter=' ::: ', engine='python', names=['ID', 'TITLE', 'DESCRIPTION'])

# Assuming you have a separate dataset for training (not provided)
# Load your training dataset in a similar manner and perform data cleaning if needed

# For simplicity, let's generate random genre labels for the training data
np.random.seed(42)
train_data = pd.DataFrame({
    'ID': np.arange(1, len(test_data) + 1),
    'TITLE': np.random.choice(['Movie A', 'Movie B', 'Movie C'], len(test_data)),
    'DESCRIPTION': np.random.choice(['Description 1', 'Description 2', 'Description 3'], len(test_data)),
    'GENRE': np.random.choice(['Action', 'Drama', 'Comedy'], len(test_data))
})

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data[['TITLE', 'DESCRIPTION']], train_data['GENRE'], test_size=0.2, random_state=42)

# Extract features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['TITLE'] + ' ' + X_train['DESCRIPTION'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['TITLE'] + ' ' + X_test['DESCRIPTION'])

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_tfidf)

# Ensure that the predicted data has the same set of classes as the testing data
unique_classes = np.union1d(y_test, y_pred)
y_test = y_test[np.isin(y_test, unique_classes)]
y_pred = y_pred[np.isin(y_pred, unique_classes)]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)  # Add zero_division parameter

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)



Accuracy: 0.33

Classification Report:
               precision    recall  f1-score   support

      Action       0.33      1.00      0.49      3542
      Comedy       1.00      0.00      0.00      3665
       Drama       1.00      0.00      0.00      3633

    accuracy                           0.33     10840
   macro avg       0.78      0.33      0.16     10840
weighted avg       0.78      0.33      0.16     10840

