In [1]:
import gc

# Free up memory: force a full garbage-collection pass before the heavy cells run.
# gc.collect() returns the number of unreachable objects it found, which is the
# integer shown as this cell's output.
gc.collect()


194

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import joblib

# Download the NLTK resources used by preprocess_text(): the English stop-word
# list and the 'punkt' tokenizer data. Re-running is harmless; the recorded
# output shows NLTK reporting both packages as already up-to-date.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt'
# when word_tokenize() is called — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

# Stopwords set (a set rather than a list because preprocess_text() tests
# membership for every token)
stop_words = set(stopwords.words('english'))

# Text preprocessing function with re module
def preprocess_text(text):
    """Normalise a raw description into a space-joined string of content tokens.

    Steps, in order: lowercase, strip punctuation and digits, tokenize with
    NLTK's ``word_tokenize``, drop tokens found in the module-level
    ``stop_words`` set.

    Parameters
    ----------
    text : Any
        The raw description. Anything that is not a ``str`` is treated as
        missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``''`` when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ''  # Return empty string if input is not a string

    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers. `\w` on its own also matches digits and
    # underscores, so `[^\w\s]` left them in place; they are listed explicitly.
    text = re.sub(r'[^\w\s]|[\d_]', '', text)
    # Tokenize with NLTK (not a plain whitespace split)
    words = word_tokenize(text)
    # Remove stopwords
    return ' '.join(word for word in words if word not in stop_words)

# Load datasets, replacing missing values with empty strings as each file is
# read so that no frame is mutated in place afterwards.
train_df = pd.read_csv('train.csv').fillna('')
test_df = pd.read_csv('test.csv').fillna('')
solution_df = pd.read_csv('solution.csv').fillna('')
# Defaults used by filter_descriptions(): minimum description length (in
# characters) and the genre keywords a description must mention.
min_length = 25
keywords = [
    'drama', 'documentary', 'comedy', 'short', 'horror', 'thriller', 'action',
    'western', 'reality-tv', 'family', 'adventure', 'music', 'romance',
    'sci-fi', 'adult', 'crime', 'animation', 'sport', 'talk-show', 'fantasy',
    'mystery', 'musical', 'biography', 'history', 'game-show',
    'news', 'war',
]


def filter_descriptions(df, min_chars=None, terms=None):
    """Keep only rows whose DESCRIPTION is usable and mentions a keyword.

    A row is kept when its ``DESCRIPTION`` value is non-null, is not blank
    after stripping whitespace, is at least ``min_chars`` characters long and
    contains at least one of ``terms`` (case-insensitive substring match).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DESCRIPTION`` column.
    min_chars : int, optional
        Minimum description length. Defaults to the module-level
        ``min_length``, read at call time.
    terms : sequence of str, optional
        Keywords to look for. Defaults to the module-level ``keywords``, read
        at call time. An empty sequence disables the keyword filter.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with their original index labels.
    """
    if min_chars is None:
        min_chars = min_length
    if terms is None:
        terms = keywords

    descriptions = df['DESCRIPTION']
    # Remove empty or null descriptions
    keep = descriptions.notnull() & (descriptions.str.strip() != '')
    # Filter based on length
    keep &= descriptions.str.len() >= min_chars
    # Filter based on keywords. Each keyword is escaped so that regex
    # metacharacters in a term are matched literally instead of breaking
    # (or silently changing) the pattern.
    if terms:
        keyword_pattern = '|'.join(re.escape(term) for term in terms)
        keep &= descriptions.str.contains(keyword_pattern, case=False, na=False)
    return df[keep]
# Preprocess the descriptions of every frame with the same cleaning function
for frame in (train_df, test_df, solution_df):
    frame['DESCRIPTION'] = frame['DESCRIPTION'].astype(str).apply(preprocess_text)

# Print the shapes only once preprocessing has actually run, so the
# "after preprocessing" labels describe what they claim to describe.
# (No rows are dropped here: filter_descriptions() is defined but never applied.)
print(f"Training data shape after preprocessing: {train_df.shape}")
print(f"Testing data shape after preprocessing: {test_df.shape}")
print(f"Solution data shape after preprocessing: {solution_df.shape}")

# TF-IDF vectorization: the vocabulary (unigrams and bigrams, capped at 1000
# terms) is learned from the training descriptions only and then reused for
# the other two frames.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)  # Reduced max_features
X_train_text = tfidf_vectorizer.fit_transform(train_df['DESCRIPTION'])
X_test_text, X_solution_text = (
    tfidf_vectorizer.transform(frame['DESCRIPTION'])
    for frame in (test_df, solution_df)
)

# No extra features are combined yet, so the feature matrices are simply the
# TF-IDF matrices under a second name.
X_train, X_test, X_solution = X_train_text, X_test_text, X_solution_text

# Encode labels: the encoder is fitted on the training genres and applied to
# the genres in solution_df.
# NOTE(review): y_test comes from solution_df while X_test comes from test_df;
# presumably both files list the same rows in the same order — confirm.
le = LabelEncoder()
y_train = le.fit_transform(train_df['GENRE'])
y_test = le.transform(solution_df['GENRE'])



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lekhanaal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lekhanaal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training data shape after preprocessing: (54214, 4)
Testing data shape after preprocessing: (54200, 3)
Solution data shape after preprocessing: (54200, 4)


In [3]:
from sklearn.decomposition import TruncatedSVD
from imblearn.over_sampling import SMOTE
from scipy.sparse import csr_matrix

# Convert the feature matrix to a sparse (CSR) format if it isn't already
X_train_sparse = csr_matrix(X_train_text)

# Apply Truncated SVD for dimensionality reduction (1000 TF-IDF terms at most -> 500 components).
# `svd` is fitted on the training matrix only; any other data must go through
# svd.transform() to land in the same component space.
svd = TruncatedSVD(n_components=500, random_state=42)  # Adjust n_components as needed
X_train_reduced = svd.fit_transform(X_train_sparse)
# Apply SMOTE on the reduced data to oversample the training set; the labels are
# resampled together with the features so the two stay aligned.
# NOTE(review): the MemoryError recorded further down mentions 367551 rows,
# presumably the size of this resampled set — confirm.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_reduced, y_train)


In [4]:
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=250, batch_size=1000)

# Fit on the (SVD-reduced, SMOTE-resampled) training data and then transform it
X_train_res_reduced = ipca.fit_transform(X_train_res)

# Now, transform the test and solution data with the reducers that were fitted
# on the training data. Calling fit_transform() here would refit the PCA on the
# held-out data, giving a different projection from the one the models are
# trained in. The held-out TF-IDF matrices must also pass through the same SVD
# step the training data went through before they reach the PCA.
X_test_reduced = ipca.transform(svd.transform(X_test_text))
X_solution_reduced = ipca.transform(svd.transform(X_solution_text))


MemoryError: Unable to allocate 701. MiB for an array with shape (367551, 250) and data type float64

In [None]:
# Function to evaluate models
def evaluate_model(model, X_test, y_true):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Feature matrix passed to ``model.predict``.
    y_true : array-like
        Ground-truth labels for the rows of ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``; the last three are computed
        with ``average='weighted'``.
    """
    predictions = model.predict(X_test)
    overall_accuracy = accuracy_score(y_true, predictions)
    # Precision, recall and F1 share one call shape, so compute them in a single pass.
    weighted_scores = tuple(
        metric(y_true, predictions, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )
    return (overall_accuracy, *weighted_scores)



In [None]:
import numpy as np

# MultinomialNB rejects negative feature values and the PCA-projected features
# can be negative, so clip them at zero. The clipped data goes into NB-specific
# arrays: overwriting X_train_res / X_test_reduced would silently hand clipped
# features to every model trained in later cells.
# NOTE(review): clipping discards all negative component values; this model is
# probably a poor fit for PCA features — confirm it is still wanted.
X_train_nb = np.clip(X_train_res_reduced, a_min=0, a_max=None)
X_test_nb = np.clip(X_test_reduced, a_min=0, a_max=None)

# Train and evaluate Naive Bayes classifier. Training uses the 250-component
# matrix so that it has the same width as the test features it is scored on.
param_grid_nb = {'alpha': [0.1, 0.5, 1.0]}
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=5, scoring='accuracy')
grid_search_nb.fit(X_train_nb, y_train_res)
nb_model = grid_search_nb.best_estimator_
joblib.dump(nb_model, 'naive_bayes_model.pkl')
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(nb_model, X_test_nb, y_test)

# The clipped training copy is large and only needed for this fit.
del X_train_nb


In [None]:
import numpy as np

# Check for negative values in the training data: prints the smallest entry of
# the SVD-reduced training matrix (the matrix before SMOTE resampling).
print(np.min(X_train_reduced))  # Should be >= 0 for MultinomialNB


In [None]:
from sklearn.naive_bayes import GaussianNB
import joblib

# Train and evaluate GaussianNB classifier.
# Fit on X_train_res_reduced (the 250-component IncrementalPCA output) rather
# than X_train_res (500 SVD components): the model is scored on X_test_reduced,
# which has 250 columns, so training on the wider matrix makes predict() fail
# on a feature-count mismatch.
nb_model = GaussianNB()
nb_model.fit(X_train_res_reduced, y_train_res)
joblib.dump(nb_model, 'gaussian_nb_model.pkl')

# Evaluate the model
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(nb_model, X_test_reduced, y_test)
print(f"Accuracy: {nb_accuracy}")
print(f"Precision: {nb_precision}")
print(f"Recall: {nb_recall}")
print(f"F1 Score: {nb_f1}")


In [None]:
# Train and evaluate Naive Bayes classifier (GaussianNB with a grid search).
# GaussianNB has no `alpha` parameter, so a grid over it makes GridSearchCV
# raise an invalid-parameter error; its smoothing hyperparameter is
# `var_smoothing`.
param_grid_nb = {'var_smoothing': [1e-9, 1e-8, 1e-7]}
grid_search_nb = GridSearchCV(GaussianNB(), param_grid_nb, cv=5, scoring='accuracy')
# Train in the same 250-component space that X_test_reduced lives in.
grid_search_nb.fit(X_train_res_reduced, y_train_res)
nb_model = grid_search_nb.best_estimator_
joblib.dump(nb_model, 'naive_bayes_model.pkl')
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(nb_model, X_test_reduced, y_test)


In [None]:
# Train and evaluate Logistic Regression classifier.
# The grid searches the regularisation strength C and two solvers with 5-fold
# cross-validation, selecting on accuracy.
param_grid_lr = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
grid_search_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=5, scoring='accuracy')
# Fit on the 250-component matrix: X_train_res has 500 columns, which would not
# match the 250-column X_test_reduced used for evaluation below.
grid_search_lr.fit(X_train_res_reduced, y_train_res)
lr_model = grid_search_lr.best_estimator_
joblib.dump(lr_model, 'logistic_regression_model.pkl')
lr_accuracy, lr_precision, lr_recall, lr_f1 = evaluate_model(lr_model, X_test_reduced, y_test)


In [None]:
# Train and evaluate SVM classifier.
# The grid searches C and the kernel (linear / rbf) with 5-fold cross-validation.
# NOTE(review): kernel SVC training scales poorly with sample count and the
# resampled training set is large — confirm this search is feasible as written.
param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='accuracy')
# Fit on the 250-component matrix so the model accepts X_test_reduced (also 250
# columns); X_train_res still has the 500 SVD components.
grid_search_svm.fit(X_train_res_reduced, y_train_res)
svm_model = grid_search_svm.best_estimator_
joblib.dump(svm_model, 'svm_model.pkl')
svm_accuracy, svm_precision, svm_recall, svm_f1 = evaluate_model(svm_model, X_test_reduced, y_test)

In [None]:
# Save the TF-IDF vectorizer and label encoder so that new text can be
# vectorized and predictions decoded with the same fitted objects.
# NOTE(review): the fitted dimensionality reducers are not saved here;
# presumably they are also needed to reuse the saved models — confirm.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(le, 'label_encoder.pkl')

# Print evaluation results. Each line reports the metrics most recently stored
# by the corresponding model cell; the nb_* values are assigned by more than
# one cell above, so they reflect whichever Naive Bayes cell ran last.
print(f"Naive Bayes - Accuracy: {nb_accuracy}, Precision: {nb_precision}, Recall: {nb_recall}, F1-score: {nb_f1}")
print(f"Logistic Regression - Accuracy: {lr_accuracy}, Precision: {lr_precision}, Recall: {lr_recall}, F1-score: {lr_f1}")
print(f"SVM - Accuracy: {svm_accuracy}, Precision: {svm_precision}, Recall: {svm_recall}, F1-score: {svm_f1}")

# Predict genre for a new plot summary
def predict_genre(model, plot_summary, reducers=None):
    """Predict the genre label for a single free-text plot summary.

    The summary goes through the same steps as the held-out data: text
    cleaning, TF-IDF vectorization with the fitted ``tfidf_vectorizer``, then
    each fitted dimensionality reducer in order.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``.
    plot_summary : str
        Raw plot text.
    reducers : sequence, optional
        Fitted transformers applied in order after vectorization. Defaults to
        ``(svd, ipca)``, looked up at call time. The previous code referenced
        an undefined name ``pca`` and skipped the SVD step.

    Returns
    -------
    The decoded genre label for the summary.
    """
    if reducers is None:
        reducers = (svd, ipca)

    features = tfidf_vectorizer.transform([preprocess_text(plot_summary)])
    for reducer in reducers:
        features = reducer.transform(features)
    predicted_genre = model.predict(features)
    return le.inverse_transform(predicted_genre)[0]

# Example usage: run one hand-written summary through each trained model.
# nb_model, lr_model and svm_model are whatever the model cells above last
# assigned to those names.
new_plot_summary = "A young boy discovers he has magical powers and attends a school for wizards."
predicted_genre_nb = predict_genre(nb_model, new_plot_summary)
predicted_genre_lr = predict_genre(lr_model, new_plot_summary)
predicted_genre_svm = predict_genre(svm_model, new_plot_summary)

# Show the three predictions side by side for comparison
print(f"Predicted genre (Naive Bayes): {predicted_genre_nb}")
print(f"Predicted genre (Logistic Regression): {predicted_genre_lr}")
print(f"Predicted genre (SVM): {predicted_genre_svm}")