In [9]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [53]:
def load_imdb_data():
    from keras.datasets import imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data()

    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

    train_reviews = [' '.join([reverse_word_index.get(i - 3, '?') for i in review]) for review in train_data]
    test_reviews = [' '.join([reverse_word_index.get(i - 3, '?') for i in review]) for review in test_data]

    train_df = pd.DataFrame({"review": train_reviews, "label": train_labels})
    test_df = pd.DataFrame({"review": test_reviews, "label": test_labels})

    return pd.concat([train_df, test_df]).reset_index(drop=True)


In [5]:
def get_movie_genres(movie_id):
    ia = IMDb()
    movie = ia.get_movie(movie_id)
    return movie.get('genres', [])

In [56]:
def label_data(row):
    keywords = {
        "funny": ["funny", "hilarious", "comedy", "humor", "laugh"],
        "scary": ["scary", "horror", "terrifying", "frightening", "creepy"],
        "bad": ["bad", "terrible", "awful", "poor", "waste"],
        "good": ["good", "great", "excellent", "amazing", "wonderful"],
        "dramatic": ["dramatic", "emotional", "heartbreaking", "moving", "touching"]
    }

    label_counts = {label: 0 for label in keywords.keys()}
    review_text = row["review"].lower()

    for label, words in keywords.items():
        if any(word in review_text for word in words):
            label_counts[label] += 1
    
    return max(label_counts, key=label_counts.get)

In [7]:
def filter_by_genre(reviews_df, specific_genre):
    filtered_data = reviews_df[reviews_df["movie_id"].apply(lambda x: specific_genre in get_movie_genres(x))]
    return filtered_data

In [67]:
imdb_data = load_imdb_data()

In [68]:
imdb_data['label'] = imdb_data.apply(label_data, axis=1)

In [80]:
imdb_data.drop_duplicates(inplace=True)

In [19]:
# specific_genre = "Horror"
# filtered_data = filter_by_genre(imdb_data, specific_genre)

In [20]:
# Label the data
# filtered_data["labels"] = filtered_data.apply(label_data, axis=1)

In [79]:
# Save the filtered and labeled data to a new CSV file
imdb_data.to_csv("filtered_labeled_data.csv", index=False)

In [81]:
train_df = pd.concat([imdb_data, pd.get_dummies(imdb_data['label'], prefix='label')], axis=1)

In [87]:
train_df.drop(columns=['label'], inplace=True)

In [94]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize text into words
    words = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words back into a string
    text = ' '.join(words)
    
    return text

In [97]:
train_df['review'] = train_df['review'].apply(preprocess_text)

In [98]:
train_df['review'][0]

'film brilliant casting location scenery story direction everyone really suited part played could imagine robert redford amazing actor director norman father came scottish island loved fact real connection film witty remark throughout film great brilliant much bought film soon released retail would recommend everyone watch fly fishing amazing really cried end sad know say cry film must good definitely also congratulation two little boy played part norman paul brilliant child often left praising list think star play grown big profile whole film child amazing praised done think whole story lovely true someone life shared u'

In [105]:
# Define the number of classes
num_classes = 5

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['review'])
sequences = tokenizer.texts_to_sequences(train_df['review'])

# Pad the sequences to have the same length
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Convert the labels to one-hot encoding
labels = pd.get_dummies(train_df[['label_bad', 'label_dramatic', 'label_funny', 'label_good', 'label_scary']])

# Split the data into training and testing sets
train_size = int(0.8 * len(train_df))
train_data = padded_sequences[:train_size]
train_labels = labels[:train_size].values
test_data = padded_sequences[train_size:]
test_labels = labels[train_size:].values

# Define the model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=32))
model.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_data, train_labels, validation_split=0.1, epochs=5, batch_size=128)

# Evaluate the model
loss, accuracy = model.evaluate(test_data, test_labels)
print("Test accuracy: {:.2f}%".format(accuracy * 100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 93.73%


In [110]:
from sklearn.metrics import classification_report

# Get the model's output probabilities for each class
probs = model.predict(test_data)

# Get the predicted class for each sample
pred_classes = np.argmax(probs, axis=1)

# Get the true class for each sample
true_classes = np.argmax(test_labels, axis=1)

# Get the classification report for each class
target_names = ['bad', 'dramatic', 'funny', 'good', 'scary']
print(classification_report(true_classes, pred_classes, target_names=target_names))


              precision    recall  f1-score   support

         bad       0.98      0.98      0.98      2187
    dramatic       0.00      0.00      0.00       139
       funny       0.89      0.99      0.94      4449
        good       0.98      0.84      0.90      2310
       scary       0.99      0.96      0.97       831

    accuracy                           0.94      9916
   macro avg       0.77      0.75      0.76      9916
weighted avg       0.93      0.94      0.93      9916



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [112]:
model2 = Sequential()
model2.add(Embedding(input_dim=10000, output_dim=32))
model2.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2))
model2.add(Dense(units=num_classes, activation='softmax'))
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the class weights
# Convert the labels to one-hot encoding
label_names = ['label_bad', 'label_dramatic', 'label_funny', 'label_good', 'label_scary']
labels = pd.get_dummies(train_df[label_names])
class_weights = {i: 1 for i in range(num_classes)}
class_weights[label_names.index('label_dramatic')] = 2
# Train the model with the class weights
model2.fit(train_data, train_labels, validation_split=0.1, epochs=5, batch_size=128, class_weight=class_weights)

# Evaluate the model
loss, accuracy = model2.evaluate(test_data, test_labels)
print("Test accuracy: {:.2f}%".format(accuracy * 100))

# Calculate the per-class metrics
probs = model2.predict(test_data)
pred_classes = np.argmax(probs, axis=1)
target_names = ['bad', 'dramatic', 'funny', 'good', 'scary']
print(classification_report(np.argmax(test_labels, axis=1), pred_classes, target_names=target_names))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 96.75%
              precision    recall  f1-score   support

         bad       0.99      0.98      0.98      2187
    dramatic       0.50      0.01      0.01       139
       funny       0.95      0.99      0.97      4449
        good       0.98      0.97      0.98      2310
       scary       0.97      0.97      0.97       831

    accuracy                           0.97      9916
   macro avg       0.88      0.78      0.78      9916
weighted avg       0.96      0.97      0.96      9916

