## Follow the same logic as the Movie Classifier Demo done in class.

- Tokenize text using spacy.
- Download the Word2Vec Model
- Vectorize all words in each review.
- Calculate mean vector of the reviews
- Train a Neural Network for classification
- Test the trained neural network with a few examples.

In [21]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import euclidean, cosine
from google.colab import userdata
# Read the Hugging Face access token from Colab's user secrets (key 'HF_ACCESS_TOKEN')
# so the token itself never appears in the notebook.
hf_access_token = userdata.get('HF_ACCESS_TOKEN')

In [22]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub using the token loaded earlier.
login(token=hf_access_token)


In [23]:
# Load the assignment dataset (it provides the 'genre' and 'description' columns).
df = pd.read_csv('assignment_1.4.csv')

print(df.head())

# Keep the two columns as standalone Series for the steps below.
genres, descriptions = df['genre'], df['description']
print(genres[:5], descriptions[:5])
print(len(genres), len(descriptions))


       genre                                        description
0    horror    When six friends fly off on a weekend getaway...
1    horror    The story is about a young girl who was touch...
2   romance    A young woman named Anna has always longed fo...
3    horror    A London couple moves to a large country hous...
4   romance    The younger of two sibling executives has usu...
0      horror 
1      horror 
2     romance 
3      horror 
4     romance 
Name: genre, dtype: object 0     When six friends fly off on a weekend getaway...
1     The story is about a young girl who was touch...
2     A young woman named Anna has always longed fo...
3     A London couple moves to a large country hous...
4     The younger of two sibling executives has usu...
Name: description, dtype: object
1340 1340


In [24]:
# Build a TF-IDF vector for every description.
vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary and vectorizes the descriptions in one pass
# (equivalent to fit() followed by transform() on the same data).
descriptions_tf_idf_vectors = vectorizer.fit_transform(descriptions)
tf_idf_indexes = vectorizer.get_feature_names_out()

# Report only the vocabulary size: printing the full vocabulary_ dict would dump
# every term/index pair into the cell output.
print(len(vectorizer.vocabulary_))

# Densify once and reuse the array for both the shape check and the new column.
dense_tf_idf = descriptions_tf_idf_vectors.toarray()
print(dense_tf_idf.shape)
df = df.assign(descriptions_tf_idf_vectors=list(dense_tf_idf))
df.head()


14560
(1340, 14560)


Unnamed: 0,genre,description,descriptions_tf_idf_vectors
0,horror,When six friends fly off on a weekend getaway...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,horror,The story is about a young girl who was touch...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,romance,A young woman named Anna has always longed fo...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,horror,A London couple moves to a large country hous...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,romance,The younger of two sibling executives has usu...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Mean Vectors of Genres

In [25]:
# Calculate the mean TF-IDF vector of each genre by averaging the vectors of
# all descriptions that carry that genre.
genre_mean_vectors = {
    genre_name: np.mean(
        df.loc[df['genre'] == genre_name, 'descriptions_tf_idf_vectors'].to_list(),
        axis=0,
    )
    for genre_name in genres.unique()
}
print(genre_mean_vectors)

{' horror ': array([0.00017337, 0.00084984, 0.00048822, ..., 0.00018645, 0.        ,
       0.        ]), ' romance ': array([0.        , 0.00062289, 0.00109939, ..., 0.        , 0.00020427,
       0.00011158])}


In [26]:
# Cosine similarity and Euclidean distance between the two genre mean vectors.
# The dictionary keys keep the surrounding spaces found in the genre column.
horror_mean = genre_mean_vectors[" horror "]
romance_mean = genre_mean_vectors[" romance "]

# scipy's cosine() is a distance, so the similarity is its complement.
cosine_similarity = 1 - cosine(horror_mean, romance_mean)
euclidean_distance = euclidean(horror_mean, romance_mean)
print("Eucledian distance between mean vectors of genres: ",euclidean_distance)
print("Cosine similarity between mean vectors of genres ",cosine_similarity)

Eucledian distance between mean vectors of genres:  0.12026956778144632
Cosine similarity between mean vectors of genres  0.8785933152539191


In [27]:
#!python -m spacy download en_core_web_lg

In [28]:
# imports for Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from tqdm import tqdm
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
# Load the spaCy pipeline; `nlp` is used to tokenize descriptions
# in description_to_vector and predict_genre.
nlp = spacy.load("en_core_web_lg")

# Fetch the NLTK resources: the stopword lists, the punkt tokenizer data and WordNet.
# clean_data calls stopwords.words, nltk.word_tokenize and WordNetLemmatizer.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
# Encode the genre as a binary label (romance -> 0, horror -> 1).
# The keys keep the surrounding spaces because that is how the values appear in the data.
df["genre_label"] = df["genre"].map({" romance ": 0, " horror ": 1})

# .map() silently yields NaN for any genre missing from the dict; fail loudly here
# instead of passing NaN labels on to training.
unmapped_genres = df.loc[df["genre_label"].isna(), "genre"].unique()
assert len(unmapped_genres) == 0, f"Unmapped genre values: {unmapped_genres}"
print(df.head())

       genre                                        description  \
0    horror    When six friends fly off on a weekend getaway...   
1    horror    The story is about a young girl who was touch...   
2   romance    A young woman named Anna has always longed fo...   
3    horror    A London couple moves to a large country hous...   
4   romance    The younger of two sibling executives has usu...   

                         descriptions_tf_idf_vectors  genre_label  
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...            1  
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...            1  
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...            0  
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...            1  
4  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...            0  


In [30]:
# Download the pretrained Google News word2vec binary from the Hugging Face Hub
# and load it as gensim KeyedVectors.
repo_id = "NathaNn1111/word2vec-google-news-negative-300-bin"
filename = "GoogleNews-vectors-negative300.bin"
model_path = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
)
word2vec = KeyedVectors.load_word2vec_format(
    model_path,
    binary=True,
)

In [31]:
def clean_data(desc):
    """Normalize a description: lowercase, drop stopwords, strip punctuation, lemmatize.

    Args:
        desc: Raw description text.

    Returns:
        The remaining lemmatized tokens joined by single spaces.

    Note:
        Stopwords are filtered on whitespace-split words *before* punctuation is
        removed, so a stopword with punctuation attached (e.g. "the,") is kept.
    """
    # stopwords.words() returns a list; a set gives O(1) membership tests instead
    # of a linear scan for every word of the description.
    stop_words = set(stopwords.words('english'))
    kept_words = [w for w in desc.lower().split() if w not in stop_words]

    # Delete every character listed in string.punctuation in a single pass.
    no_punct = " ".join(kept_words).translate(str.maketrans('', '', punctuation))

    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(no_punct)
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [32]:


# Function to create the mean word2vec vector for a description
def description_to_vector(description):
    """Embed a description as the average of its tokens' word2vec vectors.

    Relies on the module-level ``nlp`` pipeline and ``word2vec`` vectors. Only
    alphabetic tokens (lowercased) that exist in ``word2vec`` contribute.

    Args:
        description: Text to embed.

    Returns:
        The element-wise mean of the matched word vectors, or a zero vector of
        length ``word2vec.vector_size`` when no token is found in the vocabulary.
    """
    embeddings = []
    for token in nlp(description):
        if not token.is_alpha:
            continue
        word = token.text.lower()
        if word in word2vec:
            embeddings.append(word2vec[word])

    # No known word at all: fall back to an all-zero vector of the right size.
    if not embeddings:
        return np.zeros(word2vec.vector_size)
    return np.mean(embeddings, axis=0)

In [33]:
# Embed every description as its mean word2vec vector, showing a progress bar.
tqdm.pandas()  # enables progress_apply on pandas objects
description_vectors = df['description'].progress_apply(description_to_vector)
df['vector'] = description_vectors

100%|██████████| 1340/1340 [00:42<00:00, 31.53it/s]


In [34]:
df.columns

Index(['genre', 'description', 'descriptions_tf_idf_vectors', 'genre_label',
       'vector'],
      dtype='object')

In [35]:
# Report whether TensorFlow can see at least one GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is available" if gpu_devices else "GPU is not available")

GPU is available


In [36]:
#Train the model for genre classification

# Features: one mean word2vec vector per description. Target: the binary genre label.
X = np.stack(df['vector'].values)
y = df['genre_label'].values

#Split the data (20% held out for testing, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Simple neural network model
# The input is declared with an explicit Input layer: passing input_shape to the
# first Dense layer is discouraged by Keras and emits a UserWarning.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    # Single sigmoid unit: the output is used as the probability of label 1.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [37]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train the model, holding out 20% of the training data for validation
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=32,
    validation_split=0.2,
)

# Evaluate the model: threshold the predicted scores at 0.5 to get class labels
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



Epoch 1/200
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 107ms/step - accuracy: 0.5699 - loss: 0.6831 - val_accuracy: 0.9070 - val_loss: 0.6175
Epoch 2/200
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.7919 - loss: 0.5981 - val_accuracy: 0.9302 - val_loss: 0.4400
Epoch 3/200
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8991 - loss: 0.4157 - val_accuracy: 0.9395 - val_loss: 0.2605
Epoch 4/200
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9150 - loss: 0.2746 - val_accuracy: 0.9349 - val_loss: 0.1880
Epoch 5/200
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9389 - loss: 0.2266 - val_accuracy: 0.9442 - val_loss: 0.1659
Epoch 6/200
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9230 - loss: 0.2232 - val_accuracy: 0.9442 - val_loss: 0.1490
Epoch 7/200
[1m27/27[0m [32

In [38]:
# Function to preprocess and predict genre for a new description
def predict_genre(description, model, word2vec, nlp):
    """Predict whether a description is horror or romance.

    Args:
        description: Text to classify.
        model: Object whose ``predict`` takes a (1, n) array and returns a nested
            result whose ``[0][0]`` entry is the score for "horror".
        word2vec: Word-vector lookup supporting ``in``, indexing and ``vector_size``.
        nlp: Callable that tokenizes text into tokens with ``text`` and ``is_alpha``.

    Returns:
        A ``(genre, confidence)`` tuple: genre is "horror" when the score is above
        0.5, otherwise "romance"; confidence is the score of the chosen genre.
    """
    # Collect the vectors of all alphabetic, in-vocabulary tokens (lowercased).
    embeddings = []
    for token in nlp(description):
        if not token.is_alpha:
            continue
        word = token.text.lower()
        if word in word2vec:
            embeddings.append(word2vec[word])

    # Average them; with no known word, use an all-zero vector instead.
    if embeddings:
        features = np.mean(embeddings, axis=0)
    else:
        features = np.zeros(word2vec.vector_size)

    # The model expects a batch, so reshape the single vector to (1, n).
    horror_score = model.predict(features.reshape(1, -1))[0][0]
    if horror_score > 0.5:
        return "horror", horror_score
    return "romance", 1 - horror_score

# Example descriptions for inference
# these samples were removed from the original .csv file
example_desc = [
    " In a small college in North Carolina, only a select few students are left to take mid terms. But, when a killer strikes, it could be everyone's final exam.",
    " The story evolves between Keung and Bao who are in their 10th years marriage. It followed by relationship with their own families and friends. They had numerous disagreements and conflicts which leading to divorce state. Are they willing to give each other a chance to be together ?",
    # The space after the opening triple quote keeps the title's own opening double
    # quote from being absorbed by the string delimiter.
    """ "Between the Cup and the Lip" is a beautiful polish turn century drama, which is basen on Maria Rodziewiczowna's novel. The action takes place in Berlin, where we meet Count Wentzel, who falls for a beautiful unknown lady. Since then he tries to get close to her, but as we know, a lot of things can happen 'between the cup and the lip'.""",
    " An exploration of what happens when human curiosity and animal rage collide. This low-budget, high-concept thriller presents a different kind of antagonist, and the same well-meaning-but-doomed kids you love to cheer for as they meet their grisly ends.",
]

# Run inference on the example descriptions
for description in example_desc:
    genre, confidence = predict_genre(description, model, word2vec, nlp)
    print(f"Description: {description}\nPredicted genre: {genre} (Confidence: {confidence:.2f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
Description:  In a small college in North Carolina, only a select few students are left to take mid terms. But, when a killer strikes, it could be everyone's final exam.
Predicted genre: horror (Confidence: 0.99)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Description:  The story evolves between Keung and Bao who are in their 10th years marriage. It followed by relationship with their own families and friends. They had numerous disagreements and conflicts which leading to divorce state. Are they willing to give each other a chance to be together ?
Predicted genre: romance (Confidence: 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Description: Between the Cup and the Lip" is a beautiful polish turn century drama, which is basen on Maria Rodziewiczowna's novel. The action takes place in Berlin, where we meet Count Wentzel, who falls for a beautiful unknow

In [40]:
# In a small college in North Carolina, only a select few students are left to take mid terms. But, when a killer strikes, it could be everyone's final exam.

# The story evolves between Keung and Bao who are in their 10th years marriage. It followed by relationship with their own families and friends. They had numerous disagreements and conflicts which leading to divorce state. Are they willing to give each other a chance to be together ?

# "Between the Cup and the Lip" is a beautiful polish turn century drama, which is basen on Maria Rodziewiczowna's novel. The action takes place in Berlin, where we meet Count Wentzel, who falls for a beautiful unknown lady. Since then he tries to get close to her, but as we know, a lot of things can happen 'between the cup and the lip'.

# An exploration of what happens when human curiosity and animal rage collide. This low-budget, high-concept thriller presents a different kind of antagonist, and the same well-meaning-but-doomed kids you love to cheer for as they meet their grisly ends.

# labels = horror,romance,romance,horror

In [42]:
# Persist the trained classifier to disk.
# NOTE(review): '/content' is an absolute path (presumably the Colab working directory);
# confirm before running elsewhere.
# NOTE(review): the '.h5' extension selects Keras's legacy HDF5 format; consider the
# native '.keras' format if nothing downstream depends on this file name.
model.save('/content/my_model.h5')

