# This is the notebook where the Word2Vec (w2v) model is trained :)

### importing modules

In [120]:
import re
import ast
import nltk
import gensim
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [121]:
# Fetch the NLTK resources used later in this notebook: the stopword lists
# (read via stopwords.words('english')) and the 'punkt' tokenizer data
# (presumably what word_tokenize relies on).
# NOTE(review): newer NLTK releases may need 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### loading and preparing data

In [122]:
# Load the preprocessed anime dataset.
# NOTE(review): this is an absolute path on one specific Windows machine, so the
# notebook will not run elsewhere without editing it; consider a configurable
# data directory.
anidf = pd.read_csv(r"C:\Users\HP\Desktop\Anirec\Anirec\data\preprocessed_ani_data.csv")

In [123]:
anidf[['synopsis', 'genres', 'demographic']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14740 entries, 0 to 14739
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   synopsis     14740 non-null  object
 1   genres       14740 non-null  object
 2   demographic  5115 non-null   object
dtypes: object(3)
memory usage: 345.6+ KB


In [124]:
anidf[['synopsis', 'genres', 'demographic']].head()

Unnamed: 0,synopsis,genres,demographic
0,Yabuki Joe is left downhearted and hopeless af...,"['Historical', 'Sports', 'Josei', 'Drama', 'Ad...",Shounen
1,"Ghostly, primordial beings known as Mushi cont...","['Supernatural', 'Slice of Life', 'Action', 'A...",Seinen
2,Following the conclusion of the large-scale co...,"['Historical', 'Sports', 'Action', 'Drama', 'S...",Seinen
3,Young Thorfinn grew up listening to the storie...,"['Fantasy', 'Historical', 'Adventure', 'Action...",Seinen
4,"Crime is timeless. By the year 2071, humanity ...","['Seinen', 'Action', 'Award Winning', 'Science...",


In [125]:
def convert_to_listg(x):
    """Turn a stringified list of genres into one comma-separated string.

    Despite the name, the result is a string, not a list:
    "['Action', 'Drama']" becomes 'Action, Drama'.

    Parameters
    ----------
    x : str
        Text holding a Python literal, normally a list of genre names.

    Returns
    -------
    str or float
        The genres joined with ', '. A literal that is already a single string
        is returned unchanged. ``np.nan`` is returned when ``x`` cannot be
        parsed as a literal or its elements are not all strings.
    """
    try:
        parsed = ast.literal_eval(x)
        # A bare string literal is iterable, so joining it would interleave
        # ', ' between its characters ("A, c, t, i, o, n"); keep it whole.
        if isinstance(parsed, str):
            return parsed
        return ', '.join(parsed)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable text, non-string input (e.g. NaN) or non-string elements.
        return np.nan

anidf['genres'] = anidf['genres'].apply(convert_to_listg)

In [126]:
def convert_to_list_d(x):
    """Format a demographic value as a ', <value>' suffix string.

    Despite the name, the result is a string, not a list.

    Parameters
    ----------
    x : str, iterable of str, float, None or pd.NA
        The demographic cell. Floats (which is how NaN shows up in an object
        column), ``None`` and ``pd.NA`` are all treated as "missing".

    Returns
    -------
    str
        ``', '`` followed by the value (an iterable of strings is concatenated
        without a separator), or a single space when the value is missing.
    """
    # isinstance also catches float subclasses such as np.float64, which an
    # exact type comparison lets through to str.join() where they raise.
    if x is None or x is pd.NA or isinstance(x, float):
        return ' '
    return ', ' + ''.join(x)
    
anidf['demographic'] = anidf['demographic'].apply(convert_to_list_d)

In [127]:
# Replace any remaining missing demographics with a single space and force str.
# NOTE(review): convert_to_list_d already returns ' ' for float (NaN) inputs, so
# this looks like a defensive no-op after the previous cell — confirm before
# removing.
anidf['demographic'] = anidf['demographic'].fillna(' ').astype(str)

In [128]:
anidf[['genres', 'demographic']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14740 entries, 0 to 14739
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   genres       14740 non-null  object
 1   demographic  14740 non-null  object
dtypes: object(2)
memory usage: 230.4+ KB


In [129]:
anidf[['genres', 'demographic', 'synopsis']].head()

Unnamed: 0,genres,demographic,synopsis
0,"Historical, Sports, Josei, Drama, Adventure",", Shounen",Yabuki Joe is left downhearted and hopeless af...
1,"Supernatural, Slice of Life, Action, Adventure...",", Seinen","Ghostly, primordial beings known as Mushi cont..."
2,"Historical, Sports, Action, Drama, Shounen, Ma...",", Seinen",Following the conclusion of the large-scale co...
3,"Fantasy, Historical, Adventure, Action, Drama,...",", Seinen",Young Thorfinn grew up listening to the storie...
4,"Seinen, Action, Award Winning, Science Fiction...",,"Crime is timeless. By the year 2071, humanity ..."


In [130]:
def combine_text(row):
    """Build one text blob from a row's synopsis, genres and demographic.

    Each of the three fields is looked up on ``row`` by name; a missing value
    (as judged by ``pd.notna``) contributes an empty string. The fields are
    separated by single spaces and the result ends with a full stop.
    """
    synopsis, genres, demographic = (
        row[column] if pd.notna(row[column]) else ''
        for column in ('synopsis', 'genres', 'demographic')
    )
    return f"{synopsis} {genres} {demographic}."

In [131]:
anidf['combined_text'] = anidf.apply(combine_text, axis=1)

In [132]:
anidf['combined_text']

0        Yabuki Joe is left downhearted and hopeless af...
1        Ghostly, primordial beings known as Mushi cont...
2        Following the conclusion of the large-scale co...
3        Young Thorfinn grew up listening to the storie...
4        Crime is timeless. By the year 2071, humanity ...
                               ...                        
14735    Queen Mandora ruled Carcosa Wonderland, becaus...
14736    Jing-Ju Cats is a series filled with comedy, m...
14737    The main character is a little sheep-looking m...
14738    Kenichi is a 1st year middle school student wh...
14739    70 million years ago dinosaurs ruled the Korea...
Name: combined_text, Length: 14740, dtype: object

In [133]:
# Keep only the combined text; from here on `anidf` is a Series, not a DataFrame.
# NOTE(review): because the name is rebound, this cell is not idempotent —
# running it a second time (or re-running an earlier column-based cell) without
# reloading the CSV will fail or behave differently.
anidf = anidf['combined_text']

In [134]:
anidf.sample(5)

14667    Short stories with no excitement, no punchline...
9707     Hiroshi Zenno is an average high school studen...
12637    An educational film about teamwork in Japan. I...
2967     Nagisa Aoi begins her new school life as a tra...
2295     Despite having seemingly quelled the war betwe...
Name: combined_text, dtype: object

In [135]:
# Patterns are compiled once. Raw strings are used because '\d' and '\w' are
# invalid escape sequences in ordinary string literals and trigger a warning.
_DIGITS_RE = re.compile(r'\d+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stopwords as a frozenset, reading NLTK only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise a piece of text and split it into stopword-free tokens.

    Steps, in order: lowercase, delete digit runs, delete every character that
    is neither a word character nor whitespace, tokenize with ``word_tokenize``,
    then drop English stopwords.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove numbers
    text = _DIGITS_RE.sub('', text)

    # Remove punctuation
    # NOTE(review): this runs before the stopword filter, so a contraction such
    # as "don't" becomes "dont" and presumably no longer matches the stopword
    # list — confirm whether that is intended.
    text = _PUNCTUATION_RE.sub('', text)

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords. The set is fetched once per call instead of once per
    # token, and set membership replaces a linear scan of the list.
    stop_words = _english_stopwords()
    return [word for word in words if word not in stop_words]

In [136]:
# Tokenize every combined text; `corpus` is a list with one token list per row,
# which is the sentence format passed to build_vocab()/train() below.
corpus = anidf.apply(preprocess_text).tolist()

### building and training the word2vec embedding model

In [137]:
# Configure the Word2Vec model. No corpus is passed here, so the vocabulary is
# built and the model trained by the explicit build_vocab()/train() calls in the
# following cells. Parameter notes follow the gensim Word2Vec documentation.
# NOTE(review): with workers > 1 gensim training is presumably not bit-for-bit
# reproducible between runs — confirm if exact reproducibility matters.
model = Word2Vec(
    window=6,          # maximum distance between the current and predicted word
    workers=8,         # number of training threads
    vector_size=150,   # dimensionality of the word vectors
    min_count=2,       # ignore words that occur fewer than 2 times in total
    sg=1,              # 1 = skip-gram (0 would be CBOW)
    sample=0.004       # down-sampling threshold for high-frequency words
)

In [138]:
# Build the vocabulary from the tokenized corpus before training;
# progress_per=1000 presumably sets how often (in sentences) progress is logged.
model.build_vocab(corpus, progress_per=1000)

In [139]:
# Train for 10 passes over the corpus; total_examples is the number of token
# lists in one pass. The tuple shown in the cell output is presumably gensim's
# (effective words trained, raw words seen) pair — see the gensim docs.
model.train(corpus, total_examples=len(corpus), epochs=10)

(6998980, 7246340)

**Manually testing the model**

In [140]:
model.wv.most_similar('pokemon')

[('zeraora', 0.7912029027938843),
 ('pikachu', 0.7626437544822693),
 ('meowth', 0.7567713856697083),
 ('piplup', 0.7548236846923828),
 ('pichu', 0.7404292225837708),
 ('poké', 0.7351683974266052),
 ('pokémon', 0.7280144691467285),
 ('wobbuffet', 0.7261454463005066),
 ('trainers', 0.7220701575279236),
 ('anpanman', 0.7193007469177246)]

In [141]:
model.wv.most_similar(positive=['pirate', 'straw', 'hats'], negative=['girl'])

[('luffy', 0.7926630973815918),
 ('pirates', 0.7568743228912354),
 ('usopp', 0.7533671259880066),
 ('hat', 0.7481371164321899),
 ('sanji', 0.7412232756614685),
 ('arlong', 0.7410892844200134),
 ('nami', 0.7227644324302673),
 ('zoro', 0.7186127305030823),
 ('sunken', 0.705796480178833),
 ('voyage', 0.7039783000946045)]

In [142]:
model.wv.doesnt_match(['flame', 'magic', 'bowl', 'book', 'mana'])

'bowl'

In [143]:
# Export the trained word vectors in the binary word2vec format.
# NOTE(review): this writes only `model.wv` (the vectors), not the full model,
# so presumably training cannot be resumed from this file. The destination is an
# absolute path on one specific Windows machine; consider a configurable data
# directory.
model.wv.save_word2vec_format(r"C:\Users\HP\Desktop\Anirec\Anirec\data\word2vec_model.bin", binary=True)