Testing pulling data from MAL and formatting the data

In [124]:
import pandas as pd 
import pickle
from bs4 import BeautifulSoup
import numpy as np
import requests
from scipy.sparse import load_npz
import re
import ast
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [125]:
with open('Models/prediction_model.pkl', 'rb') as file:
    model = pickle.load(file)

with open('Models/tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

with open('Models/rank_encoder.pkl', 'rb') as file:
    rank_encoder = pickle.load(file)

with open('Models/popularity_encoder.pkl', 'rb') as file:
    popularity_encoder = pickle.load(file)

with open('Models/score_encoder.pkl', 'rb') as file:
    score_encoder = pickle.load(file)

Scrape individual web_ids from MAL

In [126]:
anime_id = '54857'

def scrape_all(anime_id):
    # Construct the URL using the anime ID
    url = f"https://myanimelist.net/anime/{anime_id}"
    
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve data. HTTP status code: {response.status_code}")
        return None
    
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the entire page content as plain text
    page_text = soup.get_text(separator="\n", strip=True)
    
    return page_text


anime_data = scrape_all(anime_id)


In [127]:
anime_list = anime_data.split('\n')

anime_dict = {}

In [128]:
#Get Column
title = anime_list[0].split(' - ')[0]
anime_list.pop(0)

anime_dict['title'] = title

In [129]:
#Get Type, Episodes, English Title, Source
def parse_list(param, anime_list):
    key = param.replace(':', '').strip().lower()
    for i in range(len(anime_list)):
        if anime_list[i].startswith(param):
            anime_dict[key] = anime_list[i + 1].strip() 
            break


In [130]:
parse_list('Type:', anime_list)
parse_list('Episodes:', anime_list)
parse_list('English:', anime_list)
parse_list('Source:', anime_list)



In [131]:
synopsis_list = []

found_index = -1

for i in range(1, len(anime_list)):
    if anime_list[i] == "Synopsis" and "Edit" in anime_list[i - 1]:
        found_index = i
        break

if found_index != -1:
    for i in range(found_index + 1, len(anime_list)):
        if (anime_list[i].startswith("[Written by") or 
            anime_list[i].startswith("Related Entries") or 
            anime_list[i].startswith("Background") or
            anime_list[i].startswith("Edit")):
            break
        
        synopsis_list.append(anime_list[i])

synopsis_list 
combined_synopsis = " ".join((synopsis_list))
anime_dict['synopsis'] = combined_synopsis

In [132]:
genres_types = ['Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love', 
                  'Comedy', 'Drama', 'Ecchi', 'Erotica', 'Fantasy', 'Girls Love', 
                  'Gourmet', 'Hentai', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 
                  'Slice of Life', 'Sports', 'Supernatural', 'Suspense']

found_genres = set()

for entry in anime_list:
    if entry in genres_types:
        found_genres.add(entry)

genres = ", ".join(found_genres)
anime_dict['genres'] = genres

In [133]:
top_studios = [
    "Toei Animation", "Sunrise", "J.C.Staff", "Madhouse", 
    "TMS Entertainment", "Production I.G", "Studio Deen", 
    "Pierrot", "OLM", "Shin-Ei Animation", "A-1 Pictures", 
    "Nippon Animation", "AIC", "DLE", "Tatsunoko Production", "Trigger"
]

found_studios = set()

for entry in anime_list:
    if entry in top_studios:
        found_studios.add(entry)

studios = ", ".join(found_studios)
anime_dict['studios'] = studios

In [134]:
top_producers = ["Aniplex", "TV Tokyo", "Lantis", "Movic", 
                 "AT-X", "Bandai Visual", "Pony Canyon", "Kadokawa", 
                 "Dentsu", "Fuji TV", "NHK", "Sotsu", "KlockWorx", "Kodansha", "Shueisha"]

found_producers = set()
for entry in anime_list:
    if entry in top_producers:
        found_producers.add(entry)

producers = ", ".join(found_producers)
anime_dict['producers'] = producers

In [135]:
if anime_dict.get('episodes') == 'Unknown':
    anime_dict['episodes'] = 12

In [136]:
anime_dict

{'title': 'Re:Zero kara Hajimeru Isekai Seikatsu 3rd Season',
 'type': 'TV',
 'episodes': '16',
 'english': 'Re:ZERO -Starting Life in Another World- Season 3',
 'source': 'Light novel',
 'synopsis': "One year after the events at the Sanctuary, Subaru Natsuki trains hard to better face future challenges. The peaceful days come to an end when Emilia receives an invitation to a meeting in the Watergate City of Priestella from none other than Anastasia Hoshin, one of her rivals in the royal selection. Considering the meeting's significance and the potential dangers Emilia could face, Subaru and his friends accompany her. However, as Subaru reconnects with old associates and companions in Priestella, new formidable foes emerge. Driven by fanatical motivations and engaging in ruthless methods to achieve their ambitions, the new enemy targets Emilia and threaten the very existence of the city. Rallying his allies, Subaru must give his all once more to stop their and nefarious goals from beco

### Now format the set so it can be used for training

In [137]:
df = pd.DataFrame([anime_dict])
df

Unnamed: 0,title,type,episodes,english,source,synopsis,genres,studios,producers
0,Re:Zero kara Hajimeru Isekai Seikatsu 3rd Season,TV,16,Re:ZERO -Starting Life in Another World- Season 3,Light novel,"One year after the events at the Sanctuary, Su...","Fantasy, Drama, Suspense",,"AT-X, Kadokawa"


In [138]:
X_test = df

## Lemmatize and vectorize

In [139]:
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

custom_words = {'and', 'the', 'is', 'a', 'to', 'it', 's', 'like', 'year'}
pattern = r'\b(?:' + '|'.join(re.escape(word) for word in custom_words) + r')\b'

def is_capitalized(word):
    return word[0].isupper() and word.isalpha()

# Too computationally expensive, not used
def is_name(word, pos_tagged):
    for chunk in ne_chunk(pos_tagged):
        if isinstance(chunk, Tree):
            for leaf in chunk.leaves():
                if leaf[0] == word:
                    return True
    return False

def lemmatize_text(text):
    if pd.isnull(text): 
        return text
    words = word_tokenize(text)
    pos_tagged = pos_tag(words)

    lemmatized_words = [
        lemmatizer.lemmatize(word.lower()) for word in words
        if word.lower() not in stop_words and word.lower() not in custom_words and not is_capitalized(word)
    ]
    lemmatized_text = ' '.join(lemmatized_words)

    ## Futher clean anything lemmatization missed, remove spaces and characters
    cleaned_text = re.sub(r'[^\w\s]', '', lemmatized_text)
    cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

In [140]:
X_test['synopsis'] = X_test['synopsis'].apply(lemmatize_text)

In [141]:
synopsis_tfidf = vectorizer.transform(X_test['synopsis'])
tfidf_df = pd.DataFrame(synopsis_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.columns = ['tfidf_' + col for col in tfidf_df.columns]
X_test = pd.concat([X_test, tfidf_df], axis=1)

## Hot encode other columns

In [142]:
for producer in top_producers:
    X_test[f'producer_{producer.replace(" ", "_").lower()}'] = False

for index, row in X_test.iterrows():
    producers_in_row = row['producers']
    
    for producer in top_producers:
        column_name = f'producer_{producer.replace(" ", "_").lower()}'
        if producer in producers_in_row:
            X_test.at[index, column_name] = True

In [143]:
for studio in top_studios:
    X_test[f'studio_{studio.replace(" ", "_").lower()}'] = False

for index, row in X_test.iterrows():
    studios_in_row = [studios.strip() for studios in row['studios'].split(',')]
    
    for studio in top_studios:
        column_name = f'studio_{studio.replace(" ", "_").lower()}'
        if studio in studios_in_row:
            X_test.at[index, column_name] = True

In [144]:
for genre in genres_types:
    X_test[f'Genres_{genre.replace(" ", "_")}'] = False

for index, row in X_test.iterrows():
    genres_in_row = [genre.strip() for genre in row['genres'].split(',')]
    
    for genre in genres_types:
        column_name = f'Genres_{studio}'
        if genre in genres_in_row:
            X_test.at[index, column_name] = True

In [145]:
X_test['source'] = X_test['source'].replace('Unknown', np.nan)
X_test['source'] = X_test['source'].replace('Mixed media', np.nan)
X_test['source'] = X_test['source'].replace('Radio', np.nan)
X_test['source'] = X_test['source'].replace('Card game', 'Game')
X_test['source'] = X_test['source'].replace('Picture book', 'Book')
X_test['source'] = X_test['source'].replace('Other', np.nan)
X_test['source'] = X_test['source'].replace('Web manga', 'Manga')
X_test['source'] = X_test['source'].replace('4-koma manga', 'Manga')
X_test['source'] = X_test['source'].replace('Music', np.nan)
X_test['source'] = X_test['source'].replace('Web novel', 'Book')
X_test['source'] = X_test['source'].replace('Novel', 'Book')

In [146]:
source_columns = ['Source_Book', 'Source_Game', 'Source_Light novel', 'Source_Manga', 'Source_Original', 'Source_Visual novel']
type_columns = ['Types_Movie','Types_Music','Types_ONA','Types_OVA','Types_Special','Types_TV']

In [147]:
for col in source_columns:
    source_type = col.split('_')[-1]  
    X_test[col] = X_test['source'].apply(lambda x: True if isinstance(x, str) and source_type in x else False)

for col in type_columns:
    type_value = col.split('_')[-1]  
    X_test[col] = X_test['type'].apply(lambda x: True if isinstance(x, str) and type_value in x else False)

In [148]:
X_test.drop(columns = ['synopsis', 'source', 'genres', 'studios', 'producers'], inplace = True)

In [149]:
scaler = preprocessing.MinMaxScaler()
X_test[["episodes"]] = scaler.fit_transform(df[["episodes"]])

In [150]:
df1 = pd.read_csv('Data/training_set.csv')
X_test = X_test.reindex(columns=df1.columns, fill_value=0)

In [151]:
X_test.columns

Index(['anime_id', 'title', 'episodes', 'score', 'Popularity_category',
       'Rank_category', 'Genres_Action', 'Genres_Adventure',
       'Genres_Avant Garde', 'Genres_Award Winning',
       ...
       'producer_pony_canyon', 'producer_kadokawa', 'producer_dentsu',
       'producer_fuji_tv', 'producer_nhk', 'producer_sotsu',
       'producer_klockworx', 'producer_kodansha', 'producer_shueisha', 'year'],
      dtype='object', length=239)

In [152]:
X_test.drop(columns = ['title', 'anime_id', 'Popularity_category', 'Rank_category', 'score', 'popularity', 'rank', 'studios', 'year'], inplace=True)

In [153]:
y_pred = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step


In [154]:
y_pred_score = y_pred[0].argmax(axis=1)
y_pred_score = score_encoder.inverse_transform(y_pred_score)

y_pred_pop = y_pred[1].argmax(axis=1)
y_pred_pop = popularity_encoder.inverse_transform(y_pred_pop)

y_pred_rank = y_pred[2].argmax(axis=1)
y_pred_rank = rank_encoder.inverse_transform(y_pred_rank)

In [155]:
y_pred_score, y_pred_pop, y_pred_rank

(array(['8'], dtype=object),
 array(['Top 500'], dtype=object),
 array(['Top 500'], dtype=object))