In [6]:
import pandas as pd
import pickle
import gzip
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load your anime data
anime_data_path = 'csv files/anime_cleaned.csv'
anime_data = pd.read_csv(anime_data_path)

# Ensure anime_data includes an 'anime_id' column
if 'anime_id' not in anime_data.columns:
    raise ValueError("The anime_data DataFrame must include an 'anime_id' column.")

# Create a TF-IDF vectorizer and fit it to the anime genres
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(anime_data['genre'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Save the content-based model and similarity matrix with gzip compression
with gzip.open('content_based_model.pkl.gz', 'wb') as f:
    pickle.dump((vectorizer, cosine_sim, anime_data), f)


In [7]:
import gzip
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def load_content_based_model(file_path):
    with gzip.open(file_path, 'rb') as f:
        vectorizer, cosine_sim, anime_data = pickle.load(f)
    return vectorizer, cosine_sim, anime_data

# Load the model
vectorizer, cosine_sim, anime_data = load_content_based_model('content_based_model.pkl.gz')

# Check if the data was loaded correctly
print(type(vectorizer))
print(cosine_sim.shape)
print(anime_data.head())


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
(12294, 12294)
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie       1.0    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV      64.0    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.25   
3                                   Sci-Fi, Thriller     TV      24.0    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV      51.0    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [8]:
def get_recommendations(user_animes, anime_data, cosine_sim, top_n=10, rating_weight=0.5):
    # Ensure user_animes are valid
    valid_animes = anime_data[anime_data['name'].isin(user_animes)]
    if valid_animes.empty:
        raise ValueError("None of the user-provided anime titles were found in the dataset.")
    
    # Initialize similarity scores
    similar_scores = pd.Series(0, index=anime_data.index)
    
    # Calculate similarity scores for user-provided animes
    for anime in user_animes:
        if anime in anime_data['name'].values:
            idx = anime_data.index[anime_data['name'] == anime].tolist()[0]
            similar_scores += pd.Series(cosine_sim[idx])
    
    # Add similarity scores and ratings to recommendations
    recommendations = anime_data.copy()
    recommendations['similarity'] = similar_scores
    recommendations['rating'] = recommendations['rating'].astype(float)
    
    # Compute combined score
    recommendations['combined_score'] = (recommendations['similarity'] * (1 - rating_weight) + 
                                          recommendations['rating'] * rating_weight)
    
    # Sort by combined score
    recommendations = recommendations.sort_values(by='combined_score', ascending=False)
    
    # Return top N recommendations including anime ID
    top_recommendations = recommendations.head(top_n)[['anime_id', 'name', 'rating']]
    return top_recommendations

# Example usage with loaded model
user_animes = ['Fullmetal Alchemist: Brotherhood', 'Steins;Gate', 'Kimi no Na wa.']
recommendations = get_recommendations(user_animes, anime_data, cosine_sim)
print(recommendations)


       anime_id                                               name  rating
0         32281                                     Kimi no Na wa.    9.37
1          5114                   Fullmetal Alchemist: Brotherhood    9.26
3          9253                                        Steins;Gate    9.17
10464     33662            Taka no Tsume 8: Yoshida-kun no X-Files   10.00
11        28851                                     Koe no Katachi    9.05
7           820                               Ginga Eiyuu Densetsu    9.11
45         4282                     Kara no Kyoukai 5: Mujun Rasen    8.68
15          199                      Sen to Chihiro no Kamikakushi    8.93
5         32935  Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...    9.15
10         4181                               Clannad: After Story    9.06


In [9]:
import gzip
import os

def split_file(input_file, output_dir, chunk_size_mb=24):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Check if the input file exists
    if not os.path.isfile(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")
    
    # Read the content of the original file
    with gzip.open(input_file, 'rb') as f:
        file_content = f.read()
    
    # Split the content into chunks
    chunk_size = chunk_size_mb * 1024 * 1024  # Convert MB to bytes
    num_chunks = (len(file_content) + chunk_size - 1) // chunk_size  # Calculate number of chunks

    for i in range(num_chunks):
        start = i * chunk_size
        end = min(start + chunk_size, len(file_content))
        chunk_data = file_content[start:end]
        
        chunk_file = os.path.join(output_dir, f'chunk_{i:03}.pkl.gz')
        with gzip.open(chunk_file, 'wb') as f:
            f.write(chunk_data)
        
        print(f'Saved {chunk_file} with size {len(chunk_data) / (1024 * 1024):.2f} MB')

# Example usage
input_file = 'content_based_model.pkl.gz'
output_dir = 'content_based_model_chunks'
split_file(input_file, output_dir)


Saved content_based_model_chunks\chunk_000.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_001.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_002.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_003.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_004.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_005.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_006.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_007.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_008.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_009.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_010.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_011.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_012.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\chunk_013.pkl.gz with size 24.00 MB
Saved content_based_model_chunks\c