# 1. Data Preprocessing

In [102]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r"C:\Users\sai\OneDrive\Desktop\anime.csv")

# Handle missing values
df.fillna(df.select_dtypes(include=[np.number]).mean(), inplace=True)

# Explore the dataset
print(df.head())
print(df.info())
print(df.describe())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  

# Task 2: Feature Extraction

In [104]:
from sklearn.preprocessing import LabelEncoder

# Convert categorical features into numerical representations
le = LabelEncoder()
df['genre'] = le.fit_transform(df['genre'])

# Normalize numerical features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['No. of Episodes', 'Average Rating', 'Community Members']] = scaler.fit_transform(df[['No. of Episodes', 'Average Rating', 'Community Members']])

KeyError: "None of [Index(['No. of Episodes', 'Average Rating', 'Community Members'], dtype='object')] are in the [columns]"

# Task 3: Recommendation System

In [90]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime(target_anime, df_features):
    # Calculate cosine similarity between the target anime and all other anime
    similarity_matrix = cosine_similarity(df_features)

    # Get the index of the target anime
    target_index = df_features.index.get_loc(target_anime)

    # Get the similarity scores for the target anime
    similarity_scores = list(enumerate(similarity_matrix[target_index]))

    # Sort the similarity scores in descending order
    similarity_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the top 10 most similar anime (excluding the target anime itself)
    top_similar_anime = [df_features.index[anime] for anime, score in similarity_scores[1:11]]

    return top_similar_anime

# Example usage:
target_anime = 'Fullmetal Alchemist: Brotherhood'
df_features = pd.read_csv(r"C:\Users\sai\OneDrive\Desktop\anime.csv")
if target_anime in df_features.index:
    recommended_anime = recommend_anime(target_anime, df_features)
    print(recommended_anime)
else:
    print(f"The anime '{target_anime}' is not found in the database.")

The anime 'Fullmetal Alchemist: Brotherhood' is not found in the database.


# Task 4: Evaluation

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Compute precision, recall, and F1-score
precision = []
recall = []
f1 = []
for threshold in [0.3, 0.5, 0.7]:
    recommended_anime = []
    for anime in test_df['title']:
        recommended_anime.extend(recommend_anime(anime, df_features, threshold))
    recommended_anime = list(set(recommended_anime))
    
    true_positives = len(set(recommended_anime) & set(test_df['title']))
    false_positives = len(set(recommended_anime) - set(test_df['title']))

KeyError: 'title'

In [108]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer

# Load dataset
file_path = r"C:\Users\sai\OneDrive\Desktop\anime.csv"
df = pd.read_csv(file_path)

# Explore the dataset
print(df.head())
print(df.info())

# Data Preprocessing
# Handle missing values
df.fillna({'genre': 'Unknown', 'number_of_episodes': 0, 'average_rating': 0, 'number_of_community_members': 0}, inplace=True)

# Feature Extraction
# Convert categorical features to numerical features
features = ['genre', 'number_of_episodes', 'average_rating', 'number_of_community_members']
target = 'title'

# One-Hot Encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('genre', OneHotEncoder(), ['genre']),
        ('num', StandardScaler(), ['number_of_episodes', 'average_rating', 'number_of_community_members'])
    ],
    remainder='passthrough'
)

X = df[features]
X_processed = preprocessor.fit_transform(X)

# Create a DataFrame for the processed features
df_processed = pd.DataFrame(X_processed, columns=preprocessor.get_feature_names_out())

# Compute cosine similarity
cosine_sim = cosine_similarity(df_processed)

# Recommendation function
def recommend_anime(target_title, top_n=5):
    if target_title not in df[target].values:
        return f"Anime '{target_title}' not found in the dataset."
    
    # Find the index of the target anime
    idx = df.index[df[target] == target_title].tolist()[0]
    
    # Get the similarity scores for the target anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the anime based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top_n most similar anime
    sim_scores = sim_scores[1:top_n+1]  # Skip the first score because it's the target anime itself
    
    anime_indices = [i[0] for i in sim_scores]
    similar_anime = df.iloc[anime_indices][target].tolist()
    
    return similar_anime

# Example usage
print(recommend_anime('Naruto', top_n=5))



   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  

KeyError: "['number_of_episodes', 'average_rating', 'number_of_community_members'] not in index"