This notebook builds a KNN (k-nearest neighbours) recommender.

In [4]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# --- Configuration ---
# Path to the main dataset, assembled with os.path.join so the separator
# matches the host operating system.
GLOBAL_CSV_PATH = os.path.join("Datasets", "Holy_Dataset", "anilist_anime_data_complete.csv")

# 1. Load Data
# Fail fast when the CSV is missing. Only printing a warning would let this
# cell finish "successfully" while `df_global` stays undefined (or stale from
# an earlier kernel run), so the real problem would surface later, in an
# unrelated cell.
if not os.path.exists(GLOBAL_CSV_PATH):
    print("‚ùå File not found. Please check the path.")
    raise FileNotFoundError(f"Dataset CSV not found at: {GLOBAL_CSV_PATH!r}")

df_global = pd.read_csv(GLOBAL_CSV_PATH)
print(f"‚úÖ Loaded Data. Shape: {df_global.shape}")

‚úÖ Loaded Data. Shape: (20099, 62)


This model looks for "statistically similar" shows: it compares only numeric attributes, not genre or plot. For example, it will match a popular, high-rated, long-running show with other popular, high-rated, long-running shows, regardless of whether one is about ninjas and the other is about pirates.

In [5]:
# 2. Select & Clean Numerical Features
# Only purely statistical metrics are kept as model inputs.
numeric_cols = ['averageScore', 'popularity', 'episodes', 'duration', 'seasonYear']

# Take an independent copy of just those columns so df_global is left untouched.
knn_df = df_global[numeric_cols].copy()

# Handle Missing Values
# Every gap is filled with its own column's median; the median is safer than
# the mean for skewed data (like episode counts).
knn_df = knn_df.fillna(knn_df.median())

print("‚úÖ Features selected and cleaned.")
print(knn_df.head(3))

‚úÖ Features selected and cleaned.
   averageScore  popularity  episodes  duration  seasonYear
0          62.0        2046       1.0      38.0      1962.0
1          57.0         409      96.0      25.0      1965.0
2          61.0        2449      52.0      23.0      1965.0


In [6]:
# 3. Standard Scaling
# Learn each column's mean and standard deviation, so the transformed data
# has mean 0 and standard deviation 1 per feature.
scaler = StandardScaler()
scaler.fit(knn_df)

# Apply the learned scaling; the dataframe comes back as a NumPy matrix.
knn_matrix = scaler.transform(knn_df)

print(f"‚úÖ Data Scaled. Matrix Shape: {knn_matrix.shape}")

‚úÖ Data Scaled. Matrix Shape: (20099, 5)


In [7]:
# 4. Train Model
# Ask for 11 neighbours per query: the 10 closest matches plus the queried
# row itself. Distances are Euclidean, computed by brute force.
model = NearestNeighbors(
    n_neighbors=11,
    algorithm='brute',
    metric='euclidean',
).fit(knn_matrix)

print("‚úÖ Statistical KNN Model Trained!")

‚úÖ Statistical KNN Model Trained!


In [8]:
# 5. Define Inference Function
def recommend_by_stats(anime_title):
    """Recommend shows whose numeric statistics are closest to a given anime.

    The first row of ``df_global`` whose ``title_english`` contains
    ``anime_title`` (case-insensitive, literal substring) is used as the
    query. Its row in ``knn_matrix`` is passed to ``model.kneighbors`` and the
    returned neighbours, minus the query row itself, are reported.

    Args:
        anime_title: Full or partial English title to search for. It is
            matched as plain text, so characters such as ``(``, ``[`` or
            ``+`` are safe.

    Returns:
        A ``pandas.DataFrame`` with columns ``Title``, ``Score``, ``Pop``,
        ``Year`` and ``Dist`` (distance rounded to 4 decimals), ordered as
        returned by the model; or an error message string when no title
        matches.
    """
    # regex=False: treat the query as literal text. With the default regex
    # matching, titles containing '(', '[', '+', '?' ... either raise a
    # regex error or silently match the wrong rows.
    mask = df_global['title_english'].str.contains(
        anime_title, case=False, na=False, regex=False
    )

    if not mask.any():
        return f"‚ùå Anime '{anime_title}' not found."

    # Use the *position* of the first match, not its index label: knn_matrix
    # is a plain array addressed by row number, and neighbours are looked up
    # with .iloc below, so everything stays positional even if df_global does
    # not carry a default 0..n-1 index.
    query_pos = int(mask.to_numpy().argmax())
    found_title = df_global['title_english'].iloc[query_pos]

    print(f"üìä Finding shows statistically similar to: '{found_title}'")

    # Get neighbors (one query row, so flatten the 2-D results)
    distances, indices = model.kneighbors([knn_matrix[query_pos]])
    neighbor_positions = indices.flatten()
    neighbor_distances = distances.flatten()

    # Drop the query anime by identity rather than by rank: when several rows
    # share an identical feature vector, the query is not guaranteed to be the
    # first neighbour returned. The result is capped at one fewer than the
    # number of neighbours returned, matching the "k matches plus the
    # original" convention.
    wanted = len(neighbor_positions) - 1
    candidates = [
        (int(pos), dist)
        for pos, dist in zip(neighbor_positions, neighbor_distances)
        if int(pos) != query_pos
    ][:wanted]

    # Format results
    results = []
    for neighbor_pos, dist in candidates:
        row = df_global.iloc[neighbor_pos]
        results.append({
            'Title': row['title_english'],
            'Score': row['averageScore'],
            'Pop': row['popularity'],
            'Year': row['seasonYear'],
            'Dist': round(dist, 4)
        })

    return pd.DataFrame(results)

In [15]:
# 6. Test
# Look up a show by its (partial) English title and display the nearest
# neighbours returned for it.
recommend_by_stats(anime_title="The Melancholy of Haruhi Suzumiya")

üìä Finding shows statistically similar to: 'The Melancholy of Haruhi Suzumiya'


Unnamed: 0,Title,Score,Pop,Year,Dist
0,Baccano!,81.0,175602,2007.0,0.5345
1,When They Cry,75.0,162006,2006.0,0.5388
2,Spice and Wolf,80.0,165819,2008.0,0.5615
3,Darker than Black,77.0,159892,2007.0,0.5695
4,Kokoro Connect,75.0,173674,2012.0,0.5915
5,FLCL,78.0,186175,2000.0,0.6407
6,Nisemonogatari,79.0,172764,2012.0,0.6662
7,Welcome to the N-H-K,82.0,174631,2006.0,0.6767
8,Daily Lives of High School Boys,80.0,176515,2012.0,0.698
9,A Certain Magical Index,70.0,173236,2008.0,0.7075
