# Kdrama recommendation project

## 1. Install Required Libraries:

You'll need to install libraries such as BeautifulSoup, requests, pandas, scikit-learn, scikit-surprise, and Flask.

In [1]:
!pip install beautifulsoup4 requests pandas scikit-surprise Flask
!pip install textblob  # For sentiment analysis

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357287 sha256=051ad1cba298d15a234e080d69415f8a1ec5e8050153855a9729dd07277242d0
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


## 2.  Data Collection and Preprocessing

### 2.1 Web Scraping with BeautifulSoup

Here's the full code to scrape Kdrama information from MyDramaList, including extracting genres from individual drama pages:

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Base URL for MyDramaList
base_url = "https://mydramalist.com"

# URL for the Kdrama search results
search_url = "https://mydramalist.com/search?adv=titles&ty=68"

# Send a request to the search results page
response = requests.get(search_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find the drama entries
dramas = soup.find_all('div', class_='box')

# Extract data and store it in a list
drama_data = []

for drama in dramas:
    # Find the title and link to the individual drama page
    title_tag = drama.find('h6', class_='text-primary title')
    title = title_tag.find('a').text.strip() if title_tag else 'Unknown'

    # Get the link to the individual drama page
    drama_link = title_tag.find('a')['href'] if title_tag else None
    full_drama_url = base_url + drama_link if drama_link else None

    # Default genres to unknown, will be updated later
    genres = 'Unknown'

    if full_drama_url:
        # Visit the individual drama page to extract genres
        try:
            drama_response = requests.get(full_drama_url)
            drama_soup = BeautifulSoup(drama_response.content, 'html.parser')

            # Find the genres from the individual drama page
            genre_elements = drama_soup.select('li.show-genres a')
            genres = ', '.join([genre.text.strip() for genre in genre_elements])

            # Optional: Add a delay to avoid overloading the server
            time.sleep(1)  # Sleep for 1 second between requests
        except Exception as e:
            print(f"Error fetching details from {full_drama_url}: {e}")

    # Find the rating
    rating_tag = drama.find('span', class_='score')
    rating = float(rating_tag.text.strip()) if rating_tag else 0.0

    # Append the collected data
    drama_data.append({
        'title': title,
        'genres': genres,
        'rating': rating
    })

# Convert the data into a DataFrame
df = pd.DataFrame(drama_data)

# Display the DataFrame
print(df.head())

# Save the data to a CSV file for later use
df.to_csv('kdramas.csv', index=False)

# Download the CSV file in Colab
from google.colab import files
files.download('kdramas.csv')


                        title                           genres  rating
0        Twinkling Watermelon   Romance, Youth, Drama, Fantasy     9.2
1              Move to Heaven                      Life, Drama     9.1
2           Weak Hero Class 1             Action, Youth, Drama     9.1
3  Hospital Playlist Season 2    Romance, Life, Drama, Medical     9.1
4               Lovely Runner  Music, Comedy, Romance, Fantasy     9.1


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### 2.2  Data Cleaning and Feature Engineering

In [15]:
# Load the CSV file
df = pd.read_csv('kdramas.csv')

# Remove duplicates
df.drop_duplicates(subset='title', inplace=True)

# Handle missing values
df.fillna({'genres': 'Unknown', 'rating': 0}, inplace=True)

# Normalize text
df['genres'] = df['genres'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

# One-hot encode the genres
df = pd.concat([df, df['genres'].str.get_dummies(sep=',')], axis=1)

# Display the cleaned data
print(df.head())

# Simulate user data
user_ids = [f"user_{i}" for i in range(1, 101)]  # 100 dummy users

# Create a new DataFrame for user-item interactions
user_item_data = []

for index, row in df.iterrows():
    for user_id in user_ids:
        # Randomly assign a rating between 1 and 10
        user_rating = random.uniform(1, 10)
        user_item_data.append((user_id, row['title'], user_rating))

# Convert to DataFrame
user_item_df = pd.DataFrame(user_item_data, columns=['user_id', 'title', 'rating'])

# Display the user-item interactions
print(user_item_df.head())

# Use user-item_df for collaborative filtering


                        title                        genres  rating  \
0        Twinkling Watermelon   romance youth drama fantasy     9.2   
1              Move to Heaven                    life drama     9.1   
2           Weak Hero Class 1            action youth drama     9.1   
3  Hospital Playlist Season 2    romance life drama medical     9.1   
4               Lovely Runner  music comedy romance fantasy     9.1   

   action historical romance fantasy  action thriller mystery supernatural  \
0                                  0                                     0   
1                                  0                                     0   
2                                  0                                     0   
3                                  0                                     0   
4                                  0                                     0   

   action youth drama  comedy romance life youth  \
0                   0                          0   


## 3.  Build the Recommendation System

### 3.1 Collaborative Filtering with Surprise

Let's update the collaborative filtering code using the user_item_df DataFrame.

In [17]:
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

# Create a reader for the data
reader = Reader(rating_scale=(1, 10))

# Load data into Surprise
data = Dataset.load_from_df(user_item_df[['user_id', 'title', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Define the algorithm
algo = KNNBasic()

# Train the algorithm on the training set
algo.fit(trainset)

# Function to get top N recommendations
def get_top_n(predictions, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions for each user
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Make predictions on the test set
predictions = algo.test(testset)

# Calculate RMSE for the predictions
accuracy.rmse(predictions)

# Get recommendations for all users in the test set
top_n_recommendations = get_top_n(predictions, n=5)

# List users available in the predictions
available_users = list(top_n_recommendations.keys())
print(f"Available users for recommendations: {available_users[:10]}")  # Print first 10 users

# Choose a sample user from available users
sample_user = available_users[0]  # Use the first available user

print(f"\nTop 5 recommendations for {sample_user}:")
try:
    for item_id, rating in top_n_recommendations[sample_user]:
        print(f"{item_id} with estimated rating {rating:.2f}")
except KeyError:
    print(f"No recommendations available for user: {sample_user}")


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.6119
Available users for recommendations: ['user_20', 'user_53', 'user_43', 'user_86', 'user_52', 'user_64', 'user_68', 'user_78', 'user_6', 'user_72']

Top 5 recommendations for user_20:
Hidden Love with estimated rating 5.78
Unknown with estimated rating 5.77
My Mister with estimated rating 5.18


### 3.2 Improve the Collaborative Filtering Model

Experiment with Different Algorithms:

Surprise library options: Try using other algorithms such as KNNWithMeans, SVD, or SlopeOne to see if they perform better than KNNBasic.

Tune hyperparameters: Use GridSearchCV from the surprise library to find the optimal parameters for the selected algorithms.

In [18]:
from surprise import SVD, KNNWithMeans
from surprise.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.010],
    'reg_all': [0.02, 0.1]
}

# Perform grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE score
print(gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


2.698366148564403
{'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}


### 3.2  Content-Based Filtering with Cosine Similarity

In [20]:
# Check if the title exists in the DataFrame
print("Available titles in the DataFrame:")
print(df['title'].head(10))  # Print first 10 titles for quick inspection

# Check if 'Crash Landing on You' is present
if 'Crash Landing on You' in df['title'].values:
    print("'Crash Landing on You' is available in the DataFrame.")
else:
    print("'Crash Landing on You' is not available in the DataFrame.")


Available titles in the DataFrame:
0          Twinkling Watermelon
1                Move to Heaven
2             Weak Hero Class 1
3    Hospital Playlist Season 2
4                 Lovely Runner
5               Nirvana in Fire
6                Flower of Evil
7              Alchemy of Souls
8                        Moving
9             Hospital Playlist
Name: title, dtype: object
'Crash Landing on You' is not available in the DataFrame.


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorizer for genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genres'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get recommendations based on content
def get_content_recommendations(title, cosine_sim=cosine_sim):
    idx = df[df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Top 5 similar items
    drama_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[drama_indices]

# Example usage
print(get_content_recommendations('Lovely Runner'))


16    Joy of Life Season 2
10              Reply 1988
15             Hidden Love
0     Twinkling Watermelon
11             Joy of Life
Name: title, dtype: object


# 4. Develop a Hybrid Recommendation System

Combine collaborative and content-based filtering for a more robust recommendation engine:

Hybrid Approach:

*   Use collaborative filtering for personalization and content-based filtering for exploring similar dramas.
*   Aggregate recommendations from both methods, possibly weighting them based on their reliability.







In [29]:
def hybrid_recommendations(user_id, title, n=5):
    collab_recs = [iid for iid, _ in top_n_recommendations.get(user_id, [])]
    content_recs = list(get_content_recommendations(title))
    # Merge recommendations and prioritize unique ones
    hybrid_recs = list(dict.fromkeys(collab_recs + content_recs))
    return hybrid_recs[:n]

# Example usage for a specific user and title
user_id = available_users[0]
drama_title = 'Lovely Runner'
print(hybrid_recommendations(user_id, drama_title))


['Hidden Love', 'Unknown', 'My Mister', 'Joy of Life Season 2', 'Reply 1988']


#  5: Develop the User Interface