In [27]:
# Use this command in a Jupyter Notebook cell to install required libraries
!pip install pandas scikit-learn numpy







In [28]:
!conda install -c conda-forge scikit-surprise --yes



Retrieving notices: ...working... done
Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [29]:
import pandas as pd

# Load the dataset
kdramas_df = pd.read_csv('kdrama.csv')

# Print the column names
print(kdramas_df.columns)


Index(['Name', 'Aired Date', 'Year of release', 'Original Network', 'Aired On',
       'Number of Episodes', 'Duration', 'Content Rating', 'Rating',
       'Synopsis', 'Genre', 'Tags', 'Director', 'Screenwriter', 'Cast',
       'Production companies', 'Rank'],
      dtype='object')


In [30]:
import pandas as pd

# Load the dataset
kdramas_df = pd.read_csv('kdrama.csv')

# Fill missing Synopses with empty strings
kdramas_df['Synopsis'] = kdramas_df['Synopsis'].fillna('')


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the Synopses to a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(kdramas_df['Synopsis'])


In [40]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a reverse map of indices and K-Drama names
indices = pd.Series(kdramas_df.index, index=kdramas_df['Name']).drop_duplicates()

def recommend_kdramas(name, cosine_sim=cosine_sim):
    # Get the index of the K-Drama that matches the name
    if name not in indices:
        return f"No recommendations found for {name}. This K-Drama might not be in the dataset."
    idx = indices[name]

    # Get the pairwise similarity scores of all K-Dramas with that K-Drama
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the K-Dramas based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar K-Dramas
    sim_scores = sim_scores[1:11]

    # Get the K-Drama indices
    kdrama_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar K-Dramas
    return kdramas_df['Name'].iloc[kdrama_indices]

# Example usage - Replace 'Your K-Drama Name Here' with a name from your dataset
print(recommend_kdramas('Reply 1988'))


64                 Dr. Romantic
128                       Blind
228                 Light on Me
134    The World of the Married
95                      Save Me
101                Nobody Knows
22                       Healer
52                    Defendant
195                 Oh My Ghost
161        My Unfamiliar Family
Name: Name, dtype: object


In [42]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the dataset
kdramas_df = pd.read_csv('kdrama.csv')

# Create a combined string of director, screenwriter, and cast
kdramas_df['staff'] = kdramas_df['Director'] + ' ' + kdramas_df['Screenwriter'] + ' ' + kdramas_df['Cast']

# Creating the 'combined_features' for content-based filtering
kdramas_df['combined_features'] = kdramas_df['Synopsis'] + ' ' + kdramas_df['staff'] + ' ' + kdramas_df['Genre'] + ' ' + kdramas_df['Tags']

# Initialize TF-IDF Vectorizer and compute the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(kdramas_df['combined_features'])

# Compute the cosine similarity matrix based on the 'combined_features'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse mapping of indices and K-Drama names
indices = pd.Series(kdramas_df.index, index=kdramas_df['Name']).drop_duplicates()

# Recommendation function
def recommend_kdramas(name, user_content_rating=None, year=None, top_n=10):
    # Get the index of the K-Drama that matches the name
    if name not in indices:
        return f"No recommendations found for {name}. This K-Drama might not be in the dataset."
    
    idx = indices[name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Get top_n recommendations
    
    # Get the K-Drama indices
    kdrama_indices = [i[0] for i in sim_scores]
    
    # Start with the top similar K-Dramas
    recommended_kdramas = kdramas_df.loc[kdrama_indices].copy()
    
    # Filter based on 'Content Rating' if specified
    if user_content_rating:
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Content Rating'] <= user_content_rating]
    
    # Filter based on 'Year of release' if specified
    if year:
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Year of release'] == year]
    
    # Return top N recommendations sorted by 'Rating' and 'Rank'
    return recommended_kdramas.sort_values(by=['Rating', 'Rank'], ascending=[False, True])['Name'].head(top_n)

# Example usage:
# Replace 'Your K-Drama Name Here' with an actual name from your dataset.
# 'user_content_rating' could be an age or maturity level (e.g., '18+', '15+', etc.)
# 'year' could be the release year of interest for the user.
# 'top_n' is the number of recommendations to return.

print(recommend_kdramas('Reply 1988', user_content_rating='15+', year=2015, top_n=5))



ValueError: np.nan is an invalid document, expected byte or unicode string.

In [47]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the dataset
kdramas_df = pd.read_csv('kdrama.csv')

# Ensure all relevant columns are string type and fill NaN values
columns_to_clean = ['Synopsis', 'Director', 'Screenwriter', 'Cast', 'Genre', 'Tags']
for column in columns_to_clean:
    kdramas_df[column] = kdramas_df[column].fillna('').astype(str)

# Create a combined string of director, screenwriter, and cast
kdramas_df['staff'] = kdramas_df['Director'] + ' ' + kdramas_df['Screenwriter'] + ' ' + kdramas_df['Cast']

# Creating the 'combined_features' for content-based filtering
kdramas_df['combined_features'] = kdramas_df['Synopsis'] + ' ' + kdramas_df['staff'] + ' ' + kdramas_df['Genre'] + ' ' + kdramas_df['Tags']

# Initialize TF-IDF Vectorizer and compute the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(kdramas_df['combined_features'])

# Compute the cosine similarity matrix based on the 'combined_features'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse mapping of indices and K-Drama names
indices = pd.Series(kdramas_df.index, index=kdramas_df['Name']).drop_duplicates()

# Interactive recommendation function
def recommend_kdramas_interactive():
    name = input("Name of Drama: ").strip()
    user_content_rating = input("Content Rating (e.g., '15+', '18+ Restricted'): ").strip() or None
    year = input("Year of Release (leave blank if not specific): ").strip()
    year = int(year) if year.isdigit() else None
    top_n = 10
    
    if name not in indices:
        return f"No recommendations found for {name}. This K-Drama might not be in the dataset."
    
    idx = indices[name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Get top_n recommendations
    
    kdrama_indices = [i[0] for i in sim_scores]
    
    recommended_kdramas = kdramas_df.loc[kdrama_indices].copy()
    
    if user_content_rating:
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Content Rating'].str.contains(user_content_rating)]
    
    if year:
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Year of release'] == year]
    
    return recommended_kdramas.sort_values(by=['Rating', 'Rank'], ascending=[False, True])['Name'].head(top_n)

# Now you can run the interactive function directly
recommend_kdramas_interactive()


Name of Drama:  Reply 1988
Content Rating (e.g., '15+', '18+ Restricted'):  
Year of Release (leave blank if not specific):  


2             Hospital Playlist
43              Dear My Friends
85                   Once Again
103             Beautiful World
110                  Reply 1997
114            Fight For My Way
134    The World of the Married
161        My Unfamiliar Family
175          Seasons of Blossom
228                 Light on Me
Name: Name, dtype: object

In [59]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the dataset
kdramas_df = pd.read_csv('kdrama.csv')

# Ensure all relevant columns are string type and fill NaN values
columns_to_clean = ['Synopsis', 'Director', 'Screenwriter', 'Cast', 'Genre', 'Tags']
for column in columns_to_clean:
    kdramas_df[column] = kdramas_df[column].fillna('').astype(str)

# Create a combined string of director, screenwriter, and cast
kdramas_df['staff'] = kdramas_df['Director'] + ' ' + kdramas_df['Screenwriter'] + ' ' + kdramas_df['Cast']

# Creating the 'combined_features' for content-based filtering
kdramas_df['combined_features'] = kdramas_df['Synopsis'] + ' ' + kdramas_df['staff'] + ' ' + kdramas_df['Genre'] + ' ' + kdramas_df['Tags']

# Initialize TF-IDF Vectorizer and compute the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(kdramas_df['combined_features'])

# Compute the cosine similarity matrix based on the 'combined_features'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse mapping of indices and K-Drama names
indices = pd.Series(kdramas_df.index, index=kdramas_df['Name']).drop_duplicates()

# Interactive recommendation function with additional filtering options
def recommend_kdramas_interactive():
    name = input("Name of Drama: ").strip()
    use_personalized = input("Would you like to filter by specific attributes (yes/no)? ").strip().lower()
    
    personalized_filters = {}
    if use_personalized == 'yes':
        personalized_filters['content_rating'] = input("Content Rating (e.g., '15+', '18+ Restricted', leave blank if not specific): ").strip() or None
        personalized_filters['year'] = input("Year of Release (leave blank if not specific): ").strip()
        personalized_filters['year'] = int(personalized_filters['year']) if personalized_filters['year'].isdigit() else None
        personalized_filters['director'] = input("Director's Name (leave blank if not specific): ").strip() or None
        personalized_filters['screenwriter'] = input("Screenwriter's Name (leave blank if not specific): ").strip() or None
        personalized_filters['cast'] = input("Cast Member's Name (leave blank if not specific): ").strip() or None
        personalized_filters['rating'] = input("Minimum Rating (1-10, leave blank if not specific): ").strip()
        personalized_filters['rating'] = float(personalized_filters['rating']) if personalized_filters['rating'].replace('.','',1).isdigit() else None
        personalized_filters['rank'] = input("Maximum Rank (leave blank if not specific): ").strip()
        personalized_filters['rank'] = int(personalized_filters['rank']) if personalized_filters['rank'].isdigit() else None
        
    top_n = 10
    
    if name not in indices:
        return f"No recommendations found for {name}. This K-Drama might not be in the dataset."
    
    idx = indices[name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Get top_n recommendations
    
    kdrama_indices = [i[0] for i in sim_scores]
    recommended_kdramas = kdramas_df.loc[kdrama_indices].copy()
    
    # Apply personalized filters
    if personalized_filters.get('content_rating'):
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Content Rating'].str.contains(personalized_filters['content_rating'], case=False, na=False)]
    if personalized_filters.get('year'):
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Year of release'] == personalized_filters['year']]
    if personalized_filters.get('director'):
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Director'].str.contains(personalized_filters['director'], case=False, na=False)]
    if personalized_filters.get('screenwriter'):
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Screenwriter'].str.contains(personalized_filters['screenwriter'], case=False, na=False)]
    if personalized_filters.get('cast'):
                # Filter by cast member's name
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Cast'].str.contains(personalized_filters['cast'], case=False, na=False)]
    if personalized_filters.get('rating'):
        # Filter by minimum rating
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Rating'] >= personalized_filters['rating']]
    if personalized_filters.get('rank'):
        # Filter by maximum rank
        recommended_kdramas = recommended_kdramas.sort_values('Rank', ascending=True)
        recommended_kdramas = recommended_kdramas[recommended_kdramas['Rank'] <= personalized_filters['rank']]
    
    # Ensure the dataframe is not empty after applying filters
    if recommended_kdramas.empty:
        return "No recommendations found based on the filters provided."
    
    # Return top N recommendations sorted by 'Rating' and 'Rank', if available
    recommended_kdramas = recommended_kdramas.sort_values(by=['Rating', 'Rank'], ascending=[False, True])
    return recommended_kdramas['Name'].head(top_n)

# Run the interactive recommendation function
print(recommend_kdramas_interactive())



Name of Drama:  Reply 1988
Would you like to filter by specific attributes (yes/no)?  no


2             Hospital Playlist
43              Dear My Friends
85                   Once Again
103             Beautiful World
110                  Reply 1997
114            Fight For My Way
134    The World of the Married
161        My Unfamiliar Family
175          Seasons of Blossom
228                 Light on Me
Name: Name, dtype: object


In [60]:
pip install Flask


Note: you may need to restart the kernel to use updated packages.


In [61]:
from flask import Flask, request, render_template
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

app = Flask(__name__)

# Load your dataset here
kdramas_df = pd.read_csv('kdrama.csv')
# Assume all other preprocessing steps have been done here...

@app.route('/', methods=['GET', 'POST'])
def home():
    if request.method == 'POST':
        # Process the form data and return recommendations
        drama_name = request.form.get('drama_name')
        # Insert the logic of your recommendation function here
        # Use the drama_name to get recommendations
        recommendations = recommend_kdramas_interactive(drama_name)  # This needs to be adapted for web usage
        return render_template('results.html', recommendations=recommendations)
    return render_template('index.html')

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
