<a href="https://colab.research.google.com/github/Kugelblitz-26/data/blob/main/NovelRecommender_hy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [9]:
!pip install fuzzywuzzy




In [10]:

# Load data from a Parquet file into a DataFrame
df = pd.read_parquet('train-wn.parquet')
test_df = pd.read_parquet("test-wn.parquet")
df.associated_names.head(99)

0                                         [分手后我在娱乐圈爆红了]
1     [Tales from a Magical World, The Story of the ...
2                                               [我是好男人]
3                       [Zhā nǚ xiūliàn shǒucè, 渣女修炼手册]
4     [After I Agreed to Marry, I Can’t Divorce Anym...
                            ...                        
94                                             [重生成导盲犬]
95                                            [就想和你在一起]
96    [MDSB, Đầu quả tim nhi thượng bệnh mỹ nhân, 心尖...
97                                     [악역의 덫에 발목이 걸리면]
98             [Mokuteki wa ikinobiru koto, 目的は生き延びること]
Name: associated_names, Length: 99, dtype: object

In [11]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11770 entries, 0 to 11769
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   novel_id                    11770 non-null  int64 
 1   url                         11770 non-null  object
 2   title                       11770 non-null  object
 3   associated_names            11770 non-null  object
 4   img_url                     11770 non-null  object
 5   showtype                    11770 non-null  object
 6   genres                      11770 non-null  object
 7   tags                        11770 non-null  object
 8   description                 11770 non-null  object
 9   related_series              11770 non-null  object
 10  recommendations             11770 non-null  object
 11  recommendation_lists        11770 non-null  object
 12  rating                      11770 non-null  object
 13  language                    11723 non-null  ob

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11770 entries, 0 to 11769
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   novel_id                    11770 non-null  int64 
 1   url                         11770 non-null  object
 2   title                       11770 non-null  object
 3   associated_names            11770 non-null  object
 4   img_url                     11770 non-null  object
 5   showtype                    11770 non-null  object
 6   genres                      11770 non-null  object
 7   tags                        11770 non-null  object
 8   description                 11770 non-null  object
 9   related_series              11770 non-null  object
 10  recommendations             11770 non-null  object
 11  recommendation_lists        11770 non-null  object
 12  rating                      11770 non-null  object
 13  language                    11723 non-null  ob

In [13]:
df.isnull().sum()

novel_id                       0
url                            0
title                          0
associated_names               0
img_url                        0
showtype                       0
genres                         0
tags                           0
description                    0
related_series                 0
recommendations                0
recommendation_lists           0
rating                         0
language                      47
authors                        0
artists                        0
year                           0
status_coo                     0
licensed                       0
translated                     0
publishers                     0
en_pubs                        0
release_frequency              0
weekly_rank                    0
monthly_rank                   0
all_time_rank                  0
monthly_rank_reading_list      0
all_time_rank_reading_list     0
total_reading_list_rank        0
chapters                       0
dtype: int

In [14]:
columns_to_remove = ['url', 'associated_names', 'img_url', 'artists', 'licensed', 'translated', 'publishers', 'en_pubs', 'release_frequency', 'chapters']

df = df.drop(columns=columns_to_remove)


In [15]:
df.head()

Unnamed: 0,novel_id,title,showtype,genres,tags,description,related_series,recommendations,recommendation_lists,rating,language,authors,year,status_coo,weekly_rank,monthly_rank,all_time_rank,monthly_rank_reading_list,all_time_rank_reading_list,total_reading_list_rank
0,64494,"After the Breakup, I Went Viral in the Enterta...",Web Novel (CN),"[Drama, Fantasy, Romance, Supernatural]","[Amnesia, Beautiful Female Lead, Clever Protag...","Luo Ning was a cannon fodder, a female support...","{'related_series': [], 'total': 0}","{'recomendations': [], 'total': 0}","{'list': [{'title': 'Cross' Library (2)', 'url...","(3.3 / 5.0, 4 votes)",Chinese,"[Lan Bai Ge Zi, 蓝白格子]",2020,"Complete (722 chapters; 506 main, rest side st...",#683,#513,#11853,#2628,#12694,225
1,43962,Mahou Sekai no Monogatari,Web Novel (JP),"[Fantasy, Romance, Shoujo]","[Adventurers, Elemental Magic, Familiars, Fema...",A world where magic exists.\nThis is the story...,{'related_series': [{'title': 'I Want to Be a ...,"{'recomendations': [], 'total': 0}","{'list': [], 'total': 0}","(4.0 / 5.0, 4 votes)",Japanese,"[Mako, まこ]",2020,35 Chapters (Ongoing),#2000,#2000,#15002,#13957,#13903,151
2,25506,I am a Good Man,Web Novel (CN),"[Action, Adventure, Fantasy, Historical, Roman...","[Beautiful Female Lead, Businessmen, Childcare...","To sum up in a sentence, transmigrating to dif...","{'related_series': [], 'total': 0}",{'recomendations': [{'recommended_user_count':...,{'list': [{'title': 'Male mc QT/BG- Grant my w...,"(4.0 / 5.0, 40 votes)",Chinese,[甘米儿],2018,603 Chapters (Completed),#2000,#8032,#4970,#9052,#6191,1096
3,50373,Scum Girl Practice Manual,Web Novel (CN),"[Romance, School Life, Shoujo]","[Female Protagonist, Modern Day, Short Story, ...","My goal is, to be a scum woman.\n","{'related_series': [], 'total': 0}",{'recomendations': [{'recommended_user_count':...,"{'list': [{'title': 'Beautiful reads [BG]', 'u...","(2.8 / 5.0, 5 votes)",Chinese,[左乐],2019,8 Chapters (Completed),#2000,#2000,#14045,#13211,#14699,104
4,48291,Unable to Divorce After Signing the Marriage C...,Web Novel (CN),"[Psychological, Romance, Shounen Ai, Slice of ...","[Arranged Marriage, Calm Protagonist, Cold Pro...",Lin You He excelled in everything – his looks...,"{'related_series': [], 'total': 0}",{'recomendations': [{'recommended_user_count':...,{'list': [{'title': 'Really whipped older love...,"(4.0 / 5.0, 64 votes)",Chinese,"[Bai Hu Qian Deng, 百户千灯]",2019,99 Main + 12 Extras\n111 Chapters (Completed),#1615,#1755,#3528,#2126,#3208,2467


In [16]:
# Fill missing language values with "English"
df['language'] = df['language'].fillna('English')
df.isnull().sum()
test_df['language'] = df['language'].fillna('English')

In [17]:
df.isnull().sum()

novel_id                      0
title                         0
showtype                      0
genres                        0
tags                          0
description                   0
related_series                0
recommendations               0
recommendation_lists          0
rating                        0
language                      0
authors                       0
year                          0
status_coo                    0
weekly_rank                   0
monthly_rank                  0
all_time_rank                 0
monthly_rank_reading_list     0
all_time_rank_reading_list    0
total_reading_list_rank       0
dtype: int64

In [18]:

def preprocess_text(text):
    # Example text preprocessing steps:
    # 1. Convert to lowercase
    text = text.lower()
    # 2. Remove special characters
    text = ''.join(e for e in text if (e.isalnum() or e.isspace()))
    # 3. Tokenization (split into words)
    tokens = text.split()
    return ' '.join(tokens)

In [19]:
# Apply text preprocessing to the 'description' column
df['description'] = df['description'].apply(preprocess_text)
test_df['description'] = test_df['description'].apply(preprocess_text)

# Save the preprocessed DataFrame to a new CSV file
df.to_csv('preprocessed_novels.csv', index=False)
test_df.to_csv('test_preprocessed_novels.csv', index=False)
print(df.head())





   novel_id                                              title  \
0     64494  After the Breakup, I Went Viral in the Enterta...   
1     43962                          Mahou Sekai no Monogatari   
2     25506                                    I am a Good Man   
3     50373                          Scum Girl Practice Manual   
4     48291  Unable to Divorce After Signing the Marriage C...   

         showtype                                             genres  \
0  Web Novel (CN)            [Drama, Fantasy, Romance, Supernatural]   
1  Web Novel (JP)                         [Fantasy, Romance, Shoujo]   
2  Web Novel (CN)  [Action, Adventure, Fantasy, Historical, Roman...   
3  Web Novel (CN)                     [Romance, School Life, Shoujo]   
4  Web Novel (CN)  [Psychological, Romance, Shounen Ai, Slice of ...   

                                                tags  \
0  [Amnesia, Beautiful Female Lead, Clever Protag...   
1  [Adventurers, Elemental Magic, Familiars, Fema...   
2 

In [20]:
'''from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Step 1: Create a TF-IDF vectorizer for text data (descriptions, genres, tags)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Define the text columns to include
text_columns = ['showtype', 'tags', 'related_series', 'recommendations', 'recommendation_lists', 'language', 'authors', 'year', 'status_coo', 'weekly_rank', 'monthly_rank', 'all_time_rank', 'monthly_rank_reading_list', 'all_time_rank_reading_list', 'total_reading_list_rank']

# Ensure these columns are treated as strings (text)
df[text_columns] = df[text_columns].astype(str)

# Combine text columns into one
df['text_data'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Fit the TF-IDF vectorizer on the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_data'])'''


"from sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.pairwise import linear_kernel\n\n# Step 1: Create a TF-IDF vectorizer for text data (descriptions, genres, tags)\ntfidf_vectorizer = TfidfVectorizer(stop_words='english')\n\n# Define the text columns to include\ntext_columns = ['showtype', 'tags', 'related_series', 'recommendations', 'recommendation_lists', 'language', 'authors', 'year', 'status_coo', 'weekly_rank', 'monthly_rank', 'all_time_rank', 'monthly_rank_reading_list', 'all_time_rank_reading_list', 'total_reading_list_rank']\n\n# Ensure these columns are treated as strings (text)\ndf[text_columns] = df[text_columns].astype(str)\n\n# Combine text columns into one\ndf['text_data'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)\n\n# Fit the TF-IDF vectorizer on the text data\ntfidf_matrix = tfidf_vectorizer.fit_transform(df['text_data'])"

In [21]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Step 1: Create a TF-IDF vectorizer for text data
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Define the text columns to include
text_columns = ['title', 'tags', 'related_series', 'genres', 'description', 'authors','showtype']


# Ensure these text columns are treated as strings (text)
df[text_columns] = df[text_columns].astype(str)

# Combine text columns into one
df['text_data'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Fit the vectorizer on the training dataset
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_data'])

In [22]:


# Step 2: Compute the cosine similarity between items (content-based filtering)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

collabrative

In [23]:
# Extract user ratings (assumed to be in the format "3.3 / 5.0, 4 votes")
# Convert the 'rating' column to strings
df['rating'] = df['rating'].astype(str)


df['rating'] = df['rating'].str.extract('(\d\.\d) / 5\.0')
df['rating'] = df['rating'].astype(float)

# Use user ratings as a measure of user-item interaction
# Replace missing ratings with 0 (no user interaction)
df['rating'].fillna(0, inplace=True)

# Normalize user ratings between 0 and 1
df['rating'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())

# Set 'novel_id' as the index
df.set_index('novel_id', inplace=True)

# Calculate novel similarities based on user ratings
collaborative_matrix = df[['rating']].fillna(0)  # Select only the 'rating' column
collaborative_similarity = linear_kernel(collaborative_matrix, collaborative_matrix)


In [24]:
'''from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ...

def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    # Perform fuzzy matching to find the closest matching title
    matched_title_tuple = process.extractOne(title, df['title'])

    # Extract the matched title and its similarity score
    match_title, score = matched_title_tuple

    if score < 90:
        return "No close match found."

    # Get the index of the closest matching novel
    idx = df[df['title'] == match_title].index[0]

    # Get the pairwise similarity scores of all novels with the input novel
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the novels based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar novels
    sim_scores = sim_scores[1:11]  # Exclude the first item (itself)

    # Get the novel indices
    novel_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar novels
    return df['title'].iloc[novel_indices]

'''


'from fuzzywuzzy import fuzz\nfrom fuzzywuzzy import process\n\n# ...\n\ndef get_recommendations(title, cosine_sim=cosine_sim, df=df):\n    # Perform fuzzy matching to find the closest matching title\n    matched_title_tuple = process.extractOne(title, df[\'title\'])\n    \n    # Extract the matched title and its similarity score\n    match_title, score = matched_title_tuple\n\n    if score < 90:\n        return "No close match found."\n\n    # Get the index of the closest matching novel\n    idx = df[df[\'title\'] == match_title].index[0]\n\n    # Get the pairwise similarity scores of all novels with the input novel\n    sim_scores = list(enumerate(cosine_sim[idx]))\n\n    # Sort the novels based on the similarity scores\n    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n\n    # Get the scores of the 10 most similar novels\n    sim_scores = sim_scores[1:11]  # Exclude the first item (itself)\n\n    # Get the novel indices\n    novel_indices = [i[0] for i in sim_sc

In [25]:
from fuzzywuzzy import process

def get_hybrid_recommendations(title, cosine_sim, collaborative_similarity, df):
    # Perform fuzzy matching to find the closest matching title
    matched_title_tuple = process.extractOne(title, df['title'])

    # Extract the matched title
    matched_title = matched_title_tuple[0]

    if matched_title_tuple[1] < 90:
        return "No close match found."

    # Check if the matched title exists in the dataset
    if matched_title in df['title'].values:
        # Get the index of the closest matching novel
        idx = df[df['title'] == matched_title].index[0]

        # Content-Based Filtering
        content_based_scores = cosine_sim[idx]

        # Collaborative Filtering
        collaborative_scores = collaborative_similarity[idx]

        # Combine content-based and collaborative filtering scores
        hybrid_scores = (0.9 * content_based_scores + 0.1 * collaborative_scores)

        # Sort the novels based on hybrid scores
        novel_indices = hybrid_scores.argsort()[::-1]

        # Return the top 10 most similar novels
        return df['title'].iloc[novel_indices][:10]
    else:
        return "Novel not found in the dataset."





In [30]:
# Example: Get hybrid recommendations for a novel
hybrid_recommendations = get_hybrid_recommendations("Reverend Insanity", cosine_sim, collaborative_similarity, df)
print(hybrid_recommendations)


novel_id
64838    Transmigrated as a Cannon Fodder Omega, Their ...
58371                                 Hollywood Production
48033     The Absolute Favorite in the Apocalyptic Rebirth
62260                                             Fu Tu Ta
46172                               The Submissive Emperor
42055                First Love Choose Me, I’m Super Sweet
46384    The Demon Lord Is Fleeing For His Life Again T...
3319                                Please Be More Serious
24278                                            Card Room
2301                                        Fated Marriage
Name: title, dtype: object


**End**

In [27]:
'''

# Example: Get recommendations for a novel
recommended_novels = get_recommendations(" I am a Good Man ")
print(recommended_novels)
'''

'\n\n# Example: Get recommendations for a novel\nrecommended_novels = get_recommendations(" I am a Good Man ")\nprint(recommended_novels)\n'

In [29]:
'''predictions = []

for index, row in df.iterrows():
    novel_title = row['title']
    recommendations = get_hybrid_recommendations(novel_title, cosine_sim, collaborative_similarity, df)
    predictions.append({
        'novel_title': novel_title,
        'recommendations': recommendations.tolist()
    })

# The 'predictions' list will contain recommendations for each novel in the testing dataset.
# Convert the 'predictions' list to a DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the recommendations to a CSV file
predictions_df.to_csv('novel_hyrecommendations.csv', index=False)'''

"predictions = []\n\nfor index, row in df.iterrows():\n    novel_title = row['title']\n    recommendations = get_hybrid_recommendations(novel_title, cosine_sim, collaborative_similarity, df)\n    predictions.append({\n        'novel_title': novel_title,\n        'recommendations': recommendations.tolist()\n    })\n\n# The 'predictions' list will contain recommendations for each novel in the testing dataset.\n# Convert the 'predictions' list to a DataFrame\npredictions_df = pd.DataFrame(predictions)\n\n# Save the recommendations to a CSV file\npredictions_df.to_csv('novel_hyrecommendations.csv', index=False)"

In [None]:
from sklearn.metrics import mean_squared_error

predictions = []

for index, row in test_df.iterrows():
    title = row['title']

    if title in df['title'].values:
        recommendations = get_recommendations(title)
        actual_movies = row['actual_recommendations']  # Assuming you have actual recommendations in your test dataset
        mse = mean_squared_error(recommendations, actual_movies)
        predictions.append(mse)
    else:
        # Handle cases where the title is not found in the main dataset
        predictions.append(None)

# Calculate the mean squared error for the available predictions
mean_squared_error([mse for mse in predictions if mse is not None], [actual for actual in recommendation_lists if actual is not None])


In [None]:
# Calculate the average RMSE
average_rmse = np.mean(predictions)
print("Average RMSE:", average_rmse)