<a href="https://colab.research.google.com/github/Kugelblitz-26/data_science_prj/blob/main/NovelRecommender_con.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [47]:

# Load data from a Parquet file into a DataFrame
df = pd.read_parquet('train-wn.parquet')
df.language.head(99)

0      Chinese
1     Japanese
2      Chinese
3      Chinese
4      Chinese
        ...   
94     Chinese
95     Chinese
96     Chinese
97      Korean
98    Japanese
Name: language, Length: 99, dtype: object

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11770 entries, 0 to 11769
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   novel_id                    11770 non-null  int64 
 1   url                         11770 non-null  object
 2   title                       11770 non-null  object
 3   associated_names            11770 non-null  object
 4   img_url                     11770 non-null  object
 5   showtype                    11770 non-null  object
 6   genres                      11770 non-null  object
 7   tags                        11770 non-null  object
 8   description                 11770 non-null  object
 9   related_series              11770 non-null  object
 10  recommendations             11770 non-null  object
 11  recommendation_lists        11770 non-null  object
 12  rating                      11770 non-null  object
 13  language                    11723 non-null  ob

In [49]:
df.columns.values

array(['novel_id', 'url', 'title', 'associated_names', 'img_url',
       'showtype', 'genres', 'tags', 'description', 'related_series',
       'recommendations', 'recommendation_lists', 'rating', 'language',
       'authors', 'artists', 'year', 'status_coo', 'licensed',
       'translated', 'publishers', 'en_pubs', 'release_frequency',
       'weekly_rank', 'monthly_rank', 'all_time_rank',
       'monthly_rank_reading_list', 'all_time_rank_reading_list',
       'total_reading_list_rank', 'chapters'], dtype=object)

In [50]:
columns_to_remove = ['url', 'associated_names', 'img_url', 'artists', 'licensed', 'translated', 'publishers', 'en_pubs', 'release_frequency', 'chapters']

df = df.drop(columns=columns_to_remove)


In [51]:
df.head()

Unnamed: 0,novel_id,title,showtype,genres,tags,description,related_series,recommendations,recommendation_lists,rating,language,authors,year,status_coo,weekly_rank,monthly_rank,all_time_rank,monthly_rank_reading_list,all_time_rank_reading_list,total_reading_list_rank
0,64494,"After the Breakup, I Went Viral in the Enterta...",Web Novel (CN),"[Drama, Fantasy, Romance, Supernatural]","[Amnesia, Beautiful Female Lead, Clever Protag...","Luo Ning was a cannon fodder, a female support...","{'related_series': [], 'total': 0}","{'recomendations': [], 'total': 0}","{'list': [{'title': 'Cross' Library (2)', 'url...","(3.3 / 5.0, 4 votes)",Chinese,"[Lan Bai Ge Zi, 蓝白格子]",2020,"Complete (722 chapters; 506 main, rest side st...",#683,#513,#11853,#2628,#12694,225
1,43962,Mahou Sekai no Monogatari,Web Novel (JP),"[Fantasy, Romance, Shoujo]","[Adventurers, Elemental Magic, Familiars, Fema...",A world where magic exists.\nThis is the story...,{'related_series': [{'title': 'I Want to Be a ...,"{'recomendations': [], 'total': 0}","{'list': [], 'total': 0}","(4.0 / 5.0, 4 votes)",Japanese,"[Mako, まこ]",2020,35 Chapters (Ongoing),#2000,#2000,#15002,#13957,#13903,151
2,25506,I am a Good Man,Web Novel (CN),"[Action, Adventure, Fantasy, Historical, Roman...","[Beautiful Female Lead, Businessmen, Childcare...","To sum up in a sentence, transmigrating to dif...","{'related_series': [], 'total': 0}",{'recomendations': [{'recommended_user_count':...,{'list': [{'title': 'Male mc QT/BG- Grant my w...,"(4.0 / 5.0, 40 votes)",Chinese,[甘米儿],2018,603 Chapters (Completed),#2000,#8032,#4970,#9052,#6191,1096
3,50373,Scum Girl Practice Manual,Web Novel (CN),"[Romance, School Life, Shoujo]","[Female Protagonist, Modern Day, Short Story, ...","My goal is, to be a scum woman.\n","{'related_series': [], 'total': 0}",{'recomendations': [{'recommended_user_count':...,"{'list': [{'title': 'Beautiful reads [BG]', 'u...","(2.8 / 5.0, 5 votes)",Chinese,[左乐],2019,8 Chapters (Completed),#2000,#2000,#14045,#13211,#14699,104
4,48291,Unable to Divorce After Signing the Marriage C...,Web Novel (CN),"[Psychological, Romance, Shounen Ai, Slice of ...","[Arranged Marriage, Calm Protagonist, Cold Pro...",Lin You He excelled in everything – his looks...,"{'related_series': [], 'total': 0}",{'recomendations': [{'recommended_user_count':...,{'list': [{'title': 'Really whipped older love...,"(4.0 / 5.0, 64 votes)",Chinese,"[Bai Hu Qian Deng, 百户千灯]",2019,99 Main + 12 Extras\n111 Chapters (Completed),#1615,#1755,#3528,#2126,#3208,2467


In [52]:
# Fill missing language values with "English"
df['language'] = df['language'].fillna('English')
df.isnull().sum()

novel_id                      0
title                         0
showtype                      0
genres                        0
tags                          0
description                   0
related_series                0
recommendations               0
recommendation_lists          0
rating                        0
language                      0
authors                       0
year                          0
status_coo                    0
weekly_rank                   0
monthly_rank                  0
all_time_rank                 0
monthly_rank_reading_list     0
all_time_rank_reading_list    0
total_reading_list_rank       0
dtype: int64

In [53]:

def preprocess_text(text):
    # Example text preprocessing steps:
    # 1. Convert to lowercase
    text = text.lower()
    # 2. Remove special characters
    text = ''.join(e for e in text if (e.isalnum() or e.isspace()))
    # 3. Tokenization (split into words)
    tokens = text.split()
    return ' '.join(tokens)

In [54]:
# Apply text preprocessing to the 'description' column
df['description'] = df['description'].apply(preprocess_text)

# Save the preprocessed DataFrame to a new CSV file
df.to_csv('preprocessed_novels.csv', index=False)





In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Step 1: Create a TF-IDF vectorizer for text data (descriptions, genres, tags)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Define the text columns to include
text_columns = ['title','description', 'genres','showtype', 'tags', 'related_series', 'language', 'authors']

# Ensure these columns are treated as strings (text)
df[text_columns] = df[text_columns].astype(str)

# Combine text columns into one
df['text_data'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Fit the TF-IDF vectorizer on the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_data'])


In [56]:
# Step 2: Compute the cosine similarity between items (content-based filtering)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [57]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ...

def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    # Perform fuzzy matching to find the closest matching title
    matched_title_tuple = process.extractOne(title, df['title'])

    if matched_title_tuple is not None and len(matched_title_tuple) == 2:
        # Extract the matched title and its similarity score
        match_title, score = matched_title_tuple

        if score < 90:
            return "No close match found."

        # Get the index of the closest matching novel
        idx = df[df['title'] == match_title].index[0]

        # Get the pairwise similarity scores of all novels with the input novel
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the novels based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar novels
        sim_scores = sim_scores[1:11]  # Exclude the first item (itself)

        # Get the novel indices
        novel_indices = [i[0] for i in sim_scores]

        # Return the top 10 most similar novels
        return df['title'].iloc[novel_indices]
    else:
        return "No close match found."


In [58]:
# Example: Get recommendations for a novel
recommended_novels = get_recommendations("I am the Monarch")
print(recommended_novels)

No close match found.


In [59]:








-predictions = []

for index, row in df.iterrows():
    novel_title = row['title']
    recommendations = get_recommendations(novel_title)
    predictions.append({
        'novel_title': novel_title,
        'recommendations': recommendations.tolist()
    })

# The 'predictions' list will contain recommendations for each novel in the testing dataset.
# Convert the 'predictions' list to a DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the recommendations to a CSV file
predictions_df.to_csv('novel_recommendations2.csv', index=False)






AttributeError: ignored

In [None]:
predictions = []

for index, row in df.iterrows():
    novel_title = row['title']
    recommendations = get_recommendations(novel_title)
    predictions.append({
        'novel_title': novel_title,
        'recommendations': recommendations.tolist()
    })

# The 'predictions' list will contain recommendations for each novel in the testing dataset.
# Convert the 'predictions' list to a DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the recommendations to a CSV file
predictions_df.to_csv('novel_recommendations.csv', index=False)






In [None]:
# Calculate the average RMSE
average_rmse = np.mean(predictions)
print("Average RMSE:", average_rmse)

In [None]:
!pip install fuzzywuzzy