In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [2]:

# Load the novel dataset from a CSV file into a DataFrame
df = pd.read_csv('/kaggle/input/weblight-novel-dataset/wn.csv')
# Preview the first 99 values of the `language` column
df.language.head(99)

0      Chinese
1     Japanese
2      Chinese
3      Chinese
4      Chinese
        ...   
94     Chinese
95     Chinese
96     Chinese
97      Korean
98    Japanese
Name: language, Length: 99, dtype: object

In [3]:
# Print the frame summary: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11770 entries, 0 to 11769
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   novel_id                    11770 non-null  int64 
 1   url                         11770 non-null  object
 2   title                       11770 non-null  object
 3   associated_names            11770 non-null  object
 4   img_url                     11770 non-null  object
 5   showtype                    11700 non-null  object
 6   genres                      11770 non-null  object
 7   tags                        11770 non-null  object
 8   description                 11766 non-null  object
 9   related_series              11770 non-null  object
 10  recommendations             11770 non-null  object
 11  recommendation_lists        11770 non-null  object
 12  rating                      11770 non-null  object
 13  language                    11723 non-null  ob

In [4]:
# Show every column name as a NumPy object array
df.columns.values

array(['novel_id', 'url', 'title', 'associated_names', 'img_url',
       'showtype', 'genres', 'tags', 'description', 'related_series',
       'recommendations', 'recommendation_lists', 'rating', 'language',
       'authors', 'artists', 'year', 'status_coo', 'licensed',
       'translated', 'publishers', 'en_pubs', 'release_frequency',
       'weekly_rank', 'monthly_rank', 'all_time_rank',
       'monthly_rank_reading_list', 'all_time_rank_reading_list',
       'total_reading_list_rank', 'chapters'], dtype=object)

In [5]:
# Columns to discard from the frame.
columns_to_remove = [
    'url', 'associated_names', 'img_url', 'artists', 'licensed',
    'translated', 'publishers', 'en_pubs', 'release_frequency', 'chapters',
]

# errors='ignore' keeps this cell re-runnable: executing it a second time on the
# already-trimmed frame would otherwise raise KeyError for the missing labels.
df = df.drop(columns=columns_to_remove, errors='ignore')


In [6]:
# Display the first five rows of the frame
df.head()

Unnamed: 0,novel_id,title,showtype,genres,tags,description,related_series,recommendations,recommendation_lists,rating,language,authors,year,status_coo,weekly_rank,monthly_rank,all_time_rank,monthly_rank_reading_list,all_time_rank_reading_list,total_reading_list_rank
0,64494,"After the Breakup, I Went Viral in the Enterta...",Web Novel (CN),['Drama' 'Fantasy' 'Romance' 'Supernatural'],['Amnesia' 'Beautiful Female Lead' 'Clever Pro...,"Luo Ning was a cannon fodder, a female support...","{'related_series': array([], dtype=object), 't...","{'recomendations': array([], dtype=object), 't...","{'list': array([{'title': ""Cross' Library (2)""...","(3.3 / 5.0, 4 votes)",Chinese,['Lan Bai Ge Zi' '蓝白格子'],2020,"Complete (722 chapters; 506 main, rest side st...",#683,#513,#11853,#2628,#12694,225
1,43962,Mahou Sekai no Monogatari,Web Novel (JP),['Fantasy' 'Romance' 'Shoujo'],['Adventurers' 'Elemental Magic' 'Familiars' '...,A world where magic exists.\nThis is the story...,{'related_series': array([{'title': 'I Want to...,"{'recomendations': array([], dtype=object), 't...","{'list': array([], dtype=object), 'total': 0}","(4.0 / 5.0, 4 votes)",Japanese,['Mako' 'まこ'],2020,35 Chapters (Ongoing),#2000,#2000,#15002,#13957,#13903,151
2,25506,I am a Good Man,Web Novel (CN),['Action' 'Adventure' 'Fantasy' 'Historical' '...,['Beautiful Female Lead' 'Businessmen' 'Childc...,"To sum up in a sentence, transmigrating to dif...","{'related_series': array([], dtype=object), 't...",{'recomendations': array([{'recommended_user_c...,{'list': array([{'title': 'Male mc QT/BG- Gran...,"(4.0 / 5.0, 40 votes)",Chinese,['甘米儿'],2018,603 Chapters (Completed),#2000,#8032,#4970,#9052,#6191,1096
3,50373,Scum Girl Practice Manual,Web Novel (CN),['Romance' 'School Life' 'Shoujo'],['Female Protagonist' 'Modern Day' 'Short Stor...,"My goal is, to be a scum woman.\n","{'related_series': array([], dtype=object), 't...",{'recomendations': array([{'recommended_user_c...,{'list': array([{'title': 'Beautiful reads [BG...,"(2.8 / 5.0, 5 votes)",Chinese,['左乐'],2019,8 Chapters (Completed),#2000,#2000,#14045,#13211,#14699,104
4,48291,Unable to Divorce After Signing the Marriage C...,Web Novel (CN),['Psychological' 'Romance' 'Shounen Ai' 'Slice...,['Arranged Marriage' 'Calm Protagonist' 'Cold ...,Lin You He excelled in everything – his looks...,"{'related_series': array([], dtype=object), 't...",{'recomendations': array([{'recommended_user_c...,{'list': array([{'title': 'Really whipped olde...,"(4.0 / 5.0, 64 votes)",Chinese,['Bai Hu Qian Deng' '百户千灯'],2019,99 Main + 12 Extras\n111 Chapters (Completed),#1615,#1755,#3528,#2126,#3208,2467


In [7]:
# Rows without a recorded language are labelled "English";
# afterwards report how many values are still missing in each column.
df['language'] = df['language'].where(df['language'].notna(), 'English')
df.isna().sum()

novel_id                         0
title                            0
showtype                        70
genres                           0
tags                             0
description                      4
related_series                   0
recommendations                  0
recommendation_lists             0
rating                           0
language                         0
authors                          0
year                          1573
status_coo                     416
weekly_rank                      0
monthly_rank                     0
all_time_rank                    0
monthly_rank_reading_list        0
all_time_rank_reading_list       0
total_reading_list_rank          0
dtype: int64

In [8]:
def preprocess_text(text):
    """Lower-case a string and strip everything except letters, digits and whitespace.

    Values that are not ``str`` instances (for example ``None`` or a float NaN)
    are handed back untouched.
    """
    if not isinstance(text, str):
        return text

    lowered = text.lower()
    # Keep a character only when it is whitespace or alphanumeric.
    kept_chars = [ch for ch in lowered if ch.isspace() or ch.isalnum()]
    return ''.join(kept_chars)


In [9]:
# Clean every entry of the 'description' column with preprocess_text
df['description'] = df['description'].map(preprocess_text)

# Write the cleaned frame to a new CSV file, leaving the index out
df.to_csv('preprocessed_novels.csv', index=False)





In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Step 1: Create a TF-IDF vectorizer for the combined text features
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Text columns that are concatenated into one document per novel
text_columns = ['title','description', 'genres','showtype', 'tags', 'related_series', 'language', 'authors']

# Replace missing values with '' BEFORE casting to str. A bare astype(str)
# turns NaN into the literal token "nan", which would then become a shared
# TF-IDF term and make novels with missing fields look similar to each other.
df[text_columns] = df[text_columns].fillna('').astype(str)

# Combine the text columns into a single space-separated string per row
df['text_data'] = df[text_columns].apply(' '.join, axis=1)

# Fit the TF-IDF vectorizer on the combined text and build the document-term matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_data'])


In [11]:
# Step 2: Compute the cosine similarity between items (content-based filtering)
# linear_kernel is a plain dot product; it equals cosine similarity here only
# because TfidfVectorizer L2-normalises each row by default (norm='l2').
# NOTE(review): the result is a dense n x n matrix - with the 11770 rows reported
# by df.info() that is roughly 1.1 GB if stored as float64; confirm memory budget.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [12]:
from fuzzywuzzy import fuzz, process

def get_closest_match(title, df, min_score=90):
    """Return the entry of ``df['title']`` that best fuzzy-matches ``title``.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    df : pandas.DataFrame
        Frame holding the candidate titles in its ``'title'`` column.
    min_score : int, optional
        Lowest fuzzy-match score that still counts as a match. Defaults to 90,
        the threshold this function always used.

    Returns
    -------
    str or None
        The best-matching title, or ``None`` when the query is blank or not a
        string, or when no candidate reaches ``min_score``.
    """
    # A blank or non-string query cannot match anything meaningful; bail out
    # before handing it to the fuzzy matcher.
    if not isinstance(title, str) or not title.strip():
        return None

    best = process.extractOne(title, df['title'])

    # The first two items of the returned tuple are (matched choice, score).
    if best and best[1] >= min_score:
        return best[0]

    return None


In [13]:

def get_recommendations(title, cosine_sim, df, top_n=10):
    """Recommend the novels most similar to the one named ``title``.

    Parameters
    ----------
    title : str
        Title to look up. An exact match in ``df['title']`` is used directly;
        otherwise the title is resolved through ``get_closest_match``.
    cosine_sim : array-like, shape (len(df), len(df))
        Pairwise similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``df``.
    df : pandas.DataFrame
        Frame with a ``'title'`` column.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series or str
        The recommended titles, most similar first, or the string
        ``"No close match found."`` when the title cannot be resolved.
    """
    # Exact titles need no fuzzy search; this also avoids a scan over every title.
    exact_hits = np.flatnonzero((df['title'] == title).to_numpy())
    if exact_hits.size:
        pos = int(exact_hits[0])
    else:
        matched_title = get_closest_match(title, df)
        if not matched_title:
            return "No close match found."
        # Work with the row POSITION, not the index label: cosine_sim and
        # .iloc are positional, so labels are only correct for a RangeIndex.
        pos = int(np.flatnonzero((df['title'] == matched_title).to_numpy())[0])

    scores = np.asarray(cosine_sim[pos]).ravel()

    # Stable descending order keeps ties in their original row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the query novel explicitly instead of assuming it sorts first;
    # a tied duplicate could otherwise push it into the results.
    order = order[order != pos][:top_n]

    return df['title'].iloc[order]


In [14]:
# Example: Get recommendations for a novel
recommended_novels = get_recommendations("Reverend Insanity", cosine_sim, df)
# Prints either the Series of recommended titles or the "No close match found." message
print(recommended_novels)


10792              I Am the Fated Villain
2921                        Grasping Evil
11155                        Sage Monarch
1249                         Supreme Lord
11096                Emperor’s Domination
8601                      Omnipotent Sage
8236              Carefree Path of Dreams
10865                  Infinite Bloodcore
1505     The Regressed Demon Lord is Kind
9211            Worlds’ Apocalypse Online
Name: title, dtype: object


In [15]:
!pip install fuzzywuzzy

