# Simplified Anime Rating Model

### Import necessary libraries

In [3]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to the plots drawn later in the notebook
sns.set()

# Show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None
# Cap the number of displayed rows at 200
pd.options.display.max_rows = 200

# to split the data into train and test
from sklearn.model_selection import train_test_split

# to build linear regression_model
from sklearn.linear_model import LinearRegression

# to check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
# Load the anime dataset into a DataFrame
# (adjust the relative path to match your own directory layout)
import pandas as pd
df = pd.read_csv('Data/anime_data.csv')

In [5]:
# Remove the 'title' column from the working frame
df.drop(columns=['title'], inplace=True)

In [6]:
# Remove the 'description' column from the working frame
df.drop(columns=['description'], inplace=True)

The duration data is heavily right-skewed. We will leave the outliers in place for now and impute null values with the median.

In [8]:
# IMPUTE NULLS AS MEDIAN VALUE
def impute_median(df, column):
    """Fill the missing values of a numeric column with that column's median.

    The DataFrame is modified in place and is also returned, so both
    ``impute_median(df, col)`` and ``df = impute_median(df, col)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in. It is left untouched when
        the column is non-numeric, has no nulls, or has no non-null values
        from which a median could be computed.
    """
    # Only numeric columns have a meaningful median
    if not pd.api.types.is_numeric_dtype(df[column]):
        print(f"Skipped non-numeric column: {column}")
        return df

    # Count the nulls up front so the number imputed can be reported
    null_count_before = df[column].isnull().sum()
    if null_count_before == 0:
        print(f"Skipped. No nulls to impute in: {column}")
        return df

    # Series.median() ignores NaNs; it is itself NaN only when every value is null
    median = df[column].median()
    if pd.isna(median):
        print(f"Skipped. No non-null values to compute a median in: {column}")
        return df

    # Assign the filled Series back to the frame. Calling
    # df[column].fillna(..., inplace=True) operates on an intermediate object,
    # which pandas warns about and which does not update df under Copy-on-Write.
    df[column] = df[column].fillna(median)

    print(f"Column '{column}': Imputed {null_count_before} nulls with median value {median}.")
    return df

# Example usage
# df = impute_median(df, 'COLUMN_TO_IMPUTE')


In [9]:
# Fill the missing 'duration' values with the column median
df = impute_median(df=df, column='duration')

Column 'duration': Imputed 4636 nulls with median value 8.0.


In [10]:
# Recode the 'is_missing' placeholder in mediaType as 'TV'
df['mediaType'] = df['mediaType'].replace({'is_missing': 'TV'})
# Remove the 'ongoing' column from the working frame
df.drop(columns=['ongoing'], inplace=True)

In [11]:
# Move the target column to the front, keeping the other columns in their existing order
column_name = 'rating'
columns = [column_name]
columns.extend(col for col in df.columns if col != column_name)
df = df[columns]

In [12]:
# Collect the names of the columns that do not contain "tag"
corr_cols = list(filter(lambda name: "tag" not in name, df.columns))
print(corr_cols)

['rating', 'mediaType', 'eps', 'duration', 'sznOfRelease', 'years_running', 'studio_primary', 'studios_colab', 'contentWarn', 'watched', 'watching', 'wantWatch', 'dropped', 'votes']


#### Encode Features 'mediaType', 'sznOfRelease', and 'studio_primary'

In [14]:
# Initialize a global dictionary to store mappings for each encoded column
column_mappings = {}

# Encode Categorical Features
def encode_column(df, column_name, drop_orig_columns=False):
    """Add an integer-coded version of a categorical column to ``df``.

    Every distinct value of ``df[column_name]`` is assigned its position in
    the sorted list of distinct values. The codes are written to a new column
    named ``encoded_<column_name>``, and the value-to-code mapping is stored
    in the module-level ``column_mappings`` dict under that encoded name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    column_name : str
        Name of the column to encode.
    drop_orig_columns : bool, default False
        When true, the original column is removed after encoding.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in. If ``column_name`` is not
        a column of ``df``, a message is printed and ``df`` is returned
        unchanged.
    """
    # Nothing to do when the requested column is absent
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in DataFrame. Skipping...")
        return df

    # Sort the distinct values so the same data always yields the same codes
    unique_values = sorted(df[column_name].unique())
    mapping_dict = {val: idx for idx, val in enumerate(unique_values)}

    encoded_column_name = f'encoded_{column_name}'

    # Series.map performs a plain dictionary lookup per value and returns
    # integer codes directly; Series.replace with a str -> int dict depends on
    # pandas downcasting the object result, which is deprecated.
    df[encoded_column_name] = df[column_name].map(mapping_dict)

    if drop_orig_columns:
        df.drop(columns=[column_name], inplace=True)

    # Keep the mapping so the codes can be translated back later
    column_mappings[encoded_column_name] = mapping_dict

    # Report using this function's own argument rather than a global name
    # that only exists when the caller happens to loop with a `column` variable
    print(f"Created {encoded_column_name}")
    print(f"Encoded as {mapping_dict}")
    print()

    return df

# Example usage
# Assuming 'df' is your DataFrame and 'column_to_encode' is the column you want to encode
# df = encode_column(df, 'education_level')
# print(column_mappings['education_level'])  # Access the stored mapping


In [15]:
# Categorical columns to convert into integer codes
Encode_Categorical_Features = ['mediaType', 'sznOfRelease', 'studio_primary']

# Encode each column in turn; drop_orig_columns=True removes the original
# column once its encoded_<name> counterpart has been added
for column in Encode_Categorical_Features:
    df = encode_column(df, column, drop_orig_columns=True)

Created encoded_mediaType
Encoded as {'DVD Special': 0, 'Movie': 1, 'Music Video': 2, 'OVA': 3, 'Other': 4, 'TV': 5, 'TV Special': 6, 'Web': 7}

Created encoded_sznOfRelease
Encoded as {'Fall': 0, 'Spring': 1, 'Summer': 2, 'Winter': 3, 'is_missing': 4}

Created encoded_studio_primary
Encoded as {'A-1 Pictures': 0, 'AIC': 1, 'Bones': 2, 'DLE': 3, 'GONZO': 4, 'J.C. Staff': 5, 'Kyoto Animation': 6, 'MADHOUSE': 7, 'Nippon Animation': 8, 'OLM': 9, 'Others': 10, 'Production I.G': 11, 'Shaft': 12, 'Shin-Ei Animation': 13, 'Studio Deen': 14, 'Studio Pierrot': 15, 'Sunrise': 16, 'TMS Entertainment': 17, 'Tatsunoko Production': 18, 'Toei Animation': 19, 'XEBEC': 20, 'is_missing': 21}



In [64]:
# Preview the first five rows of the prepared frame
df.head(n=5)

Unnamed: 0,rating,eps,duration,years_running,studios_colab,contentWarn,watched,watching,wantWatch,dropped,votes,tag_Based_on_a_Manga,tag_Comedy,tag_Action,tag_Fantasy,tag_Sci_Fi,tag_Shounen,tag_Original_Work,tag_Non_Human_Protagonists,tag_Drama,tag_Adventure,tag_Family_Friendly,tag_Short_Episodes,tag_School_Life,tag_Romance,tag_Shorts,tag_Slice_of_Life,tag_Seinen,tag_Supernatural,tag_Magic,tag_Animal_Protagonists,tag_Ecchi,tag_Mecha,tag_Based_on_a_Light_Novel,tag_CG_Animation,tag_Superpowers,tag_Others,tag_missing,encoded_mediaType,encoded_sznOfRelease,encoded_studio_primary
0,4.702,64,8.0,1,0,1,103707.0,14351,25810,2656,86547,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,2
1,4.663,1,107.0,0,0,0,58831.0,1453,21733,124,43960,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,4,10
2,4.661,1,130.0,0,0,1,45892.0,946,17148,132,33752,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,6
3,4.66,10,8.0,0,0,0,25134.0,2183,8082,167,17422,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,11
4,4.65,10,8.0,0,0,1,21308.0,3217,7864,174,15789,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,10


## Model Building

### Define independent and dependent variables

In [18]:
# Features: every column except the target
X = df.drop(columns=["rating"])
# Target: the rating column
y = df["rating"]

In [19]:
# Dimensions of the feature matrix as (rows, columns)
X.shape

(12101, 40)

### Split the data into train and test

In [21]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [22]:
# Report how many rows landed in each partition
print("Number of rows in train data =", len(x_train))
print("Number of rows in test data =", len(x_test))

Number of rows in train data = 8470
Number of rows in test data = 3631


### Fitting a linear model

In [41]:
# Fit an ordinary least squares regression on the training partition
# (fit() returns the estimator itself, so construction and fitting can be chained)
lin_reg_model = LinearRegression().fit(x_train, y_train)
lin_reg_model

In [45]:
# Persist the fitted model to disk so it can be reloaded without retraining
import joblib
joblib.dump(value=lin_reg_model, filename='anime_model.joblib')

['anime_model.joblib']