# Training du dataset

In [149]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import numpy as np

In [138]:
df = pd.read_csv('../dataset/Anime_data_cleaned.csv')
df.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating
0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81
1,5,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']","Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41
2,6,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...","Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31


In [42]:
df['Type'].dtype

dtype('O')

In [139]:
features = ["Title", "Genre", "Synopsis", "Type", "Producer", "Studio"]
X = df[features]
y = df['Rating']

In [140]:
title_vectorizer = CountVectorizer(stop_words='english')
synopsis_vectorizer = TfidfVectorizer(stop_words='english')

title_matrix = title_vectorizer.fit_transform(X['Title'])
synopsis_matrix = synopsis_vectorizer.fit_transform(X['Synopsis'])

# Obtenir les noms des caractéristiques à partir des vectoriseurs
title_feature_names = title_vectorizer.get_feature_names_out()
synopsis_feature_names = synopsis_vectorizer.get_feature_names_out()

X['Genre'] = pd.factorize(X['Genre'])[0]
X['Type'] = pd.factorize(X['Type'])[0]
X['Studio'] = pd.factorize(X['Studio'])[0]
X['Producer'] = pd.factorize(X['Producer'])[0]


lb_encoder = LabelEncoder()
genre_encoded = lb_encoder.fit_transform(X['Genre'])
type_encoded = lb_encoder.fit_transform(X['Type'])
studio_encoded = lb_encoder.fit_transform(X['Studio'])
producer_encoded = lb_encoder.fit_transform(X['Producer'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Genre'] = pd.factorize(X['Genre'])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Type'] = pd.factorize(X['Type'])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Studio'] = pd.factorize(X['Studio'])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [127]:
# Save the model into a numpy array
np.save('../models/categorical_encoded.npy', lb_encoder.classes_)

In [104]:
print(f"title_matrix shape: {title_matrix.shape}")
print(f"synopsis_matrix shape: {synopsis_matrix.shape}")
print(f"genre_encoded shape: {genre_encoded.shape}")
print(f"type_encoded shape: {type_encoded.shape}")
print(f"studio_encoded shape: {studio_encoded.shape}")
print(f"producer_encoded shape: {producer_encoded.shape}")

title_matrix shape: (4600, 5943)
synopsis_matrix shape: (4600, 27046)
genre_encoded shape: (4600,)
type_encoded shape: (4600,)
studio_encoded shape: (4600,)
producer_encoded shape: (4600,)


In [141]:
X_vector = pd.concat([pd.DataFrame(title_matrix.toarray(), columns=title_feature_names), 
                      pd.DataFrame(synopsis_matrix.toarray(), columns=synopsis_feature_names)], axis=1)

X_categorical = pd.concat([pd.DataFrame(X['Genre'], columns=['Genre']), 
                           pd.DataFrame(X['Type'], columns=['Type']),
                           pd.DataFrame(X['Studio'], columns=['Studio']),
                           pd.DataFrame(X['Producer'], columns=['Producer'])], axis=1)

In [106]:
X_vector.shape

(4600, 32989)

In [107]:
X_categorical.shape

(4600, 4)

In [86]:
X_vector.head(2)

Unnamed: 0,00,003,0079,0080,0083,009,0096,01,02,03,...,小沢昭巳,川村元気,文さん,益子悠紀,眉村,神宮寺葉子,謎の老婆,道原かつみ,鈴森なんでも相談所,麻城ゆう
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
X_categorical.head(3)

Unnamed: 0,Genre,Type,Studio,Producer
0,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",TV,['Sunrise'],['Bandai Visual']
1,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']",Movie,['Bones'],"['Sunrise', 'Bandai Visual']"
2,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...",TV,['Madhouse'],['Victor Entertainment']


In [142]:
X = pd.concat([X_vector, X_categorical], axis=1)

In [109]:
X.head(2)

Unnamed: 0,00,003,0079,0080,0083,009,0096,01,02,03,...,眉村,神宮寺葉子,謎の老婆,道原かつみ,鈴森なんでも相談所,麻城ゆう,Genre,Type,Studio,Producer
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1


In [143]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [144]:
model = RandomForestRegressor(n_estimators=100, random_state=42)


In [145]:

model.fit(X_train, y_train)


In [146]:
y_pred = model.predict(X_test)

In [150]:
r2_score = r2_score(y_test, y_pred)
mean_squared_error = mean_squared_error(y_test, y_pred)
print(f'R2_score: {r2_score}')
print(f'mean squared error: {mean_squared_error}')

R2_score: 0.35041985376168194
mean squared error: 0.4527226866159478


In [115]:
import joblib
joblib.dump(title_vectorizer,"../models/title_vectorizer.pkl")
joblib.dump(synopsis_vectorizer, "../models/synopsis_vectorize.pkl")

['../models/synopsis_vectorize.pkl']

In [151]:
joblib.dump(model, "../models/rf_model.pkl")

['../models/rf_model.pkl']