###                                         Support Vector Classifier Algorithm(SVC) For Rating as Target

### Hyperarameter Tuning of Support Vector Classifier(SVC) Using Grid Search


In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

url = "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
data = pd.read_csv(url, header=None, encoding='latin1')
data.drop(data.columns[-1], axis=1, inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language']
data.columns = column_names
data['combined_features'] = data['director_name'] + ' ' +  data['actor_1_name'] + ' ' + data['actor_2_name'] + ' ' + data['actor_3_name'] + ' ' + data['genres'] + ' ' + data['Language'] + ' ' + data['Age Category'].astype(str)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combination'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = tfidf_matrix
y = data['Rating'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
 
# Define SVM Model with Grid Search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}
grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Rating Accuracy using Grid:", accuracy)

def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combination']])
    movie_rating = model.predict(movie_features)[0]
    similar_indices = [i for i, rating in enumerate(data['Rating']) if abs(rating - movie_rating) < 0.5]
    sim_scores = [(i, cosine_sim[idx, i]) for i in similar_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]] 
    return data.iloc[sim_indices]['Title'].tolist()

grid_best_model = grid.best_estimator_

#Recommendation
movie_title = "jumanji"  
recommended_movies = recommend(movie_title, grid_best_model, data, tfidf_vectorizer, cosine_sim)
print(recommended_movies)

Rating Accuracy using Grid: 0.6506666666666666
['dick', 'spider-man 2', 'spider-man 3', 'wimbledon', 'crazy/beautiful', 'elizabethtown', "the cat's meow", 'eternal sunshine of the spotless mind', 'homicide', 'jurassic park iii']


### Hyperarameter Tuning of Support Vector Classifier(SVC) Using Bayesian Search

In [13]:
!pip install optuna
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

url = "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
df.drop(df.columns[-1], axis=1, inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language']
df.columns = column_names
df['combined_features'] = df['director_name'] + ' ' +  df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df['actor_3_name'] + ' ' + df['genres'] + ' ' + df['Language'] + ' ' + df['Age Category'].astype(str)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = tfidf_matrix
y = df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

def objective(trial):
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly"])
    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    model = SVC(C=C, kernel=kernel, gamma=gamma)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    # negative accuracy for maximization by Optuna
    return -accuracy

# Study and run optimization:
study = optuna.create_study()
study.optimize(objective, n_trials=20)
best_params = study.best_params
bayseian_rating_model = SVC(**best_params)
bayseian_rating_model.fit(X_train, y_train)
y_pred = bayseian_rating_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Rating Accuracy using Bayseian Search:", accuracy)
def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_rating = model.predict(movie_features)[0]
    similar_indices = [i for i, rating in enumerate(data['Rating']) if abs(rating - movie_rating) < 0.5]
    sim_scores = [(i, cosine_sim[idx, i]) for i in similar_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]] 
    return data.iloc[sim_indices]['Title'].tolist()

#Recommendation
movie_title = 'jumanji'
recommended_movies = recommend(movie_title, bayseian_rating_model, df, tfidf_vectorizer, cosine_sim)
print(recommended_movies)


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable


[I 2024-03-05 00:13:27,529] A new study created in memory with name: no-name-5befa294-106c-4c39-b0e8-52857adf4cb3
[I 2024-03-05 00:13:49,171] Trial 0 finished with value: -0.6506666666666666 and parameters: {'C': 0.34837506447216693, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 0 with value: -0.6506666666666666.
[I 2024-03-05 00:14:23,067] Trial 1 finished with value: -0.6506666666666666 and parameters: {'C': 0.9340356825274599, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: -0.6506666666666666.
[I 2024-03-05 00:15:09,364] Trial 2 finished with value: -0.6506666666666666 and parameters: {'C': 0.003404863304124994, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 0 with value: -0.6506666666666666.
[I 2024-03-05 00:16:21,860] Trial 3 finished with value: -0.6506666666666666 and parameters: {'C': 0.010182601836105419, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: -0.6506666666666666.
[I 2024-03-05 00:17:57,459] Trial 4 finished with value: -0.650

Rating Accuracy using Bayseian Search: 0.6512
['small soldiers', 'spider-man 2', 'spider-man 3', 'wimbledon', 'crazy/beautiful', 'elizabethtown', "the cat's meow", 'eternal sunshine of the spotless mind', 'homicide', 'jurassic park iii']


###  Hyperparameter Tuning of Support Vector Classifier(SVC) Using Random Search

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import uniform
 
url = "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
df.drop(df.columns[-1], axis=1, inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language']
df.columns = column_names
df['combined_features'] = df['director_name'] + ' ' +  df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df['actor_3_name'] + ' ' + df['genres'] + ' ' + df['Language'] + ' ' + df['Age Category'].astype(str)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
 
X = tfidf_matrix
y = df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
param_distributions = {'C': [0.1, 1, 10, 50, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']}
random_search = RandomizedSearchCV(SVC(), param_distributions, cv=5, n_iter=20)
random_search.fit(X_train, y_train)
y_pred = random_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_rating = model.predict(movie_features)[0]
    similar_indices = [i for i, rating in enumerate(data['Rating']) if abs(rating - movie_rating) < 0.5]
    sim_scores = [(i, cosine_sim[idx, i]) for i in similar_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]]
    return data.iloc[sim_indices]['Title'].tolist()

best_model = random_search.best_estimator_
movie_title = 'jumanji'
recommended_movies = recommend(movie_title, best_model, df, tfidf_vectorizer, cosine_sim)
print(recommended_movies)

Accuracy: 0.648
['small soldiers', 'spider-man 2', 'spider-man 3', 'wimbledon', 'crazy/beautiful', 'elizabethtown', "the cat's meow", 'eternal sunshine of the spotless mind', 'homicide', 'jurassic park iii']


### Hyperparameter Tuning of Support Vector Classifier(SVC) Using Golden Section Search Algorithm

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
 
url= "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
movies_data = pd.read_csv(url, header=None, encoding='latin1')
movies_data.drop(movies_data.columns[[-1]], axis=1, inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating',  'Age Category', 'Language']
movies_data.columns = column_names
movies_data['combined_features'] = movies_data['director_name'] + ' ' +  movies_data['actor_1_name'] + ' ' + movies_data['actor_2_name'] + ' ' + movies_data['actor_3_name'] + ' ' + movies_data['genres'] + ' ' +  movies_data['Age Category'].astype(str) + ' ' + movies_data['Language'] 

X = movies_data.drop('Title', axis=1)
y = movies_data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Define transformers for numeric and categorical data
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
movies_data['combined_features'] = movies_data.apply(lambda row: ' '.join([str(row['genres']), str(row['director_name']), str(row['actor_1_name']), str(row['actor_2_name']), str(row['actor_3_name'])]), axis=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

#golden ratio for parameter optimization
phi = (1 + 5**0.5) / 2
a, b = 0.1, 100
tolerance = 1e-4  #tolerance for convergence
iterations = 0
while True:
    c = b - (b - a) / phi  # Calculate the midpoint c
    d = a + (b - a) / phi  # Calculate the other point d
    # Train SVM models with parameters c and d
    model_c = SVC(C=c)
    model_d = SVC(C=d)
    model_c.fit(X_train_preprocessed, y_train)
    model_d.fit(X_train_preprocessed, y_train)
    # Evaluate models on the test set and update intervals
    accuracy_c = accuracy_score(y_test, model_c.predict(X_test_preprocessed))
    accuracy_d = accuracy_score(y_test, model_d.predict(X_test_preprocessed))
    iterations += 1
    if accuracy_c >= accuracy_d:
        a = c  # If c performs better, move the left bound
    else:
        b = d  # If d performs better, move the right bound
    # Stop when the interval is small enough
    if abs(b - a) < tolerance:
        break
# Use the best C parameter found by golden section search algorithm for final model
best_c = (a + b) / 2

model = SVC()
model.fit(X_train_preprocessed, y_train)
y_pred = model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)
print("Rating Accuracy using Golden Section Search Algorithm:", accuracy)
 
def recommend_movies_by_title(model, X_train_preprocessed, y_train_binary, movies_data, movie_title):
    movie_row = movies_data.loc[movies_data['Title'] == movie_title]
    if movie_row.empty:
        print("Movie not found in the dataset.")
        return None
    movie_features = movie_row.drop('Title', axis=1)
    movie_features_preprocessed = preprocessor.transform(movie_features)
    rating = model.predict(movie_features_preprocessed)[0]
    similarity_scores = cosine_sim[movie_row.index]
    similar_movies_indices = similarity_scores.argsort()[0][-11:-1]
    similar_movies = movies_data.iloc[similar_movies_indices]['Title']
    return rating, similar_movies
 
# Recommendation
input_movie_title = "jumanji"
rating, recommended_movies = recommend_movies_by_title(model, X_train_preprocessed, y_train, movies_data, input_movie_title)
if recommended_movies is not None:
    print("Rating for {}: {}".format(input_movie_title, rating))
    print("Recommended Movies:")
    print(recommended_movies)

Rating Accuracy using Golden Section Search Algorithm: 1.0
Rating for jumanji: 4
Recommended Movies:
2759                          hook
6798                     wimbledon
206     interview with the vampire
1950                   october sky
4332                    spider-man
1607                 the rocketeer
6599                  spider-man 2
9920                  spider-man 3
1460                small soldiers
2156                          dick
Name: Title, dtype: object
