###                                         Support Vector Classifier Algorithm(SVC) For Subscription as Target

### Hyperparameter Tuning of Support Vector Classifier(SVC) Using Grid Search

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

url = "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
data = pd.read_csv(url, header=None, encoding='latin1')
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language', 'Subscription']
data.columns = column_names
data['combined_features'] = data['director_name'] + ' ' +  data['actor_1_name'] + ' ' + data['actor_2_name'] + ' ' + data['actor_3_name'] + ' ' + data['genres'] + ' ' + data['Language'] + ' ' + data['Age Category'].astype(str) + ' '  + data['Subscription'].astype(str)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = tfidf_matrix
y_subscription =  data['Subscription']
X_train, X_test, y_train_subscription, y_test_subscription = train_test_split(X, y_subscription, test_size=0.25, random_state=42)
 
# Define SVM Model with Grid Search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}
grid_subscription = GridSearchCV(SVC(), param_grid, cv=5)
grid_subscription.fit(X_train, y_train_subscription)
y_pred_subscription = grid_subscription.predict(X_test)
accuracy_subscription = accuracy_score(y_test_subscription, y_pred_subscription)
print("Subscription Accuracy:", accuracy_subscription)

def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_label = model.predict(movie_features)[0]
    subscription_indices = data[data['Subscription'] == movie_label].index
    sim_scores = [(i, cosine_sim[idx, i]) for i in subscription_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]] 
    return data.iloc[sim_indices]['Title'].tolist()

#Recommendation
movie_title = 'jumanji'
recommended_movies = recommend(movie_title, grid_subscription, data, tfidf_vectorizer, cosine_sim)
print("Top 10 recommendations for the movie", movie_title, ":", recommended_movies)

Subscription Accuracy: 0.332
Top 10 recommendations for the movie jumanji : ['spider-man 2', 'october sky', 'wimbledon', 'crazy/beautiful', 'eternal sunshine of the spotless mind', 'honey, i shrunk the kids', 'get over it', 'all i wanna do', 'the night listener', 'the crow: salvation']


### Hyperparameter Tuning of Support Vector Classifier(SVC) Using Bayesian Optimization

In [2]:
!pip install optuna
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

url = "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language', 'Subscription']
df.columns = column_names
df['combined_features'] = df['director_name'] + ' ' +  df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df['actor_3_name'] + ' ' + df['genres'] + ' ' + df['Language'] + ' ' + df['Age Category'].astype(str) + ' ' + df['Subscription'].astype(str)

#  TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = tfidf_matrix
y_subscription =  df['Subscription']
X_train, X_test, y_train_subscription, y_test_subscription = train_test_split(X, y_subscription, test_size=0.25, random_state=42)
def objective_subscription(trial):
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly"])
    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    model = SVC(C=C, kernel=kernel, gamma=gamma)
    model.fit(X_train, y_train_subscription)
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test_subscription, pred)
    # negative accuracy for maximization by Optuna
    return -accuracy
study_subscription = optuna.create_study()
study_subscription.optimize(objective_subscription, n_trials=10)
best_params_subscription = study_subscription.best_params 
bayseian_subscription_model = SVC(**best_params_subscription)
bayseian_subscription_model.fit(X_train, y_train_subscription)

y_pred_subscription = bayseian_subscription_model.predict(X_test)
accuracy_subscription = accuracy_score(y_test_subscription, y_pred_subscription)
print("Subscription Accuracy using Bayseian Search:", accuracy_subscription)

def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_label = model.predict(movie_features)[0]
    subscription_indices = data[data['Subscription'] == movie_label].index
    sim_scores = [(i, cosine_sim[idx, i]) for i in subscription_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]]
    return data.iloc[sim_indices]['Title'].tolist()
movie_title = 'jumanji'
recommended_movies = recommend(movie_title, bayseian_subscription_model, df, tfidf_vectorizer, cosine_sim)
print(recommended_movies)

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-15 12:20:01,806] A new study created in memory with name: no-name-a4da1fe8-c78e-4d66-9b1c-d0ba69b185db
[I 2024-03-15 12:20:29,923] Trial 0 finished with value: -0.332 and parameters: {'C': 0.009274717636934264, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: -0.332.
[I 2024-03-15 12:20:58,678] Trial 1 finished with value: -0.332 and parameters: {'C': 6.685664352206669, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: -0.332.
[I 2024-03-15 12:21:28,024] Trial 2 finished with value: -0.332 and parameters: {'C': 0.5994288545840594, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: -0.332.
[I 2024-03-15 12:22:27,716] Trial 3 finished with value: -0.3237333333333333 and parameters: {'C': 462.0862215351332, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with 

Subscription Accuracy using Bayseian Search: 0.332
['spider-man 2', 'october sky', 'wimbledon', 'crazy/beautiful', 'eternal sunshine of the spotless mind', 'honey, i shrunk the kids', 'get over it', 'all i wanna do', 'the night listener', 'the crow: salvation']


### Hyperparameter Tuning of Support Vector Classifier(SVC) Using Random Search

In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
 
url = "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating','Age category', 'Language', 'Subscription']
df.columns = column_names
df['Rating'] = df['Rating'].replace(np.nan, 3)
df['combined_features'] =  df['director_name'] + ' ' + df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df['actor_3_name'] + ' ' + df['genres'] + ' ' +  df['Language'] + ' ' + df['Age category'].astype(str) + ' ' + df['Subscription'].astype(str)
 
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = tfidf_matrix
y = df['Subscription']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_distributions = {'C': [0.1, 1, 10, 50, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']}
random_search = RandomizedSearchCV(SVC(), param_distributions, cv=5, n_iter=15)  # Adjust n_iter as needed
random_search.fit(X_train, y_train)
y_pred = random_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Subscription Accuracy using Random Search:", accuracy)

def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_label = model.predict(movie_features)[0]
    subscription_indices = data[data['Subscription'] == movie_label].index
    sim_scores = [(i, cosine_sim[idx, i]) for i in subscription_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]]
    return data.iloc[sim_indices]['Title'].tolist()

# Recommendations
movie_title = 'jumanji'
recommended_movies = recommend(movie_title, random_search, df, tfidf_vectorizer, cosine_sim)
print("Recommended Movies:")
print(recommended_movies)

Subscription Accuracy using Random Search: 0.33266666666666667
Recommended Movies:
['spider-man 2', 'october sky', 'wimbledon', 'crazy/beautiful', 'eternal sunshine of the spotless mind', 'honey, i shrunk the kids', 'get over it', 'all i wanna do', 'the night listener', 'the crow: salvation']


### Hyperparameter Tuning of Support Vector Classifier(SVC) Using Golden Section Search Algorithm

In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

url = "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
movies_data = pd.read_csv(url, header=None, encoding='latin1')
movies_data.drop(movies_data.columns[[-1,-5]], axis=1, inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'Rating', 'Age Category', 'Language']
movies_data.columns = column_names
movies_data['Subscription'] = np.random.choice(['Basic','Standard','Premium'], size=len(movies_data))
movies_data['combined_features'] = movies_data['director_name'] + ' ' + movies_data['actor_1_name'] + ' ' + movies_data['actor_2_name'] + ' ' + movies_data['actor_3_name'] + ' ' + movies_data['genres'] + ' ' + movies_data['Language'] + ' ' + movies_data['Age Category'].astype(str)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = movies_data.drop(['Title'], axis=1)
y = movies_data['Subscription']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

#Golden ratio for parameter optimization
phi = (1 + 5**0.5) / 2
a, b = 0.1, 100
tolerance = 1e-4  #tolerance for convergence
 
# Golden section search loop to find the optimal C parameter for the SVM model
iterations = 0
while True:
    print("interval", abs(b - a) )
    c = b - (b - a) / phi  # Calculate the midpoint c
    d = a + (b - a) / phi  # Calculate the other point d
    # Train SVM models with parameters c and d
    model_c = SVC(C=c)
    model_d = SVC(C=d)
    model_c.fit(X_train_preprocessed, y_train)
    model_d.fit(X_train_preprocessed, y_train)
    # Evaluate models on the test set and update intervals
    accuracy_c = accuracy_score(y_test, model_c.predict(X_test_preprocessed))
    accuracy_d = accuracy_score(y_test, model_d.predict(X_test_preprocessed))
    iterations += 1
    if accuracy_c >= accuracy_d:
        a = c  # If c performs better, move the left bound
    else:
        b = d  # If d performs better, move the right bound
    # Stop when the interval is small enough
    if abs(b - a) < tolerance:
        break
# Use the best C parameter found by golden section search for final model
best_c = (a + b) / 2

model = SVC(C = best_c)
model.fit(X_train_preprocessed, y_train)
y_pred = model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)
print("Subscription Accuracy using Golden Section Search:", accuracy)

def recommend_movies_by_title(model, X_train_preprocessed, y_train_binary, movies_data, movie_title):
    movie_row = movies_data.loc[movies_data['Title'] == movie_title]
    if movie_row.empty:
        print("Movie not found in the dataset.")
        return None
    movie_features = movie_row.drop('Title', axis=1)
    movie_features_preprocessed = preprocessor.transform(movie_features)
    subscription = model.predict(movie_features_preprocessed)[0]
    similarity_scores = cosine_sim[movie_row.index]
    similar_movies_indices = similarity_scores.argsort()[0][-11:-1] 
    similar_movies = movies_data.iloc[similar_movies_indices]['Title']
    return subscription, similar_movies
 
input_movie_title = "jumanji"
subscription, recommended_movies = recommend_movies_by_title(model, X_train_preprocessed, y_train, movies_data, input_movie_title)
if recommended_movies is not None:
    print("Rating for {}: {}".format(input_movie_title, subscription))
    print("Recommended Movies:")
    print(recommended_movies)

interval 99.9
interval 61.7415954761145
interval 38.1584045238855
interval 23.58319095222899
interval 14.57521357165652
interval 9.007977380572484
interval 5.567236191084035
interval 3.4407411894884348
interval 2.1264950015956003
interval 1.3142461878928486
interval 0.8122488137027659
interval 0.5019973741900969
interval 0.3102514395126832
interval 0.19174593467741374
interval 0.11850550483525524
interval 0.0732404298421585
interval 0.04526507499311094
interval 0.027975354849061773
interval 0.01728972014406338
interval 0.010685634704998392
interval 0.0066040854390507775
interval 0.004081549265947615
interval 0.0025225361731173734
interval 0.0015590130928444523
interval 0.000963523080287132
interval 0.0005954900125573204
interval 0.00036803306771560074
interval 0.00022745694482750878
interval 0.0001405761228738811
Subscription Accuracy using Golden Section Search: 1.0
Rating for jumanji: Basic
Recommended Movies:
2759                          hook
6798                     wimbledon
206 