<a href="https://colab.research.google.com/github/Hemant2388/Recipe-Rating-Prediction-A-Machine-Learning-Approach/blob/main/recipe_rating_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition data through kagglehub and keep the call's return
# value (presumably the local download location -- TODO confirm against the
# kagglehub documentation).
recipe_for_rating_predict_food_ratings_using_ml_path = kagglehub.competition_download('recipe-for-rating-predict-food-ratings-using-ml')

# Signal that the download call returned without raising.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file under the Kaggle input directory, one full
# path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition CSV files (training data, test data and the
# sample file) in that order.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv',
        "/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv",
        "/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/sample.csv",
    )
)

In [None]:
train.info()

In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

## Train Test Split

In [None]:
# Separate the feature columns (X) from the target column 'Rating' (y).
X, y = train.drop(columns=['Rating']), train['Rating']

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state=42)

## 1. Data Cleaning

In [None]:
xtrain.info()

In [None]:
xtrain.isnull().sum()
# xtest.isnull().sum()

In [None]:
# Replace missing review text with the placeholder string 'NA' in both
# splits; all other columns are left untouched.
xtrain = xtrain.fillna({'Recipe_Review': 'NA'})
xtest = xtest.fillna({'Recipe_Review': 'NA'})

In [None]:
# xtrain.duplicated().sum()
xtest.duplicated().sum()

## 2. Exploratory Data Analysis

In [None]:
ytrain.value_counts()
# ytest.value_counts()

In [None]:
# data is highly imbalanced

In [None]:
xtrain.head()

In [None]:
# The textual columns, apart from CommentID and UserID, don't seem to be contributing much

In [None]:
# Drop the same set of identifier/metadata columns from both splits.
xtrain, xtest = (
    frame.drop(columns=['RecipeName', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp'])
    for frame in (xtrain, xtest)
)

In [None]:
# RecipeName, RecipeCode, RecipeNumber seems to be redundant as they are all
# a unique identifier for a specific dish

In [None]:
sns.pairplot(xtrain)

In [None]:
# No two columns are highly correlated, and the data has very few outliers

In [None]:

# Split each frame into the free-text review column and the remaining columns.
xtrain_text, xtest_text = xtrain['Recipe_Review'], xtest['Recipe_Review']
xtrain_numerical, xtest_numerical = (
    frame.drop(columns=['Recipe_Review']) for frame in (xtrain, xtest)
)

In [None]:
sns.heatmap(xtrain_numerical.corr(), annot=True)

In [None]:
# Since ThumbsUpCount and ThumbsDownCount have a high positive correlation with BestScore, I am dropping those two columns

In [None]:
# Remove the two thumbs-count columns from both numerical frames.
xtrain_numerical, xtest_numerical = (
    features.drop(columns=['ThumbsUpCount', 'ThumbsDownCount'])
    for features in (xtrain_numerical, xtest_numerical)
)

In [None]:
xtrain_numerical['RecipeNumber'].unique().shape

In [None]:
xtrain_numerical['RecipeCode'].unique().shape


In [None]:
# Remove the RecipeCode column from both numerical frames.
xtrain_numerical, xtest_numerical = (
    features.drop(columns=['RecipeCode'])
    for features in (xtrain_numerical, xtest_numerical)
)

In [None]:
# Peek at one raw review. Positional indexing is used because xtrain_text
# keeps the original row labels of the shuffled train/test split, so the
# label 3 is not guaranteed to be present and `xtrain_text[3]` could raise
# KeyError.
xtrain_text.iloc[3]

In [None]:
# Lower-case every review in both splits.
xtrain_text, xtest_text = (
    reviews.str.lower() for reviews in (xtrain_text, xtest_text)
)

In [None]:
import regex

In [None]:
# Strip numeric HTML character references (the "&#<digits>;" form) from
# every review in both splits.
xtrain_text, xtest_text = (
    reviews.map(lambda review: regex.sub(r'&#[0-9]+;', '', review))
    for reviews in (xtrain_text, xtest_text)
)

In [None]:
# Delete every character that is not an ASCII letter, a digit or whitespace.
xtrain_text, xtest_text = (
    reviews.map(lambda review: regex.sub(r'[^a-zA-Z0-9\s]', '', review))
    for reviews in (xtrain_text, xtest_text)
)

In [None]:
# Split each review on whitespace into a list of tokens.
xtrain_text = xtrain_text.apply(str.split)
xtest_text = xtest_text.apply(str.split)

In [None]:
# Drop every token that appears in scikit-learn's English stop-word set.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

xtrain_text, xtest_text = (
    token_lists.apply(
        lambda words: [w for w in words if w not in ENGLISH_STOP_WORDS]
    )
    for token_lists in (xtrain_text, xtest_text)
)

In [None]:
# Re-assemble each token list into a single space-separated string.
xtrain_text = xtrain_text.apply(' '.join)
xtest_text = xtest_text.apply(' '.join)

In [None]:
# Scaling Numerical Features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on the training split only, then reuse the fitted scaler on
# the held-out split so both are transformed with the same parameters.
xtrain_numerical = scaler.fit_transform(xtrain_numerical)
xtest_numerical = scaler.transform(xtest_numerical)

In [None]:
xtrain_numerical

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only and apply the fitted
# vectorizer to the held-out reviews; .toarray() turns each result into a
# dense array.
xtrain_vectorized = vectorizer.fit_transform(xtrain_text).toarray()
xtest_vectorized = vectorizer.transform(xtest_text).toarray()

In [None]:
xtest_numerical.shape, xtest_vectorized.shape

In [None]:
# Place the scaled numerical columns and the TF-IDF columns side by side.
# NOTE(review): the feature-selection cell further down is fitted on
# xtrain_vectorized / xtest_vectorized, not on these combined arrays --
# confirm whether the numerical features were meant to be used.
xtrain_com = np.hstack([xtrain_numerical, xtrain_vectorized])
xtest_com = np.hstack([xtest_numerical, xtest_vectorized])

## Feature Engineering

In [None]:
# Since the dataset has too many new columns after vectorizing, which would
# cost a lot of computing time during hyperparameter tuning to find suitable models,
# I am going through a feature selection process to save time and try different models.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
# Score the TF-IDF columns with a random forest and keep only the columns
# whose importance reaches the mean importance (threshold='mean').
# random_state is pinned so the selected columns -- and every score computed
# from them -- are reproducible between runs, in line with the fixed seeds
# already used for the train/test split and the XGBoost model.
rf_classifier = RandomForestClassifier(random_state=42)
selector = SelectFromModel(rf_classifier, threshold='mean')
# Fit the selector on the training split only; the held-out split is merely
# reduced to the same columns.
xtrain_selected = selector.fit_transform(xtrain_vectorized, ytrain)
xtest_selected = selector.transform(xtest_vectorized)

In [None]:
xtrain_selected.shape

## Model Building

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
mb = MultinomialNB()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
lr = LogisticRegression()
svc = SVC(kernel='linear', C=1.0)

In [None]:
#  lr.fit(xtrain_selected, ytrain)

In [None]:
#y_pred = lr.predict(xtest_selected)

In [None]:
#accuracy_score(ytest, y_pred)

<em><strong>Accuracy score for different model with their default parameters</strong></em><br>
MultinomialNB Score= 0.7554985337243402 <br>
RandomForestClassifier Score = 0.7642961876832844 <br>
GradientBoostingClassifier Score = 0.7591642228739003 <br>
LogisticRegression = 0.7639296187683284 <br>
LogisticRegression after feature selection = 0.7650293255131965

## Model 1 : Using RandomForestClassifier

In [None]:
rf.fit(xtrain_selected, ytrain)

In [None]:
rf.score(xtest_selected, ytest)

In [None]:
n_estimators_values = [90,100, 120, 130]
max_depth_values = [None]
min_samples_split_values = [2, 5, 10, 15]
min_samples_leaf_values = [1, 2, 4]

In [None]:
# for i in n_estimators_values:
#     for j in max_depth_values:
#         for k in min_samples_split_values:
#             for p in min_samples_leaf_values:
#                 rf_model = RandomForestClassifier(n_estimators=i, max_depth=j, min_samples_split=k, min_samples_leaf=p)
#                 rf_model.fit(xtrain_selected, ytrain)
#                 y_pred = rf_model.predict(xtest_selected)
#                 accuracy = accuracy_score(ytest, y_pred)
#                 print("Accuracy for n_estimator =",i,"and max_depth =",j,"and min_samples_split",k,"and leaf=",p,"is",accuracy)

In [None]:
from sklearn.model_selection import cross_val_score

# Define the RandomForest model with the chosen hyperparameters
rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=10, min_samples_leaf=1)

# Perform 5-fold cross-validation on the selected training features, scored by accuracy
cv_scores = cross_val_score(rf_model, xtrain_selected, ytrain, cv=5, scoring='accuracy')

# # Print the cross-validation scores
# print("Cross-validation scores:", cv_scores)

# # Calculate and print the mean and standard deviation of the cross-validation scores
# print("Mean CV score:", np.mean(cv_scores))
# print("Standard deviation of CV scores:", np.std(cv_scores))

In [None]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=10, min_samples_leaf=1)

In [None]:
# rf_model.fit(xtrain_selected, ytrain)

In [None]:
# Candidate values sampled by the random search over RandomForestClassifier.
rf_param_dist = dict(
    # Number of trees in the forest
    n_estimators=[50, 100, 150, 200],
    # Maximum depth of each tree (None means no limit)
    max_depth=[None, 10, 20, 50, 60],
    # Minimum number of samples required to split an internal node
    min_samples_split=[2, 5, 10, 20],
    # Minimum number of samples required at a leaf node
    min_samples_leaf=[1, 2, 4],
)


In [None]:
rf_random_search = RandomizedSearchCV(
    rf, param_distributions=rf_param_dist, cv=5,
    scoring='accuracy', verbose=1, n_iter = 20
)

In [None]:
#rf_random_search.fit(xtrain_vectorized, ytrain)

In [None]:
#rf_random_search.best_params_

<em><strong>Different Random Search best parameters for RandomForestClassifier</strong></em>

{'n_estimators': 100,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_depth': None}
 0.7650293255131965

 {'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_depth': None}
 0.7573313782991202
    
{'n_estimators': 50,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_depth': None}
  0.7591642228739003
    
{'n_estimators': 50,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_depth': None}
0.7624633431085044
    
{'n_estimators': 100,
 'min_samples_split': 20,
 'min_samples_leaf': 1,
 'max_depth': 60}
0.7554985337243402

In [None]:
#best_rf_model = rf_random_search.best_estimator_

In [None]:
#best_rf_model.fit(xtrain_vectorized, ytrain)

In [None]:
#best_rf_model.score(xtest_vectorized, ytest)

In [None]:
# After analysing several random search model

In [None]:
#final_rf = RandomForestClassifier(n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_depth = None)

In [None]:
#final_rf.fit(xtrain_vectorized, ytrain)

In [None]:
#final_rf.score(xtest_vectorized, ytest)

## Model 2 : Using LogisticRegression

In [None]:
from scipy.stats import loguniform

In [None]:
# Search space for the LogisticRegression random search.
#
# The 'lbfgs' and 'sag' solvers only support the 'l2' penalty, so pairing
# them with 'l1' makes those sampled candidates fail to fit. The space is
# therefore expressed as a list of grids (RandomizedSearchCV accepts a list
# of dicts) in which every penalty is combined only with solvers that
# support it.
_lr_common = {
    'C': [0.001, 0.01, 0.1, 1, 1.3, 1.5, 2],  # Regularization parameter
    'max_iter': [2000],  # Maximum number of iterations
    'class_weight': [None, 'balanced'],  # Weighting of classes
    'tol': [1e-3, 1e-4, 1e-5],  # Tolerance for stopping criteria
    'fit_intercept': [True, False],
}
lr_param_dist = [
    # l2 works with all three solvers that were originally searched.
    {**_lr_common, 'penalty': ['l2'], 'solver': ['lbfgs', 'saga', 'sag']},
    # l1 is only supported by 'saga' among the solvers originally searched.
    {**_lr_common, 'penalty': ['l1'], 'solver': ['saga']},
]

In [None]:
lr_random_search = RandomizedSearchCV(
    estimator=lr,
    param_distributions=lr_param_dist,
    n_iter=50,  # Number of parameter settings that are sampled
    scoring='accuracy',
    cv=2,  # Cross-validation folds
    verbose=2,
    random_state=42,
    n_jobs = -1
)

In [None]:
#lr_random_search.fit(xtrain_selected, ytrain)

In [None]:
# for c in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
#     lr = LogisticRegression(C=c, max_iter=1500)
#     lr.fit(xtrain_selected, ytrain)
#     score = lr.score(xtest_selected, ytest)
#     print("Accuracy for C = ", c, "is", score)

In [None]:
# for c in [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]:
#     lr = LogisticRegression(C=c, max_iter=1500)
#     lr.fit(xtrain_selected, ytrain)
#     score = lr.score(xtest_selected, ytest)
#     print("Accuracy for C = ", c, "is", score)

In [None]:
# for c in [1.3,1.31, 1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4]:
#     lr = LogisticRegression(C=c, max_iter=1500)
#     lr.fit(xtrain_selected, ytrain)
#     score = lr.score(xtest_selected, ytest)
#     print("Accuracy for C = ", c, "is", score)

In [None]:
# for solver in ['sag', 'saga', 'lbfgs']:
#     lr = LogisticRegression(C=1.32, max_iter=3000, solver = solver)
#     lr.fit(xtrain_selected, ytrain)
#     score = lr.score(xtest_selected, ytest)
#     print("Accuracy for Solver = ", solver, "is", score)

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# base_estimator = LogisticRegression(C=1.32, max_iter=1500)
# for c in [10, 20, 30, 40, 50]:
#     bagging_classifier = BaggingClassifier(estimator=base_estimator, n_estimators=c, random_state=42)
#     bagging_classifier.fit(xtrain_selected, ytrain)
#     score = bagging_classifier.score(xtest_selected, ytest)
#     print("Accuracy:", score)

In [None]:
# There wasn't much change in the score after bagging

## Model 3 : Using Xgboost

In [None]:
import xgboost as xgb

In [None]:
xgb_model = xgb.XGBClassifier(random_state=42)

In [None]:
#xgb_model.fit(xtrain_selected, ytrain)

In [None]:
#y_pred = xgb_model.predict(xtest_selected)

In [None]:
# accuracy = accuracy_score(ytest, y_pred)
# print("Accuracy:", accuracy)

In [None]:
# Grid of XGBoost hyperparameter values explored by the (currently disabled)
# grid search below.
param_grid = dict(
    eta=[0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[3, 4, 5, 6, 7],
    min_child_weight=[1, 2, 3, 4, 5],
)

# # Performing grid search
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=2, scoring='accuracy',verbose=2)
# grid_search.fit(xtrain_selected, ytrain)

# # # Getting the best hyperparameters
# best_params = grid_search.best_params_
# print("Best hyperparameters:", best_params)

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Defining your XGBoost model with the chosen hyperparameters
xgb_model = xgb.XGBClassifier(eta=0.4, max_depth=3, min_child_weight=1, reg_alpha= 0.5, reg_lambda= 0.5)

# # Performing cross-validation
# cv_scores = cross_val_score(xgb_model, xtrain_selected, ytrain, cv=5, scoring='accuracy')

# # Printing the cross-validation scores
# print("Cross-validation scores:", cv_scores)

# # Calculating and printing the mean and standard deviation of the cross-validation scores
# print("Mean CV score:", np.mean(cv_scores))
# print("Standard deviation of CV scores:", np.std(cv_scores))

In [None]:
# Defining the XGBoost model with the best hyperparameters from grid search
xgb_model_reg = xgb.XGBClassifier(eta=0.4, max_depth=3, min_child_weight=1)

# Defining the hyperparameters grid for regularization
param_grid_reg = {
    'reg_alpha': [0.1, 0.5, 1.0],
    'reg_lambda': [0.1, 0.5, 1.0]
}

# # Performing grid search for regularization
# grid_search_reg = GridSearchCV(estimator=xgb_model_reg, param_grid=param_grid_reg, cv=5, scoring='accuracy')
# grid_search_reg.fit(xtrain_selected, ytrain)

# # Getting the best regularization parameters
# best_params_reg = grid_search_reg.best_params_
# print("Best regularization parameters:", best_params_reg)

In [None]:
best_xgb_model = xgb.XGBClassifier(random_state=42, eta=0.4, max_depth=3, min_child_weight=1, reg_alpha= 0.5, reg_lambda= 0.5)

In [None]:
best_xgb_model.fit(xtrain_selected, ytrain)

In [None]:
y_pred = best_xgb_model.predict(xtest_selected)

In [None]:
accuracy = accuracy_score(ytest, y_pred)
print("Accuracy:", accuracy)

## Model 4 : Using Lightgbm

In [None]:
import lightgbm as lgb

In [None]:
lgb_model = lgb.LGBMClassifier(n_estimators=100)

In [None]:
# lgb_model.fit(xtrain_selected, ytrain)

In [None]:
# y_pred = lgb_model.predict(xtest_selected)

In [None]:
# accuracy = accuracy_score(ytest, y_pred)
# print("Accuracy:", accuracy)

In [None]:
# Sampling distributions for the LightGBM random search. Lists are sampled
# uniformly; scipy `randint(low, high)` / `uniform(loc, scale)` objects are
# sampled from directly.
# NOTE(review): 'objective' is fixed to 'binary' and 'scale_pos_weight' is
# searched, while the analysis notes later in this notebook refer to classes
# 0, 3, 4 and 5 -- confirm these binary-oriented settings are intended.
param_dist = dict(
    objective=['binary'],
    boosting=['gbdt', 'dart'],
    num_leaves=randint(20, 100),
    learning_rate=uniform(0.01, 0.5),
    feature_fraction=uniform(0.5, 0.5),
    bagging_fraction=uniform(0.5, 0.5),
    bagging_freq=randint(1, 10),
    max_depth=randint(5, 20),
    min_child_samples=randint(5, 100),
    reg_alpha=uniform(0, 2),
    reg_lambda=uniform(0, 2),
    scale_pos_weight=uniform(1, 10),
)

In [None]:
random_search = RandomizedSearchCV(estimator=lgb_model, param_distributions=param_dist, n_iter=50,
                                   scoring='accuracy', cv=5, random_state=42, n_jobs=-1)

In [None]:
# random_search.fit(xtrain_selected, ytrain)

In [None]:
# print("Best parameters found: ", random_search.best_params_)
# print("Best accuracy score: ", random_search.best_score_)

<em><strong>Parameters and accuracy scores for different randomized searches of the LGBMClassifier model</strong></em>

1. Best parameters found:  {'bagging_fraction': 0.542673732496884, 'bagging_freq': 4, 'boosting': 'dart', 'feature_fraction': 0.7703175608050532, 'learning_rate': 0.3287149507491033, 'max_depth': 8, 'min_child_samples': 20, 'num_leaves': 32, 'objective': 'binary', 'reg_alpha': 1.939073734228318, 'reg_lambda': 1.4291902083599042, 'scale_pos_weight': 1.410675167678758}
<br>
 Best accuracy score:  0.7695269526952696
<br>
2. Best parameters found:  {'bagging_fraction': 0.9576068638132402, 'bagging_freq': 2, 'boosting': 'dart', 'feature_fraction': 0.7746133323530602, 'learning_rate': 0.3672979613500312, 'max_depth': 12, 'min_child_samples': 5, 'num_leaves': 35, 'objective': 'binary', 'reg_alpha': 1.0333925148531333, 'reg_lambda': 1.3142226570003337, 'scale_pos_weight': 5.3567289867789105}
 <br>
Best accuracy score:  0.7704437110377703

3. Best parameters found:  {'bagging_fraction': 0.9143687545759647, 'bagging_freq': 9, 'boosting': 'dart', 'feature_fraction': 0.8035171238433423, 'learning_rate': 0.1479995910112717, 'max_depth': 17, 'min_child_samples': 19, 'num_leaves': 64, 'objective': 'binary', 'reg_alpha': 0.14910128735954165, 'reg_lambda': 1.9737738732010346, 'scale_pos_weight': 8.722447692966574}
<br>
Best accuracy score:  0.7718193665734946

In [None]:
# LightGBM classifier configured with the parameters of the third randomized
# search result recorded in the notes above.
best_lgm = lgb.LGBMClassifier(
    n_estimators=100,
    bagging_fraction=0.9143687545759647,
    bagging_freq=9,
    boosting='dart',
    feature_fraction=0.8035171238433423,
    learning_rate=0.1479995910112717,
    max_depth=17,
    min_child_samples=19,
    num_leaves=64,
    objective='binary',
    reg_alpha=0.14910128735954165,
    reg_lambda=1.9737738732010346,
    scale_pos_weight=8.722447692966574,
)

In [None]:
# best_lgm.fit(xtrain_selected, ytrain)

In [None]:
# best_lgm.score(xtest_selected, ytest)

## Model 5 : Using SVC

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()

In [None]:
# svc.fit(xtrain_selected, ytrain)

In [None]:
# svc.score(xtest_selected, ytest)

In [None]:
# Using for loop to try different parameters values for hypertuning because
# grid search or random search was taking too much time

In [None]:
# for c in [0.1, 1, 10, 100]:
#     svc = SVC(C=c)
#     svc.fit(xtrain_selected, ytrain)
#     y_pred = svc.predict(xtest_selected)
#     accuracy = accuracy_score(ytest, y_pred)
#     print("Accuracy:",accuracy)

Accuracy at c=0.1 is 0.7554985337243402<br>
Accuracy at c=1 is 0.7602639296187683<br>
Accuracy at c=10 is 0.7690615835777126<br>
Accuracy at c=100 is 0.7694281524926686<br>

In [None]:
# for g in [0.1, 0.01, 0.001, 0.001]:
#     svc = SVC(C=100, gamma=g)
#     svc.fit(xtrain_selected, ytrain)
#     y_pred = svc.predict(xtest_selected)
#     accuracy = accuracy_score(ytest, y_pred)
#     print("Accuracy:",accuracy)

Accuracy at gamma = 0.1: 0.7144428152492669 <br>
Accuracy at gamma = 0.001: 0.7697947214076246 <br>
Accuracy at gamma=0.0001: 0.7554985337243402 <br>

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Defining the SVC model with the chosen hyperparameters
svc_model = SVC(C=200, gamma=0.01)

# # # Performing cross-validation
# cv_scores = cross_val_score(svc_model, xtrain_selected, ytrain, cv=5, scoring='accuracy')

# # # Printing the cross-validation scores
# print("Cross-validation scores:", cv_scores)

# # # Calculating and printing the mean and standard deviation of the cross-validation scores
# print("Mean CV score:", np.mean(cv_scores))
# print("Standard deviation of CV scores:", np.std(cv_scores))

In [None]:
best_svc = SVC(C=100, gamma=0.01)
# best_svc.fit(xtrain_selected, ytrain)

In [None]:
# y_pred = best_svc.predict(xtest_selected)

In [None]:
# accuracy_score(ytest, y_pred)

## Analysing the prediction using confusion matrix and classification table

In [None]:
# I am choosing the XGBoost model's predictions of the target variable for analysing
# the difference in output through the confusion matrix and classification report,
# because this model performed better than my other models.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Build the confusion matrix for the held-out split and print it under a heading.
conf_matrix = confusion_matrix(ytest, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
# Build the per-class classification report for the held-out split and print
# it under a heading.
class_report = classification_report(ytest, y_pred)
print("Classification Report:", class_report, sep="\n")

In [None]:
# The recall of the classes is, as we can see, very bad, especially for classes 0, 3 and 4.
# The model predicts many examples to be of class 5 when they are actually of class 0.
# The recall score of class 5 is very good.

## Preprocessing of test data to predict target values

In [None]:
test.shape

In [None]:
test = test.drop(columns = ['RecipeName', 'CommentID', 'UserID', 'UserName', 'CreationTimestamp'])

In [None]:
# Split the competition test frame into review text and remaining columns.
# Missing reviews get the same 'NA' placeholder that was applied to the
# training data; without it the regex/tokenisation lambdas in the following
# cells would receive NaN (a float) instead of a string and raise TypeError.
test_text = test['Recipe_Review'].fillna(value='NA')
test_numerical = test.drop(columns = ['Recipe_Review'])

In [None]:
test_numerical = test_numerical.drop(columns = ['ThumbsUpCount', 'ThumbsDownCount'])

In [None]:
test_numerical = test_numerical.drop(columns = ['RecipeCode'])

In [None]:
# converting to lower case
test_text = test_text.str.lower()

In [None]:
# Remove HTML entities
test_text = test_text.apply(lambda text: regex.sub(r'&#[0-9]+;', '', text))

In [None]:
# Remove non-alphanumeric characters
test_text = test_text.apply(lambda text: regex.sub(r'[^a-zA-Z0-9\s]', '', text))

In [None]:
# Tokenization
test_text = test_text.apply(lambda text: text.split())

In [None]:
# Removing Stopwords
test_text = test_text.apply(lambda tokens: [word for word in tokens if word not in ENGLISH_STOP_WORDS])

In [None]:
# Joining Tokens Back into Text
test_text = test_text.apply(lambda tokens: ' '.join(tokens))

In [None]:
# Scaling Numerical Features
test_numerical = scaler.transform(test_numerical)

In [None]:
test_vectorized = vectorizer.transform(test_text).toarray()

In [None]:
test_com = np.hstack([test_numerical, test_vectorized])

In [None]:
test_selected = selector.transform(test_vectorized)

In [None]:
test_vectorized.shape, xtest_vectorized.shape, xtrain_vectorized.shape

In [None]:
predictions_xgb = best_xgb_model.predict(test_selected)

In [None]:
# Assemble the submission table: IDs run from 1 to the number of predictions,
# paired with the XGBoost predictions, and are written without the index.
submission = pd.DataFrame(
    {
        'ID': list(range(1, len(predictions_xgb) + 1)),
        'Rating': predictions_xgb,
    }
)
submission.to_csv('submission.csv', index=False)