# Data Cleansing and Preparation:
## The data cleansing and preparation steps below follow the work in this source:
### https://www.kaggle.com/code/ahmedislam0/phishing-url-detection-96-accuracy
## The Support Vector Regression (SVR) model implementation, analysis, and evaluation are unique to this notebook.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,recall_score,precision_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pickle
# Remove pandas' column display limit so wide DataFrames are shown with every column
pd.set_option('display.max_columns', None)

In [3]:
# Load the phishing-URL dataset from its CSV file (relative path)
df = pd.read_csv('../../../data/dataset_phishing.csv')
# Preview the first rows to sanity-check the parsed columns
df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,nb_underscore,nb_tilde,nb_percent,nb_slash,nb_star,nb_colon,nb_comma,nb_semicolumn,nb_dollar,nb_space,nb_www,nb_com,nb_dslash,http_in_path,https_token,ratio_digits_url,ratio_digits_host,punycode,port,tld_in_path,tld_in_subdomain,abnormal_subdomain,nb_subdomains,prefix_suffix,random_domain,shortening_service,path_extension,nb_redirection,nb_external_redirection,length_words_raw,char_repeat,shortest_words_raw,shortest_word_host,shortest_word_path,longest_words_raw,longest_word_host,longest_word_path,avg_words_raw,avg_word_host,avg_word_path,phish_hints,domain_in_brand,brand_in_subdomain,brand_in_path,suspecious_tld,statistical_report,nb_hyperlinks,ratio_intHyperlinks,ratio_extHyperlinks,ratio_nullHyperlinks,nb_extCSS,ratio_intRedirection,ratio_extRedirection,ratio_intErrors,ratio_extErrors,login_form,external_favicon,links_in_tags,submit_email,ratio_intMedia,ratio_extMedia,sfh,iframe,popup_window,safe_anchor,onmouseover,right_clic,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,1,0,0,0,1,0.0,0.0,0,0,0,0,0,3,0,0,0,0,0,0,4,4,3,3,3,11,11,6,5.75,7.0,4.5,0,0,0,0,0,0,17,0.529412,0.470588,0,0,0,0.875,0,0.5,0,0,80.0,0,100.0,0.0,0,0,0,0.0,0,0,0,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,0,0,0,0,5,0,1,0,0,0,0,0,0,0,0,1,0.220779,0.0,0,0,0,0,0,1,0,0,0,0,1,0,4,4,2,19,2,32,19,32,15.75,19.0,14.666667,0,0,0,0,0,0,30,0.966667,0.033333,0,0,0,0.0,0,0.0,0,0,100.0,0,80.0,20.0,0,0,0,100.0,0,0,0,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,3,2,0,0,5,0,1,0,0,0,0,0,1,0,0,0,0.150794,0.0,0,0,0,1,0,3,1,0,0,0,1,0,12,2,2,3,2,17,13,17,8.25,8.4,8.142857,0,0,0,0,0,0,4,1.0,0.0,0,0,0,0.0,0,0.0,0,0,100.0,0,0.0,0.0,0,0,0,100.0,0,0,0,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,1,0.0,0.0,0,0,0,0,0,2,0,0,0,0,1,0,1,0,5,5,0,5,5,0,5.0,5.0,0.0,0,0,0,0,0,0,149,0.973154,0.026846,0,0,0,0.25,0,0.25,0,0,100.0,0,96.428571,3.571429,0,0,0,62.5,0,0,0,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,0,0,0,0,5,0,1,0,0,0,0,1,0,0,0,1,0.0,0.0,0,0,0,0,0,2,0,0,0,0,1,0,6,3,3,3,4,11,7,11,6.333333,5.0,7.0,0,0,0,0,0,0,102,0.470588,0.529412,0,0,0,0.537037,0,0.018519,1,0,76.470588,0,0.0,100.0,0,0,0,0.0,0,0,0,0,1,0,224,8175,8725,0,0,6,legitimate


In [4]:
# How many null values does each column contain? Rows containing nulls need to be removed.

In [5]:
# Count the missing (NaN) values in each column
df.isna().sum()

url                0
length_url         0
length_hostname    0
ip                 0
nb_dots            0
                  ..
web_traffic        0
dns_record         0
google_index       0
page_rank          0
status             0
Length: 89, dtype: int64

In [6]:
# Drop every row that contains at least one missing value; inplace=True
# modifies `df` directly instead of returning a new DataFrame.

df.dropna(inplace=True)

In [8]:
# Set up a list of the features (i.e., the columns) we want to include in our study.

In [14]:
# Hand-picked list of URL-derived feature columns (lengths, character counts,
# digit ratios, and brand/TLD indicator flags).
# NOTE(review): nothing in the visible notebook reads `features`; the model
# inputs are built from the correlation-based `selected_features` instead.
# Confirm whether this list is still needed.
features = [
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq',
    'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn',
    'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
    'ratio_digits_host', 'punycode', 'shortening_service', 'path_extension', 'phish_hints', 'domain_in_brand',
    'brand_in_subdomain', 'brand_in_path', 'suspecious_tld'
]

In [16]:
# Encode the target label as an integer: 'phishing' -> 1, 'legitimate' -> 0.
# NOTE(review): Series.map yields NaN for values that are not keys of the
# dict, so re-running this cell after the labels are already numeric would
# presumably blank the column; reload `df` before running it again.

df['status'] = df['status'].map({'phishing': 1, 'legitimate': 0})

In [18]:
# Verifying that we have a perfectly balanced data set for training: 5715 phishing and 5715 legitimate entries.

In [20]:
# Number of rows per encoded class (0 = legitimate, 1 = phishing)
df['status'].value_counts()

status
0    5715
1    5715
Name: count, dtype: int64

In [22]:
# Gives general information about the dataset

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,nb_underscore,nb_tilde,nb_percent,nb_slash,nb_star,nb_colon,nb_comma,nb_semicolumn,nb_dollar,nb_space,nb_www,nb_com,nb_dslash,http_in_path,https_token,ratio_digits_url,ratio_digits_host,punycode,port,tld_in_path,tld_in_subdomain,abnormal_subdomain,nb_subdomains,prefix_suffix,random_domain,shortening_service,path_extension,nb_redirection,nb_external_redirection,length_words_raw,char_repeat,shortest_words_raw,shortest_word_host,shortest_word_path,longest_words_raw,longest_word_host,longest_word_path,avg_words_raw,avg_word_host,avg_word_path,phish_hints,domain_in_brand,brand_in_subdomain,brand_in_path,suspecious_tld,statistical_report,nb_hyperlinks,ratio_intHyperlinks,ratio_extHyperlinks,ratio_nullHyperlinks,nb_extCSS,ratio_intRedirection,ratio_extRedirection,ratio_intErrors,ratio_extErrors,login_form,external_favicon,links_in_tags,submit_email,ratio_intMedia,ratio_extMedia,sfh,iframe,popup_window,safe_anchor,onmouseover,right_clic,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
count,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0
mean,61.126684,21.090289,0.150569,2.480752,0.99755,0.022222,0.141207,0.162292,0.0,0.293176,0.32266,0.006649,0.123097,4.289589,0.0007,1.027909,0.004024,0.062292,0.001925,0.034821,0.448469,0.127997,0.006562,0.01671,0.610936,0.053137,0.025024,0.00035,0.002362,0.065617,0.050131,0.02161,2.231671,0.20245,0.08329,0.123447,0.000175,0.49825,0.00315,6.232808,2.927472,3.127297,5.019773,2.39895,15.393876,10.467979,10.561505,7.258882,7.678075,5.092425,0.327734,0.104199,0.004112,0.004899,0.017935,0.059755,87.189764,0.602457,0.27672,0.0,0.784864,0.0,0.158926,0.0,0.062469,0.063605,0.44217,51.978211,0.0,42.870444,23.236293,0.0,0.001312,0.006037,37.063922,0.001137,0.0014,0.124759,0.775853,0.439545,0.072878,492.532196,4062.543745,856756.6,0.020122,0.533946,3.185739,0.5
std,55.297318,10.777171,0.357644,1.369686,2.087087,0.1555,0.364456,0.821337,0.0,0.998317,1.093336,0.081274,1.46645,1.882251,0.026448,0.240325,0.10324,0.59819,0.077111,0.375576,0.501912,0.379008,0.080742,0.169358,0.487559,0.089363,0.093422,0.018705,0.048547,0.247622,0.218225,0.145412,0.637069,0.401843,0.276332,0.328964,0.013227,0.691907,0.056035,5.572355,4.768936,2.211571,3.94158,2.997809,22.083644,4.932015,23.077883,4.145827,3.578435,7.14705,0.8426,0.305533,0.063996,0.069827,0.132722,0.331266,166.758254,0.376474,0.319958,0.0,2.758802,0.0,0.266437,0.0,0.156209,0.244058,0.496666,41.523144,0.0,46.249897,38.386577,0.0,0.036204,0.077465,39.073385,0.033707,0.03739,0.33046,0.417038,0.496353,0.259948,814.769415,3107.7846,1995606.0,0.140425,0.498868,2.536955,0.500022
min,12.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-12.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,15.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,3.0,0.0,9.0,7.0,0.0,5.25,5.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.224991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,84.0,972.25,0.0,0.0,0.0,1.0,0.0
50%,47.0,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,3.0,3.0,2.0,11.0,10.0,7.0,6.5,7.0,4.857143,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0.743442,0.131148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,11.111111,0.0,0.0,0.0,0.0,23.294574,0.0,0.0,0.0,1.0,0.0,0.0,242.0,3993.0,1651.0,0.0,1.0,3.0,0.5
75%,71.0,24.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.079365,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,4.0,3.0,6.0,3.0,16.0,13.0,11.0,8.0,9.0,6.714286,0.0,0.0,0.0,0.0,0.0,0.0,101.0,0.944767,0.47484,0.0,1.0,0.0,0.230769,0.0,0.034483,0.0,1.0,98.061004,0.0,100.0,33.333333,0.0,0.0,0.0,75.0,0.0,0.0,0.0,1.0,1.0,0.0,449.0,7026.75,373845.5,0.0,1.0,5.0,1.0
max,1641.0,214.0,1.0,24.0,43.0,4.0,3.0,19.0,0.0,19.0,18.0,1.0,96.0,33.0,1.0,7.0,4.0,20.0,6.0,18.0,2.0,6.0,1.0,4.0,1.0,0.723881,0.8,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,6.0,1.0,106.0,146.0,31.0,39.0,40.0,829.0,62.0,829.0,128.25,39.0,250.0,10.0,1.0,1.0,1.0,1.0,2.0,4659.0,1.0,1.0,0.0,124.0,0.0,2.0,0.0,1.0,1.0,1.0,100.0,0.0,100.0,100.0,0.0,1.0,1.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,29829.0,12874.0,10767990.0,1.0,1.0,10.0,1.0


In [26]:
# Gives number of rows, number of columns

In [28]:
# (number of rows, number of columns)
df.shape

(11430, 89)

In [30]:
# Gives information about the datatypes for the features, non-null values

In [32]:
# Per-column dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11430 non-null  object 
 1   length_url                  11430 non-null  int64  
 2   length_hostname             11430 non-null  int64  
 3   ip                          11430 non-null  int64  
 4   nb_dots                     11430 non-null  int64  
 5   nb_hyphens                  11430 non-null  int64  
 6   nb_at                       11430 non-null  int64  
 7   nb_qm                       11430 non-null  int64  
 8   nb_and                      11430 non-null  int64  
 9   nb_or                       11430 non-null  int64  
 10  nb_eq                       11430 non-null  int64  
 11  nb_underscore               11430 non-null  int64  
 12  nb_tilde                    11430 non-null  int64  
 13  nb_percent                  114

In [34]:
# Select only the numerical columns from the dataframe (float64 or int64) to rule out
# object columns such as `url`; the integer-encoded `status` target stays in.
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Compute the pairwise correlation matrix on the numerical columns.
corr_matrix = numerical_df.corr()

In [36]:
# Correlation of every numeric column with the target: the `status` column of the matrix
status_corr = corr_matrix['status']
# Number of entries in that correlation vector
status_corr.shape

(88,)

In [38]:
# Select the features whose absolute correlation score is above a threshold value

def feature_selector_correlation(cmatrix, threshold):
    """Return the labelled scores whose magnitude is strictly above `threshold`.

    Parameters
    ----------
    cmatrix
        A pandas Series of scores (here: correlations with the target) whose
        index holds the feature names.
    threshold
        Cut-off applied to ``abs(score)``; a score equal to the threshold is
        not selected.

    Returns
    -------
    list
        ``(feature_name, [formatted_score])`` tuples in the Series' original
        order. Each score is rendered with ``'{:3f}'`` (six decimal places)
        and wrapped in a single-element list.
    """
    return [
        (label, ['{:3f}'.format(value)])
        for label, value in zip(cmatrix.index, cmatrix)
        if abs(value) > threshold
    ]

In [40]:
# Keep the columns whose absolute correlation with `status` exceeds 0.2. The
# result is a list of (name, [formatted score]) pairs and still contains
# `status` itself, which correlates perfectly with itself.
features_selected = feature_selector_correlation(status_corr, 0.2)
features_selected

[('length_url', ['0.248580']),
 ('length_hostname', ['0.238322']),
 ('ip', ['0.321698']),
 ('nb_dots', ['0.207029']),
 ('nb_qm', ['0.294319']),
 ('nb_eq', ['0.233386']),
 ('nb_slash', ['0.242270']),
 ('nb_www', ['-0.443468']),
 ('ratio_digits_url', ['0.356395']),
 ('ratio_digits_host', ['0.224335']),
 ('tld_in_subdomain', ['0.208884']),
 ('prefix_suffix', ['0.214681']),
 ('shortest_word_host', ['0.223084']),
 ('longest_words_raw', ['0.200147']),
 ('longest_word_path', ['0.212709']),
 ('phish_hints', ['0.335393']),
 ('nb_hyperlinks', ['-0.342628']),
 ('ratio_intHyperlinks', ['-0.243982']),
 ('empty_title', ['0.207043']),
 ('domain_in_title', ['0.342807']),
 ('domain_age', ['-0.331889']),
 ('google_index', ['0.731171']),
 ('page_rank', ['-0.511137']),
 ('status', ['1.000000'])]

In [42]:
# Keep only the feature names, leaving out the target column `status`
# (it is selected above only because of its self-correlation).
selected_features = [
    name for name, _score in features_selected if name != 'status'
]

In [44]:
# Display the final list of selected feature names
selected_features

['length_url',
 'length_hostname',
 'ip',
 'nb_dots',
 'nb_qm',
 'nb_eq',
 'nb_slash',
 'nb_www',
 'ratio_digits_url',
 'ratio_digits_host',
 'tld_in_subdomain',
 'prefix_suffix',
 'shortest_word_host',
 'longest_words_raw',
 'longest_word_path',
 'phish_hints',
 'nb_hyperlinks',
 'ratio_intHyperlinks',
 'empty_title',
 'domain_in_title',
 'domain_age',
 'google_index',
 'page_rank']

In [46]:
# Feature matrix restricted to the correlation-selected columns
X = df[selected_features]
# Target vector: the integer-encoded label (1 = phishing, 0 = legitimate)
y = df['status']

## This next code block performs 10-fold cross-validation on the dataset using various configurations of Support Vector Regression (SVR) models with standardized input features.

1. **Standard Scaling**: The feature matrix `X` is scaled using `StandardScaler`, which standardizes the data by 
   removing the mean and scaling it to unit variance. This ensures that features with different scales don't negatively 
   affect the performance of the SVR models.

2. **SVR Configurations**: Three different Support Vector Regression models are tested with different kernel functions: 
   - **Linear Kernel**: Captures linear relationships between features and the target.
   - **RBF Kernel**: A more complex kernel that can capture non-linear patterns in the data by mapping features into 
     higher-dimensional spaces.
   - **Polynomial Kernel**: Captures non-linear interactions between features using polynomial functions.

3. **10-fold Cross-Validation**: The `cross_val_score` function performs 10-fold cross-validation, where the dataset 
   is split into 10 subsets. Each SVR model is trained on 9 subsets and evaluated on the remaining subset, 
   repeating this process across all 10 folds. This helps provide a robust measure of model performance and ensures 
   that the results are not overly dependent on a particular split of the data.

4. **Evaluation Metrics**: For each SVR model, the **Mean Squared Error (MSE)** for each fold and the average MSE across all folds 
   are calculated and displayed. MSE measures the average squared difference between the predicted and actual values, 
   providing insight into how well the model fits the data. The lower the MSE, the better the model's performance.

By scaling the data and applying cross-validation, this approach ensures a thorough evaluation of the SVR models, 
allowing the comparison of various kernel configurations to determine which one performs best for the regression task.

In [48]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import numpy as np

# X holds the correlation-selected feature columns and y is the binary
# `status` label (1 = phishing, 0 = legitimate), which SVR fits as a numeric
# regression target.

# Standardize every feature to zero mean and unit variance so that no
# large-scale column dominates the SVR kernels.
# NOTE(review): the scaler is fit on the full dataset before cross-validation,
# so every validation fold contributes to the scaling statistics; the recorded
# results were presumably produced this way.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The 10-fold evaluation below takes a long time to run, so it is skipped by
# default and its recorded output is kept in the markdown block that follows.
# Set this flag to True to re-run it.
RUN_SLOW_SVR_CV = False

# SVR configurations to compare, one per kernel
models = [
    ('Linear SVR', SVR(kernel='linear', C=1)),
    ('RBF SVR', SVR(kernel='rbf', C=1, gamma=0.1)),
    ('Polynomial SVR', SVR(kernel='poly', degree=3, C=1, gamma=0.1))
]


def evaluate_regression_model(model, X, y, model_name):
    """Print the per-fold and mean MSE of `model` under 10-fold cross-validation.

    Parameters
    ----------
    model
        Unfitted scikit-learn regressor; `cross_val_score` fits it per fold.
    X
        Feature matrix passed to `cross_val_score`.
    y
        Target values passed to `cross_val_score`.
    model_name
        Label printed in the report header.

    Returns
    -------
    None
        The results are only printed.
    """
    print(f"Evaluating {model_name} (Regression)...")

    # The 'neg_mean_squared_error' scorer returns negated MSE, so the sign is
    # flipped back before printing.
    scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

    print(f"Mean Squared Error for each fold: {-scores}")
    print(f"Mean MSE: {-np.mean(scores)}\n")


# Evaluate each SVR model on the scaled features, only when explicitly enabled
if RUN_SLOW_SVR_CV:
    for model_name, model in models:
        evaluate_regression_model(model, X_scaled, y, model_name)

Evaluating Linear SVR (Regression)...
Mean Squared Error for each fold: [0.08057713 0.0970103  0.08890391 0.11476004 0.09522179 0.08236009
 0.09227233 0.10726277 0.08900387 0.09991692]
Mean MSE: 0.09472891500456572

Evaluating RBF SVR (Regression)...
Mean Squared Error for each fold: [0.03366636 0.04174458 0.03246224 0.04136156 0.03639699 0.03664976
 0.03507766 0.03799333 0.03100592 0.03713072]
Mean MSE: 0.03634891147769166

Evaluating Polynomial SVR (Regression)...
Mean Squared Error for each fold: [ 0.22356538  0.14846463  0.07776988  0.23715208  0.12105632  0.75250931
  0.59332727 46.54879139  0.08521166  0.12293362]
Mean MSE: 4.891078153530154



## Results saved in a markdown block because the cell above takes a while to run

Evaluating Linear SVR (Regression)...
Mean Squared Error for each fold: [0.08057713 0.0970103  0.08890391 0.11476004 0.09522179 0.08236009
 0.09227233 0.10726277 0.08900387 0.09991692]
Mean MSE: 0.09472891500456572

Evaluating RBF SVR (Regression)...
Mean Squared Error for each fold: [0.03366636 0.04174458 0.03246224 0.04136156 0.03639699 0.03664976
 0.03507766 0.03799333 0.03100592 0.03713072]
Mean MSE: 0.03634891147769166

Evaluating Polynomial SVR (Regression)...
Mean Squared Error for each fold: [ 0.22356538  0.14846463  0.07776988  0.23715208  0.12105632  0.75250931
  0.59332727 46.54879139  0.08521166  0.12293362]
Mean MSE: 4.891078153530154

# Cross-Validation and Regression Metrics Analysis
This code performs 10-fold cross-validation on a dataset using various configurations of Support Vector Regression (SVR) models, evaluating their performance with Mean Squared Error (MSE) and R² scores.

### 1. SVR Configurations:
- **Linear Kernel SVR**: Assumes a linear relationship between features and the target variable.
- **RBF Kernel SVR**: Uses the Radial Basis Function to capture non-linear relationships, mapping features into a higher-dimensional space.
- **Polynomial Kernel SVR**: Uses polynomial relationships between features and the target, allowing the model to capture more complex feature interactions.

### 2. Cross-Validation:
- The `cross_val_predict` function is used to perform **10-fold cross-validation**, where the dataset is split into 10 subsets. 
  The model is trained on 9 subsets and evaluated on the remaining subset, repeating this process across all 10 folds.
- This process helps ensure that the evaluation is robust, providing insights into the model's generalization performance across different splits of the data.

### 3. Evaluation Metrics:
- **Mean Squared Error (MSE)**: Measures the average squared difference between the predicted and actual values. A lower MSE indicates better performance, as it means the predictions are closer to the true values.
- **R² Score**: Measures how well the model captures the variance in the data. A score closer to 1.0 indicates that the model explains most of the variance, while a lower score means the model has not fit the data well.

By applying 10-fold cross-validation and calculating MSE and R² scores, this code evaluates how well each SVR model performs on regression tasks, allowing for comparison between different kernel functions (linear, RBF, and polynomial).

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
import numpy as np

# X holds the correlation-selected feature columns and y is the binary
# `status` label (1 = phishing, 0 = legitimate), which SVR fits as a numeric
# regression target.

# Imported here so this cell stays self-contained.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVR configurations to compare, one per kernel. Each SVR is wrapped in a
# pipeline with its own StandardScaler: the raw features span very different
# ranges (ratios in [0, 1] next to domain_age values in the thousands) and
# kernel SVR is sensitive to feature scale. Scaling inside the pipeline also
# means cross-validation re-fits the scaler on each training split only, so
# held-out folds never influence the scaling statistics.
models = [
    ('1. Support Vector Regression (Linear)',
     make_pipeline(StandardScaler(), SVR(kernel='linear', C=1))),
    ('2. Support Vector Regression (RBF)',
     make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1, gamma=0.1))),
    ('3. Support Vector Regression (Polynomial)',
     make_pipeline(StandardScaler(), SVR(kernel='poly', degree=3, C=1, gamma=0.1))),
]

# Cross-validated regression report for a single model
def evaluate_model_cv_regression(model, X, y, model_name):
    """Print the MSE and R² of `model` computed from 10-fold out-of-fold predictions.

    Parameters
    ----------
    model
        Unfitted scikit-learn estimator handed to `cross_val_predict`.
    X
        Feature matrix.
    y
        True target values; also used to score the predictions.
    model_name
        Label shown in the report banner.

    Returns
    -------
    None
        The report (banner, MSE and R² to four decimals, blank line) is
        written to stdout.
    """
    # One prediction per sample, gathered across the 10 cross-validation folds
    out_of_fold = cross_val_predict(model, X, y, cv=10, n_jobs=-1)

    report_lines = [
        f'============================== {model_name} Cross-Validation Evaluation (Regression) ==============================',
        f'Mean Squared Error: {mean_squared_error(y, out_of_fold):.4f}',
        f'R² Score: {r2_score(y, out_of_fold):.4f}',
        '',  # trailing blank line separating consecutive reports
    ]
    print('\n'.join(report_lines))

# Evaluate every configured model in turn on the selected features X and the
# target y, printing its cross-validated MSE and R².
# NOTE(review): X is passed exactly as built from `df`; no scaling is applied
# at this call site, so any standardization has to come from the estimators
# stored in `models`.
for model_name, model in models:
    evaluate_model_cv_regression(model, X, y, model_name)