In [50]:
#importing libraries
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso 
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error, r2_score 
from scipy import stats 
# Run utilss.ipynb in this kernel so the helper functions it defines are available to the cells below
%run utilss.ipynb

## Ridge and Lasso regression

In [85]:
# Load the CSV through the load_filter_data helper. Per the original note, the
# helper filters the data by excluding the observation named in the second
# argument; the helper's implementation is not shown in this notebook.
df_filtered = load_filter_data(
    'new_cso_data.csv',
    'All types of accommodation',
)

In [86]:
# Identify outliers in the target column. The helper name suggests an
# interquartile-range rule; the exact thresholds and the return type are
# defined outside this notebook — TODO confirm there.
outliers = calculate_iqr(
    df_filtered,
    'Average Length of Stay of Foreign Visitors (Nights per trip)',
)

In [87]:
# Produce the modelling frame from the filtered data and the outliers found
# above. Per the original note, the helper removes those outliers; any other
# processing it performs is not visible from this notebook.
df_ml = load_and_process_data(
    df_filtered,
    outliers,
)


In [88]:
# Separate the predictors (X) from the target (y); the second argument names
# the target column.
X, y = split_features_target(
    df_ml,
    'Average Length of Stay of Foreign Visitors (Nights per trip)',
)

In [90]:
# Group the feature columns by the preprocessing they need. No encoding happens
# here — these are only the column lists.
# Categorical column(s), listed by name.
categorical_feat = ['Main Accommodation Type']
# Every numeric-dtype column of X, detected automatically.
numerical_feat = X.select_dtypes(include=np.number).columns

In [91]:
# Define the per-column-group transformations: standardise the numeric columns
# and one-hot encode the categorical column(s). Columns that appear in neither
# list are dropped, because ColumnTransformer defaults to remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_feat),
        ('cat', OneHotEncoder(), categorical_feat),
    ]
)


In [92]:
# Fit the preprocessor on the full feature matrix X and return the transformed features.
# NOTE(review): the fit uses every row before the train/test split, so rows that
# later land in the test set contribute to the fitted scaler/encoder.
# NOTE(review): X_processed is not referenced by any later cell shown in this
# notebook (the split cell is called with X) — confirm which matrix the models
# are meant to be trained on.
X_processed = preprocessor.fit_transform(X)

In [93]:
# Split into training and testing sets: 20% of the data for testing, 80% for
# training. random_state is fixed at 42, presumably so the split is repeatable
# (assumes split_data forwards it to the underlying splitter — TODO confirm).
# NOTE(review): the raw X is passed here, not the X_processed computed in the
# previous cell. Unless the training helpers apply their own preprocessing,
# the scaling / one-hot encoding never reaches the models — verify.
X_train, X_test, y_train, y_test = split_data(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [96]:
# Train the Ridge model on the training split, then evaluate it on the held-out
# test split. Both helpers are defined outside this notebook; the "best_" name
# suggests the helper performs a hyperparameter search — TODO confirm.
best_ridge = train_ridge_model(X_train, y_train)
# evaluate_model's two return values are unpacked as (MSE, R2).
ridge_mse, ridge_r2 = evaluate_model(best_ridge, X_test, y_test)
# Report the two test-set metrics.
print(f"Ridge Model - MSE: {ridge_mse}, R2: {ridge_r2}")

Ridge Model - MSE: 1.7117140720389596, R2: 0.897587768116282


In [97]:
# Train the Lasso model on the same training split, then evaluate it on the same
# test split as Ridge so the two sets of metrics are directly comparable.
# Both helpers are defined outside this notebook; the "best_" name suggests the
# helper performs a hyperparameter search — TODO confirm.
best_lasso = train_lasso_model(X_train, y_train)
# evaluate_model's two return values are unpacked as (MSE, R2).
lasso_mse, lasso_r2 = evaluate_model(best_lasso, X_test, y_test)
# Report the two test-set metrics.
print(f"Lasso Model - MSE: {lasso_mse}, R2: {lasso_r2}")

Lasso Model - MSE: 1.6837773667498281, R2: 0.8992592273785931


In [98]:
# Inspect the fitted coefficients and intercept of each model; the second
# argument is the label used in the printed output ("Ridge Coefficients: ...").
# In the recorded run, Lasso drives one coefficient to exactly zero while Ridge
# keeps all of them non-zero.
analyze_coefficients(best_ridge, "Ridge")
analyze_coefficients(best_lasso, "Lasso")

Ridge Coefficients: [-0.04074648 -2.0206086  -2.3025272   2.24934288  1.50632319 -0.19573366
 -1.5315263  -2.69724308 -0.70598195  3.53689737  1.39785395]
Ridge Intercept: 9.281732624003398
Lasso Coefficients: [-0.01461139 -2.14193838 -2.28196511  2.35111239  1.79037654 -0.17896436
 -1.29983771 -1.75338961 -0.          4.69174639  2.41756169]
Lasso Intercept: 8.502071316289245
