In [5]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [6]:
#Function to load and filter the dataset
def load_filter_data(file_path, exclude_type):
    """
    Reads a CSV file and drops the rows of one accommodation type.

    Parameters:
    file_path (str): CSV file path.
    exclude_type: Value of the 'Main Accommodation Type' column whose rows
        are removed from the result.

    Returns:
    DataFrame: Rows whose 'Main Accommodation Type' differs from
        exclude_type, re-indexed from 0.
    """
    #index_col=False tells pandas not to use the first column as the index
    raw = pd.read_csv(file_path, index_col=False)

    #Boolean mask that is True for every row that must be kept
    keep_rows = raw['Main Accommodation Type'].ne(exclude_type)

    #Selecting with the mask yields a new frame; the old index is discarded
    return raw.loc[keep_rows].reset_index(drop=True)

In [7]:
#function to calculate the interquartile range and identify outliers
def calculate_iqr(df_filtered, feature):
    """
    Calculates the quartiles and IQR of a feature and returns its outliers.

    A row is an outlier when its value lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR.

    Parameters:
    df_filtered - pandas DataFrame.
    feature - name of the column on which the IQR and outliers are calculated.

    Returns:
    outliers - single-column DataFrame (the `feature` column) holding the
        outlier rows; it keeps the original index labels of df_filtered.
    """

    constant = 1.5
    values = df_filtered[feature]

    #To calculate the 25th percentile (first quartile)
    Q1 = values.quantile(0.25)
    #To calculate the 75th percentile (third quartile)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    #Calculates the lower and upper bounds used to flag outliers
    lower = Q1 - constant * IQR
    upper = Q3 + constant * IQR

    is_outlier = (values < lower) | (values > upper)

    #Return the column that was analysed (previously a fixed column name was
    #returned regardless of `feature`, which broke any other feature)
    return df_filtered.loc[is_outlier, [feature]]


In [8]:
#Function to remove the outlier rows from the filtered DataFrame
def load_and_process_data(df_filtered, outliers):
    """
    Drops from the DataFrame every row flagged as an outlier.

    Rows are matched by index label: a row of df_filtered is removed when
    its label also appears in the index of `outliers`.

    Parameters:
    df_filtered (pd.DataFrame): DataFrame after initial filtering.
    outliers (pd.DataFrame): DataFrame containing outliers.

    Returns:
    pd.DataFrame: DataFrame without outliers.
    """
    flagged = df_filtered.index.isin(outliers.index)
    keep_mask = ~flagged
    return df_filtered.loc[keep_mask]


## Graphics

In [9]:
#Function to plot a heatmap
def plot_correlation_heatmap(dataframe, variables, title='Correlation Matrix Heatmap'):
    """
    Plots a heatmap of the correlation matrix for the specified variables in the given DataFrame.

    Parameters:
    - dataframe: pd.DataFrame - The DataFrame containing the data.
    - variables: list - A list of column names to include in the correlation matrix.
    - title: str - The title for the heatmap.

    Returns:
    None. The figure is shown with plt.show().
    """
    #The notebook's import cell never binds `sns`, so calling this function on
    #a fresh kernel raised NameError; import seaborn where it is needed.
    import seaborn as sns

    correlation_matrix = dataframe[variables].corr()

    plt.figure(figsize=(10, 6))
    #annot/fmt write each coefficient with two decimals inside its cell
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
    plt.title(title)
    plt.show()

In [10]:
def plot_poisson_distribution(df_no_outliers, accommodation_type_column, accommodation_type):
    """
    Generates and plots a Poisson distribution for a specific accommodation type.

    10000 Poisson samples are drawn with a fixed seed (33) and shown as a
    density histogram with a dashed line at the sample mean. If no row
    matches the accommodation type, a message is printed and nothing is
    plotted.

    Parameters:
    df_no_outliers (DataFrame): The dataframe containing the data.
    accommodation_type_column (str): The name of the column indicating the accommodation type.
    accommodation_type (str): The specific accommodation type for the calculation.

    Returns:
    None
    """
    # Filter the dataset for the specific accommodation type
    filtered_data = df_no_outliers[df_no_outliers[accommodation_type_column] == accommodation_type]

    # Check if there is data after filtering
    if filtered_data.empty:
        print(f"No data found for accommodation type: {accommodation_type}")
        return

    # λ is the mean of the 'Month' column for this accommodation type.
    # NOTE(review): the plot labels speak of "monthly nights"; confirm that
    # 'Month' really holds the quantity λ is meant to represent.
    lambda_poisson = filtered_data['Month'].mean()

    # Use a private generator so that plotting does not reset the global NumPy
    # random state; RandomState(33) yields the same draws as np.random.seed(33).
    rng = np.random.RandomState(33)
    poisson_dist = rng.poisson(lambda_poisson, 10000)
    sample_mean = np.mean(poisson_dist)

    # Plot the distribution
    plt.hist(poisson_dist, bins=30, density=True, alpha=0.6, color='b')
    plt.axvline(x=sample_mean, color='r', linestyle='dashed', linewidth=2)
    # The label sits slightly to the right of the mean line, at 80% of the axis height
    plt.text(sample_mean + 2, plt.ylim()[1] * 0.8, f'Mean: {sample_mean:.2f}', color='r')
    plt.title(f'Poisson Distribution of Monthly Nights for {accommodation_type} Accommodation')
    plt.xlabel('Number of Nights')
    plt.ylabel('Probability Density')
    plt.show()

## Statistics

In [12]:
def log_transform_dataframe(df_no_outliers, columns_to_transform):
    """
    Adds a log(x + 1) version of each requested column to a copy of the DataFrame.

    Parameters:
    df_no_outliers: The DataFrame to transform. It is not modified.
    columns_to_transform (list): List of column names to apply the transformation.

    Returns:
    A new DataFrame holding the original columns plus one 'log_<name>'
    column per transformed column.
    """
    #Work on a copy so the caller's DataFrame stays untouched
    result = df_no_outliers.copy()

    for name in columns_to_transform:
        #The +1 shift keeps zeros finite, since log(0) is undefined
        shifted = result[name] + 1
        result[f'log_{name}'] = np.log(shifted)

    return result

## Machine Learning

### Regression

In [13]:
#Function to divide dependent and independent variables
def split_features_target(df, target):
    """
    Separates the target column from the remaining feature columns.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    target (str): The name of the target column.

    Returns:
    X (pd.DataFrame): Every column of df except the target.
    y (pd.Series): The target column.

    Raises:
    ValueError: If the target column is not present in df.
    """
    target_present = target in df.columns
    if not target_present:
        raise ValueError(f"Target column '{target}' not found in DataFrame.")

    features = df.drop(columns=[target])
    labels = df[target]
    return features, labels

In [14]:
#Function to divide data into train and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """
    Delegates to train_test_split to build the training and test sets.

    Parameters:
    X (pd.DataFrame): Features DataFrame.
    y (pd.Series): Target Series.
    test_size (float): Proportion of the dataset to include in the test split.
    random_state (int): Seed forwarded to train_test_split so the split is repeatable.

    Returns:
    X_train, X_test, y_train, y_test: Split datasets, in the order produced
    by train_test_split.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)


In [15]:
#Function to standardize the data
def standardize_data(X_train, X_test):
    """
    Scales both feature sets with a StandardScaler fitted on the training set.

    The scaler is fitted on X_train only and then applied to X_test, so the
    test features do not influence the scaling statistics.

    Parameters:
    X_train (pd.DataFrame): Training features.
    X_test (pd.DataFrame): Test features.

    Returns:
    X_train_scaled (np.ndarray): Scaled training features.
    X_test_scaled (np.ndarray): Scaled test features.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

In [16]:
def train_ridge_model(X_train, y_train, alphas=None, cv=5):
    """
    Trains a Ridge regression model using GridSearchCV to find the best hyperparameters.

    Candidates are compared by negative mean squared error. GridSearchCV
    refits the winning configuration on the whole training set (its default
    refit behaviour), so that estimator is returned directly instead of
    fitting a second, identical model.

    Parameters:
    X_train (pd.DataFrame): Training features.
    y_train (pd.Series): Training target.
    alphas (iterable, optional): Candidate regularisation strengths.
        Defaults to [0.01, 0.1, 1, 10, 100].
    cv (int): Number of cross-validation folds. Defaults to 5.

    Returns:
    best_ridge (Ridge): Trained Ridge regression model with the best found hyperparameters.
    """
    alpha_grid = [0.01, 0.1, 1, 10, 100] if alphas is None else list(alphas)

    #To set up the GridSearchCV with Ridge regression
    ridge_cv = GridSearchCV(Ridge(), param_grid={'alpha': alpha_grid}, scoring='neg_mean_squared_error', cv=cv)

    #Fitting the search; this also refits the best candidate on all of X_train
    ridge_cv.fit(X_train, y_train)

    #To extract the best (already fitted) model
    best_ridge = ridge_cv.best_estimator_

    return best_ridge


In [17]:
def train_lasso_model(X_train, y_train, alphas=None, cv=5):
    """
    Trains a Lasso regression model using GridSearchCV to find the best hyperparameters.

    Candidates are compared by negative mean squared error. GridSearchCV
    refits the winning configuration on the whole training set (its default
    refit behaviour), so that estimator is returned directly instead of
    fitting a second, identical model.

    Parameters:
    X_train (pd.DataFrame): Training features.
    y_train (pd.Series): Training target.
    alphas (iterable, optional): Candidate regularisation strengths.
        Defaults to [0.01, 0.1, 1, 10, 100].
    cv (int): Number of cross-validation folds. Defaults to 5.

    Returns:
    best_lasso (Lasso): Trained Lasso regression model with the best found hyperparameters.
    """
    alpha_grid = [0.01, 0.1, 1, 10, 100] if alphas is None else list(alphas)

    #To set up the GridSearchCV with Lasso regression
    lasso_cv = GridSearchCV(Lasso(), param_grid={'alpha': alpha_grid}, scoring='neg_mean_squared_error', cv=cv)

    #Fitting the search; this also refits the best candidate on all of X_train
    lasso_cv.fit(X_train, y_train)

    #To extract the best (already fitted) model
    best_lasso = lasso_cv.best_estimator_

    return best_lasso

In [18]:
def evaluate_model(model, X_test, y_test):
    """
    Scores a fitted model on the test set with MSE and R-squared.

    Parameters:
    model (sklearn model): The trained machine learning model.
    X_test (pd.DataFrame): Test features.
    y_test (pd.Series): True values for the test set.

    Returns:
    mse (float): Mean squared error of the model on the test set.
    r2 (float): R-squared score of the model on the test set.
    """
    #Predictions are computed once and shared by both metrics
    predictions = model.predict(X_test)

    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

In [19]:
def analyze_coefficients(model, model_name="Model"):
    """
    Prints the coefficients and the intercept of a fitted linear model.

    Parameters:
    model (sklearn model): The trained linear model; its coef_ and
        intercept_ attributes are read.
    model_name (str): Label placed at the start of each printed line.

    Returns:
    None
    """
    coefficients = model.coef_
    print(f"{model_name} Coefficients:", coefficients)

    intercept = model.intercept_
    print(f"{model_name} Intercept:", intercept)


In [11]:
%whos

Variable                    Type        Data/Info
-------------------------------------------------
GridSearchCV                ABCMeta     <class 'sklearn.model_sel<...>on._search.GridSearchCV'>
Lasso                       ABCMeta     <class 'sklearn.linear_mo<...>oordinate_descent.Lasso'>
Ridge                       ABCMeta     <class 'sklearn.linear_model._ridge.Ridge'>
StandardScaler              type        <class 'sklearn.preproces<...>ng._data.StandardScaler'>
calculate_iqr               function    <function calculate_iqr at 0x000001FFFC747C40>
load_and_process_data       function    <function load_and_proces<...>ta at 0x000001FFFC76C040>
load_filter_data            function    <function load_filter_data at 0x000001FFFC60F100>
mean_squared_error          function    <function mean_squared_er<...>or at 0x000001FFFC5B9300>
np                          module      <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
pd                          module      <module 'pandas' from '