In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from tabulate import tabulate
import seaborn as sns
%matplotlib inline

# 1. Load Data Function
def load_data(file_path):
    """
    Load a CSV file into a DataFrame, translating common read failures.

    Parameters:
    file_path (str): Path to the CSV file.

    Returns:
    pd.DataFrame: Loaded DataFrame.

    Raises:
    FileNotFoundError: If no file exists at ``file_path``.
    ValueError: If the file is empty or cannot be parsed as CSV.
    """
    # Only the read itself is guarded; each translated error is chained with
    # ``from`` so the underlying pandas/OS exception stays available as
    # ``__cause__`` for debugging.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Error: The file {file_path} was not found.") from exc
    except pd.errors.EmptyDataError as exc:
        raise ValueError("Error: The file is empty.") from exc
    except pd.errors.ParserError as exc:
        raise ValueError("Error: The file could not be parsed.") from exc

    print(f"Successfully loaded data from {file_path}\n")
    return df



In [None]:
# 2. Exploratory Data Analysis
def display_data_info(df):
    """
    Print an exploratory summary of a DataFrame.

    The report contains, in order:
    - Number of rows and columns.
    - Number of columns with missing data and, when any exist, the count and
      percentage of missing values per column.
    - Number of duplicate rows.
    - Data type of each column.
    - Summary statistics for numerical columns (when present).
    - Summary statistics for object/category columns (when present).
    - Number of unique values per column.
    - Correlation matrix for numerical columns (when present).

    Parameters:
    df (pd.DataFrame): The input DataFrame. It is only read, never modified.

    Returns:
    None. The report is written to standard output.
    """
    print("\nAnalyzing the dataset...\n")
    print(f"Number of rows: {df.shape[0]}")
    print(f"Number of columns: {df.shape[1]}")

    # Missing Data Information
    missing_counts = df.isna().sum()
    num_cols_missing = (missing_counts > 0).sum()
    print(f"Number of columns with missing data: {num_cols_missing}")

    if num_cols_missing > 0:  # Only show details if there are any missing values.
        missing_percentage = (missing_counts / len(df)) * 100
        missing_info = pd.DataFrame({
            'Missing Count': missing_counts,
            'Missing Percentage': missing_percentage
        }).round(2)
        print("\nMissing Value Information:")
        print(tabulate(missing_info, headers='keys', tablefmt='pretty', showindex=True))  # Show index (column names)

    # Duplicate Rows (rows that exactly repeat an earlier row)
    print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")

    # Data Type Information
    print("\nData types of each column:")
    print(tabulate(df.dtypes.reset_index(), headers=['Column', 'Data Type'], tablefmt='pretty'))

    # Numerical sections are skipped when there is nothing numeric:
    # describe(include=np.number) raises ValueError on such a frame.
    numeric_df = df.select_dtypes(include=[np.number])
    has_numeric = numeric_df.shape[1] > 0

    # Summary Statistics (Numerical)
    if has_numeric:
        print("\nSummary statistics for numerical data:")
        numerical_summary = df.describe(include=np.number).round(2)
        print(tabulate(numerical_summary, headers='keys', tablefmt='pretty'))

    # Summary Statistics (Categorical) - covers both object and category dtypes
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if categorical_cols.size > 0:
        print("\nSummary statistics for categorical data:")
        categorical_summary = df[categorical_cols].describe().T
        # nunique() ignores missing values, so this is the count of distinct non-null values.
        categorical_summary['Distinct Count'] = df[categorical_cols].nunique()
        print(tabulate(categorical_summary, headers='keys', tablefmt='pretty'))

    # Unique Values
    print("\nNumber of unique values per column:")
    print(tabulate(df.nunique().reset_index(), headers=['Column', 'Unique Values'], tablefmt='pretty'))

    # Correlation Matrix (Numerical)
    if has_numeric:
        print("\nCorrelation matrix for numerical columns:")
        correlation_matrix = numeric_df.corr().round(2)
        print(tabulate(correlation_matrix, headers='keys', tablefmt='psql'))  # 'psql' is visually nicer for matrices


    


In [None]:
# 3. Clean Data Function
def clean_data(df):
    """
    Clean the dataset by removing empty columns and preprocessing 'last_review'.

    Parameters:
    df (pd.DataFrame): Input DataFrame to clean. It is not modified.

    Returns:
    pd.DataFrame: A new, cleaned DataFrame with:
        - Columns with 100% missing values removed.
        - 'last_review' (if still present) parsed into ``datetime.date``
          objects, with missing or unparseable entries replaced by the
          string 'No reviews'.
    """
    # Remove columns with all missing values. The explicit copy makes the
    # column assignment below act on an independent frame.
    cleaned = df.dropna(axis=1, how='all').copy()

    # 'last_review' may be absent from the input, or may just have been
    # dropped above because every value was missing; skip it in that case
    # instead of failing with a KeyError.
    if 'last_review' in cleaned.columns:
        # errors='coerce' turns unparseable values into NaT, which the
        # placeholder fill below then replaces.
        review_dates = pd.to_datetime(cleaned['last_review'], errors='coerce').dt.date
        cleaned['last_review'] = review_dates.fillna('No reviews')

    return cleaned

In [None]:
# 4. Impute Missing Values
def impute_missing_values(df, columns=('reviews_per_month', 'price')):
    """
    Impute missing values in numerical columns with each column's median.

    Rationale: the median is robust to outliers and provides a central value
    for imputation, so extreme values have minimal influence on the fill.

    Parameters:
    df (pd.DataFrame): The input DataFrame. It is modified in place.
    columns (str or iterable of str): Column(s) to impute. Defaults to
        'reviews_per_month' and 'price'.

    Returns:
    pd.DataFrame: The same DataFrame object, with the requested columns imputed.

    Raises:
    KeyError: If a requested column is not present in ``df``.
    """
    # Accept a single column name as a convenience; iterating a bare string
    # would otherwise loop over its characters.
    if isinstance(columns, str):
        columns = [columns]

    for column in columns:
        # median() skips NaN by default, so the fill value comes from the
        # observed values only.
        df[column] = df[column].fillna(df[column].median())
    return df

In [None]:

# 5. Plot Numerical Histograms
def plot_numerical_histograms(df):
    """Draw a grid of histograms, one per non-empty numerical column.

    Numerical columns whose values are all missing are left out. When no
    column qualifies, a message is printed and nothing is drawn.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    """
    # Numerical columns that contain at least one non-missing value.
    plottable = [
        name
        for name in df.select_dtypes(include=np.number)
        if df[name].notna().any()
    ]

    if not plottable:
        print("No numerical columns to plot in DataFrame.")
        return

    # Three panels per row; ceiling division gives the number of rows needed.
    n_panels = len(plottable)
    n_grid_rows = -(-n_panels // 3)

    fig, axes = plt.subplots(nrows=n_grid_rows, ncols=3, figsize=(12, 8))
    panels = axes.ravel()  # 1-D view so panels can be paired with columns.

    for panel, name in zip(panels, plottable):
        sns.histplot(df[name], bins=30, kde=True, ax=panel)
        panel.set_title(name)
        panel.tick_params(axis='x', rotation=45)

    # Remove the leftover panels in the last row when it is not full.
    for spare in panels[n_panels:]:
        fig.delaxes(spare)

    plt.tight_layout()
    plt.show()

