In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from tabulate import tabulate
import seaborn as sns
%matplotlib inline

# 1. Load Data Function
def load_data(file_path):
    """
    Read a CSV file into a DataFrame, translating common failures into
    descriptive exceptions.

    Parameters:
    file_path (str): Path to the CSV file.

    Returns:
    pd.DataFrame: The parsed contents of the file.

    Raises:
    FileNotFoundError: If no file exists at ``file_path``.
    ValueError: If the file is empty or cannot be parsed as CSV.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: The file {file_path} was not found.")
    except pd.errors.EmptyDataError:
        raise ValueError("Error: The file is empty.")
    except pd.errors.ParserError:
        raise ValueError("Error: The file could not be parsed.")
    else:
        # Only reached when the read succeeded.
        print(f"Successfully loaded data from {file_path}\n")
        return frame



In [None]:
# 2. Exploratory Data Analysis
def display_data_info(df):
    """
    Print an exploratory overview of a DataFrame.

    The report contains, in order:
    - Number of rows and columns.
    - Number of columns with missing data, plus a per-column table of missing
      counts and percentages when at least one column has missing values.
    - Data type of each column.
    - Summary statistics for numerical columns (skipped when there are none).
    - Summary statistics for object/category columns (skipped when there are none).
    - Number of duplicate rows.
    - Number of unique values per column.
    - Correlation matrix for numerical columns (skipped when there are none).

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    None: Everything is written to standard output.
    """
    print("\nAnalyzing the dataset...\n")
    print(f"Number of rows: {df.shape[0]}")
    print(f"Number of columns: {df.shape[1]}")

    # Missing Data Information
    missing_counts = df.isna().sum()
    num_cols_missing = (missing_counts > 0).sum()
    print(f"Number of columns with missing data: {num_cols_missing}")

    if num_cols_missing > 0:  # Only show details if there are any missing values.
        missing_percentage = (missing_counts / len(df)) * 100
        missing_info = pd.DataFrame({
            'Missing Count': missing_counts,
            'Missing Percentage': missing_percentage
        }).round(2)
        print("\nMissing Value Information:")
        # showindex=True keeps the column names as the first table column.
        print(tabulate(missing_info, headers='keys', tablefmt='pretty', showindex=True))

    # Data Type Information
    print("\nData types of each column:")
    print(tabulate(df.dtypes.reset_index(), headers=['Column', 'Data Type'], tablefmt='pretty'))

    # Numerical columns are selected once and reused for the summary and the
    # correlation matrix. DataFrame.describe raises ValueError when asked to
    # describe a frame without matching columns, so both sections are guarded.
    numeric_df = df.select_dtypes(include=[np.number])
    has_numeric = numeric_df.shape[1] > 0

    # Summary Statistics (Numerical)
    if has_numeric:
        print("\nSummary statistics for numerical data:")
        numerical_summary = numeric_df.describe().round(2)
        print(tabulate(numerical_summary, headers='keys', tablefmt='pretty'))
    else:
        print("\nNo numerical columns to summarize.")

    # Summary Statistics (Categorical): covers both object and category dtypes.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if categorical_cols.size > 0:
        print("\nSummary statistics for categorical data:")
        categorical_summary = df[categorical_cols].describe().T
        # nunique() ignores missing values, so this is the count of observed levels.
        categorical_summary['Distinct Count'] = df[categorical_cols].nunique()
        print(tabulate(categorical_summary, headers='keys', tablefmt='pretty'))

    # Duplicate Rows: a row counts as a duplicate when it repeats an earlier row.
    print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")

    # Unique Values
    print("\nNumber of unique values per column:")
    print(tabulate(df.nunique().reset_index(), headers=['Column', 'Unique Values'], tablefmt='pretty'))

    # Correlation Matrix (Numerical)
    if has_numeric:
        print("\nCorrelation matrix for numerical columns:")
        correlation_matrix = numeric_df.corr().round(2)
        # 'psql' is visually nicer for matrices
        print(tabulate(correlation_matrix, headers='keys', tablefmt='psql'))


    


In [None]:
# 3. Clean Data Function
def clean_data(df):
    """
    Clean the dataset by removing empty columns and preprocessing 'last_review'.

    Parameters:
    df (pd.DataFrame): Input DataFrame to clean. It is not modified.

    Returns:
    pd.DataFrame: A new, cleaned DataFrame with:
        - Columns with 100% missing values removed.
        - 'last_review' (when the column is still present) converted to
          ``datetime.date`` objects, with unparseable or missing entries
          replaced by the string 'No reviews'.
    """
    # Remove columns with all missing values. The explicit copy guarantees the
    # assignments below never write into (or warn about) the caller's frame.
    cleaned = df.dropna(axis=1, how='all').copy()

    # 'last_review' may be absent from the input, or may just have been dropped
    # because it was entirely missing; in both cases there is nothing to convert.
    if 'last_review' in cleaned.columns:
        # errors='coerce' turns unparseable values into NaT instead of raising.
        review_dates = pd.to_datetime(cleaned['last_review'], errors='coerce').dt.date
        # Replace the whole column rather than writing through .loc[:, ...],
        # so the result does not depend on the dtype of the original column.
        cleaned['last_review'] = review_dates.fillna('No reviews')

    return cleaned

In [None]:
# 4. Impute Missing Values
def impute_missing_values(df, columns=('reviews_per_month', 'price')):
    """
    Impute missing values in numerical columns with each column's median.

    Rationale: the median is robust to outliers, so a few extreme values
    (e.g. very expensive listings) do not distort the imputed value.

    Parameters:
    df (pd.DataFrame): The input DataFrame. It is modified in place.
    columns (iterable of str, optional): Names of the columns to impute.
        Defaults to 'reviews_per_month' and 'price'.

    Returns:
    pd.DataFrame: The same DataFrame object, with missing values in
        ``columns`` replaced by the respective column median.

    Raises:
    KeyError: If one of ``columns`` is not present in ``df``.
    """
    for column in columns:
        # Series.median() skips NaN by default, so the fill value is computed
        # from the observed entries only.
        df[column] = df[column].fillna(df[column].median())
    return df

In [None]:

# 5. Plot Numerical Histograms
def plot_numerical_histograms(df, bins=30, ncols=3):
    """Plots histograms for numerical columns, skipping empty ones.

    Each histogram is drawn with a KDE overlay on its own subplot; subplots
    are laid out on a grid ``ncols`` wide. The figure height grows with the
    number of grid rows so that plots stay legible for wide datasets.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    bins (int, optional): Number of histogram bins per plot. Defaults to 30.
    ncols (int, optional): Number of subplots per grid row. Defaults to 3.

    Returns:
    None: Shows the figure, or prints a message when there is nothing to plot.

    Raises:
    ValueError: If ``ncols`` is smaller than 1.
    """
    if ncols < 1:
        raise ValueError("ncols must be at least 1.")

    # Select numerical types, then keep only columns with at least one value.
    numeric_df = df.select_dtypes(include=np.number)
    cols_to_plot = [col for col in numeric_df.columns if not numeric_df[col].isna().all()]

    if not cols_to_plot:  # If there are no numerical columns to plot, return a message.
        print("No numerical columns to plot in DataFrame.")
        return

    num_rows = -(-len(cols_to_plot) // ncols)  # Ceiling division.

    # 4x4 inches per subplot: identical to the previous fixed 12x8 canvas for a
    # 3-wide, 2-row grid, but no longer squashed when more rows are needed.
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
    fig, axes = plt.subplots(
        nrows=num_rows,
        ncols=ncols,
        figsize=(4 * ncols, 4 * num_rows),
        squeeze=False,
    )
    axes = axes.ravel()  # Flatten axes array for iteration.

    for ax, col in zip(axes, cols_to_plot):
        sns.histplot(df[col], bins=bins, kde=True, ax=ax)
        ax.set_title(col)
        ax.tick_params(axis='x', rotation=45)

    # Remove the unused axes left over when the grid is not completely filled.
    for unused_ax in axes[len(cols_to_plot):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()



In [None]:
# 6.Display DataFrame in chunks of rows and columns
def display_dataframe_in_chunks(df):
    """
    Display the DataFrame a fixed number of rows at a time, with user customization.

    Parameters:
    df (pd.DataFrame): The DataFrame to display.

    Behavior:
    - Prompts for the number of rows per chunk (empty input means 10) and
      re-prompts until a positive integer is entered.
    - Prints each chunk of rows as a table; every chunk contains all columns.
    - After each chunk except the last, asks whether to continue; any answer
      other than 'yes' stops the display.

    Returns:
    None: Output goes to standard output; input is read interactively.
    """

    num_rows, num_cols = df.shape  # Get the number of rows and columns

    # Built from adjacent literals: a single-quoted f-string cannot span lines.
    prompt = (
        f"Enter the number of rows to display per chunk (default is 10, "
        f"total rows: {num_rows}, total columns: {num_cols}): "
    )

    # Prompt user for chunk size until a positive integer is supplied.
    while True:
        answer = input(prompt).strip()
        try:
            chunk_size = int(answer) if answer else 10  # Default to 10 if input is empty
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
            continue

        if chunk_size <= 0:
            print("Chunk size must be a positive integer. Please try again.")
            continue
        break  # Exit input loop if successful

    for start in range(0, num_rows, chunk_size):
        # iloc makes the positional row slicing explicit.
        chunk = df.iloc[start:start + chunk_size]
        print(tabulate(chunk, headers='keys', tablefmt='pretty'))

        # Only ask when there is something left to show.
        if start + chunk_size < num_rows:
            if input("Display next chunk? (yes/no): ").strip().lower() != 'yes':
                print("Stopped before the end of the DataFrame.")
                break
    else:
        # Reached only when the loop was not interrupted by the user.
        print("End of DataFrame reached.")




In [None]:
# 7. Function for Scatter Plot
def plot_scatter(x, y, data, title, xlabel, ylabel, hue=None):
    """
    Helper function to create scatter plots.

    Parameters:
    x (str): Column name for x-axis.
    y (str): Column name for y-axis.
    data (pd.DataFrame): DataFrame containing the data.
    title (str): Title of the plot.
    xlabel (str): Label for x-axis.
    ylabel (str): Label for y-axis.
    hue (str, optional): Column name for color coding.

    Returns:
    None: The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 5))

    # A palette is only meaningful together with a hue variable; seaborn warns
    # that the palette is ignored when it is passed without one. This mirrors
    # how plot_bar only applies the palette when hue is given.
    color_kwargs = {'hue': hue, 'palette': 'cool'} if hue else {}
    sns.scatterplot(x=x, y=y, data=data, alpha=0.6, **color_kwargs)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if hue:
        # Re-create the legend so it is titled and pinned to the upper right.
        plt.legend(title=hue, loc='upper right')
    plt.show()

In [None]:
# 8. Function for Bar Plot
def plot_bar(data, x, y, hue=None, title='', xlabel='', ylabel='', rotation=0):
    """
    Helper function to create bar plots.

    Parameters:
    data (pd.DataFrame): DataFrame containing the data.
    x (str): Column name for x-axis.
    y (str): Column name for y-axis.
    hue (str, optional): Column name for hue (categorical variable). When it
        names the same column as ``x`` or ``y`` it is used only to color the
        bars: no legend is drawn and the bars are not dodged.
    title (str): Title of the plot.
    xlabel (str): Label for x-axis.
    ylabel (str): Label for y-axis.
    rotation (int, optional): Rotation angle for x-tick labels.

    Returns:
    None: The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(12, 6))

    # True when hue merely repeats one of the plotted columns.
    hue_repeats_axis = any(isinstance(axis_var, str) and axis_var == hue for axis_var in (x, y))

    # If hue is provided, use it for coloring
    if hue:
        bar_kwargs = {'hue': hue, 'palette': 'cool'}
        if hue_repeats_axis:
            # Every category has exactly one hue level, so dodging would only
            # shrink and offset the bars.
            bar_kwargs['dodge'] = False
        ax = sns.barplot(data=data, x=x, y=y, **bar_kwargs)
    else:
        ax = sns.barplot(data=data, x=x, y=y)  # Use default colors if no hue is provided

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Right-aligning only makes sense for rotated labels (it anchors the end of
    # the text to the tick); unrotated labels stay centered on their tick.
    plt.xticks(rotation=rotation, ha='right' if rotation else 'center')

    if hue:
        if hue_repeats_axis:
            # The tick labels already name every bar; drop any legend seaborn
            # may have added instead of repeating them.
            existing_legend = ax.get_legend()
            if existing_legend is not None:
                existing_legend.remove()
        else:
            handles, labels = ax.get_legend_handles_labels()
            if handles:  # Check if any handles/labels exist.
                ax.legend(handles, labels, title=hue, loc='best')

    plt.tight_layout()
    plt.show()
    
    
    

In [None]:
# 9. Analyze price factors
def analyze_airbnb_price_factors(df):
    """
    Analyze the factors influencing Airbnb listing prices using linear regression.

    Parameters:
    df (pd.DataFrame): The preprocessed DataFrame containing Airbnb data. It
        must contain 'last_review' (with the string 'No reviews' marking
        listings without a review), 'price', and the feature columns listed in
        the function body. The frame is not modified.

    Returns:
    pd.DataFrame: One row per model feature with the columns 'Feature',
        'Coefficient' and 'Sign' ('Positive' or 'Negative'), sorted by
        absolute coefficient in descending order.

    Process:
    1. Process 'last_review' to create binary features: 'has_last_review' and 'no_last_review'.
    2. Select relevant features and the target variable.
    3. Convert categorical variables to dummy variables for regression analysis.
    4. Split the dataset into training and testing sets.
    5. Train a linear regression model using a pipeline with standard scaling.
    6. Visualize the coefficients to interpret feature importance.
    7. Evaluate the model using R-squared, Mean Squared Error (MSE), and Cross-Validation.
    """

    # Selecting relevant features for the analysis
    features = ['room_type', 'neighbourhood', 'minimum_nights',
                'number_of_reviews', 'reviews_per_month', 'availability_365',
                'has_last_review', 'no_last_review']
    target = 'price'

    # Create binary features for 'last_review'. assign() returns a new frame,
    # so the helper columns are not added to the caller's DataFrame.
    # NOTE(review): 'no_last_review' is the exact complement of
    # 'has_last_review', so the two columns are perfectly collinear and their
    # individual coefficients are not uniquely determined. Both are kept so
    # the returned table has the same rows as before.
    has_review = (df['last_review'] != 'No reviews').astype(int)
    model_frame = df.assign(
        has_last_review=has_review,
        no_last_review=1 - has_review,
    )[features + [target]]

    # Preprocessing: Convert categorical variables to dummy variables
    model_frame = pd.get_dummies(model_frame, drop_first=True)

    # Splitting the dataset into training and testing sets
    X = model_frame.drop(columns=target)
    y = model_frame[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

    # Fitting the linear regression model using a pipeline; features are
    # standardized first, so coefficients are on a per-standard-deviation scale.
    model = make_pipeline(StandardScaler(), LinearRegression())
    model.fit(X_train, y_train)

    # Extracting the coefficients and sorting them by absolute value
    coefficients = (
        pd.DataFrame(
            model.named_steps['linearregression'].coef_,
            index=X.columns,  # Set the index to the feature names
            columns=['Coefficient'],
        )
        .sort_values(by='Coefficient', key=lambda values: values.abs(), ascending=False)
        .rename_axis('Feature')
        .reset_index()  # Turn the feature names into a regular column for plotting
    )

    # Categorize coefficients as positive or negative (zero counts as negative)
    coefficients['Sign'] = np.where(coefficients['Coefficient'] > 0, 'Positive', 'Negative')

    # Visualizing the coefficients
    plot_bar(coefficients,
             x='Coefficient',
             y='Feature',
             title='Factors Influencing Airbnb Listing Price',
             xlabel='Coefficient Value',
             ylabel='Features',
             hue='Sign',  # Use hue to color bars based on the sign of the coefficient
             rotation=0)

    # Model evaluation on the held-out 30%
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print("\nModel Evaluation:")
    print(f"R-squared: {r2:.2f} - Indicates the proportion of variance explained by the model.")
    print(f"Mean Squared Error: {mse:.2f} - Measures the average squared difference between predicted and actual values.")

    # Cross-validation: the whole pipeline is refit per fold, so scaling is
    # learned on each fold's training portion only.
    cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"Cross-Validated R-squared: {cross_val_scores.mean():.2f} - Average R-squared across 5 folds.")

    return coefficients

In [None]:
# 10.Analyze availability correlation 
def analyze_availability_correlation(df):
    """
    Report how yearly availability relates to review volume and to price.

    The correlation of 'availability_365' with 'number_of_reviews' and with
    'price' is printed first; afterwards one scatter plot per pairing is
    shown, colored by 'room_type'.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing Airbnb data.

    Returns:
    None: Displays scatter plots and correlation values for the analysis.
    """
    availability = df['availability_365']

    # Correlation values (both are printed before any figure is drawn)
    reviews_corr = availability.corr(df['number_of_reviews'])
    price_corr = availability.corr(df['price'])

    print(f"Correlation between availability and number of reviews: {reviews_corr:.2f}")
    print(f"Correlation between availability and price: {price_corr:.2f}")

    # Scatter plot: Availability vs. Number of Reviews
    plot_scatter(
        x='availability_365',
        y='number_of_reviews',
        data=df,
        title='Availability vs. Number of Reviews',
        xlabel='Availability (days/year)',
        ylabel='Number of Reviews',
        hue='room_type',
    )

    # Scatter plot: Availability vs. Price
    plot_scatter(
        x='availability_365',
        y='price',
        data=df,
        title='Availability vs. Price',
        xlabel='Availability (days/year)',
        ylabel='Price ($)',
        hue='room_type',
    )



In [None]:
# 11. Analyze location trends 
def analyze_location_trends(df):
    """
    Summarize price and review activity per neighbourhood.

    Computes the per-neighbourhood mean of 'price', 'number_of_reviews' and
    'reviews_per_month', prints the result as a table ordered from the most
    to the least expensive neighbourhood, and draws one bar chart per metric.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing Airbnb data.

    Returns:
    None: Displays bar plots and summary statistics highlighting location-based trends.
    """
    # Source column -> display name used in the table and the charts.
    display_names = {
        'price': 'Average Price',
        'number_of_reviews': 'Average Number of Reviews',
        'reviews_per_month': 'Average Reviews Per Month',
    }

    # Mean of each metric per neighbourhood, most expensive first, 2 decimals.
    location_stats = (
        df.groupby('neighbourhood')[list(display_names)]
        .mean()
        .reset_index()
        .rename(columns=display_names)
        .sort_values(by='Average Price', ascending=False)
        .round(2)
    )

    # Display summary statistics
    print("Location-Based Trends:\n")
    print(tabulate(location_stats, headers='keys', tablefmt='pretty'))

    # (metric column, chart title, y-axis label) for each bar chart, in order.
    chart_specs = [
        ('Average Price',
         'Average Price by Neighborhood',
         'Average Price ($)'),
        ('Average Number of Reviews',
         'Average Number of Reviews by Neighborhood',
         'Average Number of Reviews'),
        ('Average Reviews Per Month',
         'Average Reviews Per Month by Neighborhood',
         'Average Reviews Per Month'),
    ]

    for metric, chart_title, y_label in chart_specs:
        plot_bar(location_stats,
                 x='neighbourhood',
                 y=metric,
                 title=chart_title,
                 xlabel='Neighborhood',
                 ylabel=y_label,
                 hue='neighbourhood',  # Color each neighbourhood's bar differently
                 rotation=45)