## Objective:

To forecast Canadian house prices by analyzing the relationship between average Canadian household income and house prices over the past 20 years.

## Data Sources:

- Average Canadian household income data for the past 20 years
- Canadian house price data for the past 20 years


## Adding income data and merging it into the existing house price data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

class IncomeDataProcessor:
    """
    Load, clean and merge income data with house price data, then explore it.

    Attributes:
        income_data: DataFrame read from the income CSV file.
        original_data: DataFrame read from the house price Excel file.
        monthly_data: Monthly income DataFrame with 'Date' and 'Median income'
            columns. It stays ``None`` until something builds it;
            ``convert_to_monthly`` is still a placeholder.
        merged_data: Result of ``merge_data``; ``None`` until it has run.
    """

    def __init__(self, income_data_path, original_data_path):
        """
        Initializes the class by reading both input files.

        Args:
            income_data_path (str): Path to the income data CSV file.
            original_data_path (str): Path to the original data Excel file.
        """
        self.income_data = pd.read_csv(income_data_path)
        self.original_data = pd.read_excel(original_data_path)
        # Populated by later pipeline steps; None marks "not produced yet".
        self.monthly_data = None
        self.merged_data = None

    def clean_data(self):
        """
        Converts the 'Median income' column of the income data to float.

        Dollar signs and thousands separators are stripped from string values.
        Calling the method again on an already numeric column is a no-op
        apart from the float cast.

        Raises:
            ValueError: If a value is still not numeric after stripping.
        """
        income = self.income_data['Median income']
        if pd.api.types.is_numeric_dtype(income):
            # Already cleaned: the .str accessor would fail on numbers.
            self.income_data['Median income'] = income.astype(float)
            return

        # regex=False is essential: as a regular expression '$' is the
        # end-of-string anchor, so the literal dollar sign would survive on
        # pandas versions where Series.str.replace defaults to regex mode.
        self.income_data['Median income'] = (
            income.str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .astype(float)
        )

    def convert_to_monthly(self):
        """
        Converts annual income to monthly income and creates a new DataFrame.

        NOTE(review): the implementation is missing from this file. It is
        expected to assign ``self.monthly_data`` with 'Date' and
        'Median income' columns, because ``merge_data`` reads exactly those.
        Until it does, ``merge_data`` raises AttributeError.
        """
        # ... (your existing code for converting to monthly data)
        # ...

    def merge_data(self):
        """
        Merges the monthly income data into the original data on 'Date'.

        A left merge is used, so every row of the original data is kept and
        dates without a matching income row get NaN in 'Median income'.

        Raises:
            AttributeError: If ``monthly_data`` has not been created yet.
        """
        if getattr(self, 'monthly_data', None) is None:
            # Same exception type as the unguarded attribute access would
            # produce, but with a message that names the missing step.
            raise AttributeError(
                "monthly_data is not set; convert_to_monthly() must create it "
                "with 'Date' and 'Median income' columns before merge_data() "
                "is called."
            )

        self.monthly_data['Date'] = pd.to_datetime(self.monthly_data['Date'], format='%Y-%m-%d')
        self.merged_data = pd.merge(
            self.original_data,
            self.monthly_data[['Date', 'Median income']],
            on='Date',
            how='left',
        )

    def exploratory_data_analysis(self, income_col='Median income', price_col='House price'):
        """
        Performs exploratory data analysis on the merged data, including:
            - Visualizing the relationship between income and house price
            - Calculating correlation coefficient and p-value
            - Fitting a linear regression model

        Args:
            income_col (str, optional): Column holding the income values.
                Defaults to 'Median income'.
            price_col (str, optional): Column holding the house prices.
                Defaults to 'House price'. The merged data printed later in
                this notebook has price columns such as 'Composite' and
                'Apartment_unit'; pass the one to analyse.

        Raises:
            AttributeError: If ``merge_data`` has not been run yet.
        """
        # Local import: this cell's import list does not bring in scipy.
        from scipy import stats

        merged = getattr(self, 'merged_data', None)
        if merged is None:
            raise AttributeError(
                "merged_data is not set; call merge_data() before "
                "exploratory_data_analysis()."
            )

        # Visualize relationship
        sns.scatterplot(x=income_col, y=price_col, data=merged)
        plt.show()

        # The left merge can leave NaN income values; keep only complete rows
        # so the statistics below are computed on real observations.
        complete = merged[[income_col, price_col]].dropna()

        # Correlation analysis
        corr, p_value = stats.pearsonr(complete[income_col], complete[price_col])
        print(f'Pearson correlation coefficient: {corr:.2f}')
        print(f'p-value: {p_value:.4f}')

        # Linear regression: price as a function of income plus an intercept
        X = sm.add_constant(complete[income_col])
        y = complete[price_col]
        model = sm.OLS(y, X).fit()
        print(model.summary())

    def save_data(self, output_path):
        """
        Saves the merged data to a CSV file without the index column.

        Args:
            output_path (str): Destination path of the CSV file.
        """
        self.merged_data.to_csv(output_path, index=False)

# Example usage: run the whole pipeline, from the raw files to the merged CSV.
processor = IncomeDataProcessor(
    income_data_path='incomedata.csv',
    original_data_path='News_release_chart_data_August_2024.xlsx',
)

# Prepare the income data, then attach it to the house price data.
processor.clean_data()
processor.convert_to_monthly()
processor.merge_data()

# Inspect the relationship and persist the merged result.
processor.exploratory_data_analysis()
processor.save_data(output_path='mergeddata.csv')

## Hypothesis Testing:

- **Null Hypothesis (H0):** There is no significant linear correlation between average Canadian household income and house prices.
- **Alternative Hypothesis (H1):** There is a significant positive linear correlation between average Canadian household income and house prices (i.e., as income increases, house prices also increase).

## Calculate test statistic

In [3]:
import pandas as pd
import numpy as np
from scipy import stats

class HousingDataAnalysis:
    """Pearson correlation analysis on a tabular data set loaded from CSV."""

    def __init__(self, data_path):
        """Reads the CSV file at ``data_path`` and prints the loaded data.

        Args:
            data_path (str): Path to the CSV file to analyse.
        """
        self.data = pd.read_csv(data_path)
        print(self.data)

    def calculate_correlation(self, x_col, y_col):
        """Calculates the Pearson correlation between two columns.

        Rows where either column is missing are ignored. Otherwise a single
        NaN would turn both the coefficient and the p-value into NaN.
        ``self.data`` itself is not modified.

        Args:
            x_col (str): The name of the first column.
            y_col (str): The name of the second column.

        Returns:
            tuple: ``(corr, p_value)`` - the Pearson correlation coefficient
            and the p-value reported by ``scipy.stats.pearsonr``.
        """
        x_values = self.data[x_col]
        y_values = self.data[y_col]

        # Pairwise-complete observations only.
        complete = x_values.notna() & y_values.notna()

        corr, p_value = stats.pearsonr(x_values[complete], y_values[complete])
        return corr, p_value

    def hypothesis_test(self, x_col, y_col, alpha=0.05):
        """Tests for a significant linear relationship between two columns.

        The verdict is printed, not returned. The p-value comes from
        ``calculate_correlation``; with scipy's default settings it is
        two-sided, so the sign of the correlation is not part of the test.

        Args:
            x_col (str): The name of the first column.
            y_col (str): The name of the second column.
            alpha (float, optional): The significance level. Defaults to 0.05.
        """
        corr, p_value = self.calculate_correlation(x_col, y_col)

        if p_value < alpha:
            print(f"There is a significant linear relationship between {x_col} and {y_col} (p-value = {p_value:.4f})")
        else:
            print(f"There is no significant linear relationship between {x_col} and {y_col} (p-value = {p_value:.4f})")

# Usage example: analyse the merged file written by the previous step.
data_path = "mergeddata.csv"
analysis = HousingDataAnalysis(data_path=data_path)

# Correlate apartment prices with median income, then print the test verdict.
corr, p_value = analysis.calculate_correlation(x_col="Apartment_unit", y_col="Median income")
analysis.hypothesis_test(x_col="Apartment_unit", y_col="Median income")

           Date  Composite  One_storey  Two_storey  Townhouse  Apartment_unit  \
0    2005-01-01     239800      207700      302900     201700          172000   
1    2005-02-01     240500      208400      303300     202300          173000   
2    2005-03-01     241300      209200      304300     202900          173900   
3    2005-04-01     242000      210100      304800     203300          174600   
4    2005-05-01     242600      210600      305400     203700          175400   
..          ...        ...         ...         ...        ...             ...   
230  2024-03-01     718200      612900      928700     663600          525200   
231  2024-04-01     718100      613900      929600     661800          523200   
232  2024-05-01     716700      613100      928300     661100          521800   
233  2024-06-01     717600      615700      929400     661000          521200   
234  2024-07-01     718700      616200      931100     660800          521900   

     Median income  
0     

## Summary