# Group 5
## Team members:
- Jianting Liu (8950907)
- David (8999846) 
- Marieth (9016702)


Merging data

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.__version__  # bare expression: evaluates to the installed pandas version string
class IncomeDataProcessor:
    """
    Process annual income data and merge it with monthly house price data.

    Typical call order: ``clean_data()`` -> ``convert_to_monthly()`` ->
    ``merge_data()`` -> (optionally) ``save_data()``.

    Attributes set by the methods:
        income_data: Income table read from the CSV file.
        original_data: Table read from the Excel file; must contain a
            'Date' column for ``merge_data``.
        monthly_data: Created by ``convert_to_monthly``.
        merged_data: Created by ``merge_data``.
    """

    def __init__(self, income_data_path, original_data_path):
        """
        Initializes the class with the given data paths.

        Args:
            income_data_path (str): Path to the income data CSV file.
            original_data_path (str): Path to the original data Excel file.
        """
        self.income_data = pd.read_csv(income_data_path)
        self.original_data = pd.read_excel(original_data_path)

    def clean_data(self):
        """
        Cleans the income data by removing currency symbols and converting to numeric.

        The 'Median income' column is rewritten in place as floats. Calling
        this method again on already-numeric data is a no-op.
        """
        column = self.income_data['Median income']

        # Already numeric (e.g. clean_data was called before): the .str
        # accessor would raise on a non-string column, so there is nothing to do.
        if pd.api.types.is_numeric_dtype(column):
            return

        # regex=False is essential: as a regular expression '$' is the
        # end-of-string anchor, so on pandas versions where regex defaults to
        # True the dollar sign would never be removed and astype(float) fails.
        self.income_data['Median income'] = (
            column.str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .astype(float)
        )

    def convert_to_monthly(self):
        """
        Converts annual income to monthly income and creates a new DataFrame.

        Every row of ``income_data`` is expanded into 12 rows, one per month,
        dated on the first day of the month of the row's 'Reference year'.
        The monthly 'Median income' is the annual value divided by 12; the
        remaining columns are copied unchanged. Expects ``clean_data`` to
        have been run so that 'Median income' is numeric.

        Returns:
            pandas.DataFrame: The monthly data, also stored as ``self.monthly_data``.
        """
        columns = [
            "Date",
            "Median income",
            "Count of families",
            "Family type",
            "Geography",
            "Selected income concept",
        ]
        new_rows = []

        for _, row in self.income_data.iterrows():
            # iterrows() can upcast integers to float when all columns are
            # numeric; int() keeps the date string in plain 'YYYY' form.
            year = int(row["Reference year"])
            median_income = row["Median income"] / 12  # Convert to monthly income

            for month in range(1, 13):
                date = pd.to_datetime(f"{year}-{month:02d}-01", format='%Y-%m-%d')
                new_rows.append({
                    "Date": date,
                    "Median income": median_income,
                    "Count of families": row["Count of families"],
                    "Family type": row["Family type"],
                    "Geography": row["Geography"],
                    "Selected income concept": row["Selected income concept"],
                })

        # Passing the column list keeps the expected schema even when the
        # income data is empty, so merge_data can still find 'Date'.
        self.monthly_data = pd.DataFrame(new_rows, columns=columns)

        return self.monthly_data

    def merge_data(self):
        """
        Merges the income data with the original data based on the 'Date' column.

        Performs a left join of ``original_data`` with the 'Date' and
        'Median income' columns of ``monthly_data`` and keeps only rows whose
        date falls in the years 2005 through 2022 (inclusive). Requires
        ``convert_to_monthly`` to have been called first.

        Returns:
            pandas.DataFrame: The merged data, also stored as ``self.merged_data``.
        """
        self.monthly_data['Date'] = pd.to_datetime(self.monthly_data['Date'], format='%Y-%m-%d')
        merged = pd.merge(
            self.original_data,
            self.monthly_data[['Date', 'Median income']],
            on='Date',
            how='left',
        )

        # Filter data from 2005 to 2022
        years = merged['Date'].dt.year
        self.merged_data = merged[(years >= 2005) & (years <= 2022)]
        return self.merged_data

    def save_data(self, output_path):
        """
        Saves the merged data to a CSV file.

        Args:
            output_path (str): Destination path of the CSV file. The
                DataFrame index is written as the first column.
        """
        self.merged_data.to_csv(output_path)

# Example usage: run the whole pipeline on the two input files.
# Both paths are relative, so the files must be in the working directory.
processor = IncomeDataProcessor('incomedata.csv', 'News_release_chart_data_August_2024.xlsx')
processor.clean_data()  # turn the 'Median income' strings into floats
processor.convert_to_monthly()  # creates processor.monthly_data (12 rows per income row)
data = processor.merge_data()  # left join on 'Date', restricted to 2005-2022
data  # bare expression: shows the merged frame when run as a notebook cell
# Uncomment to write the merged result to disk:
#processor.save_data('mergeddata.csv')

Unnamed: 0,Date,Composite,One_storey,Two_storey,Townhouse,Apartment_unit,Median income
0,2005-01-01,239800,207700,302900,201700,172000,4487.500000
1,2005-02-01,240500,208400,303300,202300,173000,4487.500000
2,2005-03-01,241300,209200,304300,202900,173900,4487.500000
3,2005-04-01,242000,210100,304800,203300,174600,4487.500000
4,2005-05-01,242600,210600,305400,203700,175400,4487.500000
...,...,...,...,...,...,...,...
211,2022-08-01,749600,630200,957700,693800,544100,5066.666667
212,2022-09-01,739200,619400,947700,684900,537100,5066.666667
213,2022-10-01,731400,613300,937000,679200,532400,5066.666667
214,2022-11-01,725100,607900,928800,672800,530000,5066.666667
