# Housing Analysis with an OOP (Object-Oriented Programming) Approach in Python
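This notebook demonstrates how to perform exploratory data analysis using Python's object-oriented programming (OOP) principles. Despite the title, the example cells below work on a Haitian climate dataset (`./data/climat_haiti.csv`) with `TempMoyenne` as the target variable. The analysis includes data loading, processing, visualization, log-linear regression modeling, and time-series plots.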

In [1]:
# Setup & Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Define the LogLinearModel class
class LogLinearModel:
    """Log-linear regression workflow: load a CSV, fit, evaluate and plot.

    The target is modelled on the log scale (``log(y) ~ X``) with
    scikit-learn's ``LinearRegression``; predictions are mapped back to the
    original scale with ``exp``.

    Attributes:
        file_path: Path of the CSV file read by ``load_and_prepare_data``.
        model: The underlying ``LinearRegression`` estimator.
        df: Cleaned frame after ``drop_columns`` were removed (features + target).
        df_full: Cleaned frame *before* ``drop_columns`` were removed; the
            time-series plots read from it so that a date column excluded from
            the regression is still available.
        X, y: Feature matrix and target taken from ``df``.
        X_train, X_test, y_train, y_test: 80/20 split of ``X`` / ``y``.
        y_log_pred, y_pred: Test-set predictions on the log and original scale.
    """

    # pandas 2.2 renamed the period-end aliases ("M" -> "ME", ...) and
    # deprecated the old spelling; older releases only know the old one.
    # The map lets _resample_mean retry with the other spelling.
    _FREQ_ALIASES = {
        "M": "ME", "ME": "M",
        "Q": "QE", "QE": "Q",
        "Y": "YE", "YE": "Y",
    }

    def __init__(self, file_path):
        """Store the CSV path and create an unfitted estimator."""
        self.file_path = file_path
        self.model = LinearRegression()
        self.df = None
        self.df_full = None
        self.X = self.y = None
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None
        self.y_pred = self.y_log_pred = None

    # ------------------------------------------------------------------ #
    # Data loading and model fitting
    # ------------------------------------------------------------------ #
    def load_and_prepare_data(self, target_column, drop_columns=None):
        """Read the CSV, drop incomplete rows and build the train/test split.

        Args:
            target_column: Name of the column to predict.
            drop_columns: Column name(s) to exclude from the regression
                frame. ``None`` (the default) drops nothing.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist (from pandas).
            KeyError: If ``target_column`` or a dropped column is missing.
        """
        # A fresh list per call instead of a shared mutable default.
        columns_to_drop = [] if drop_columns is None else drop_columns

        cleaned = pd.read_csv(self.file_path).dropna()
        # Keep the un-dropped frame: the time-series plots need columns
        # (typically the date) that are excluded from the regression.
        self.df_full = cleaned
        self.df = cleaned.drop(columns=columns_to_drop)
        self.y = self.df[target_column]
        self.X = self.df.drop(columns=[target_column])

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )
        # Predictions computed on a previous dataset are no longer valid.
        self.y_pred = self.y_log_pred = None

    def train(self):
        """Fit the estimator on ``log(y_train)``.

        Raises:
            RuntimeError: If no training data has been prepared yet.
            ValueError: If the training target contains values <= 0, for
                which the logarithm is undefined.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError(
                "No training data available: call load_and_prepare_data() first."
            )
        if (np.asarray(self.y_train) <= 0).any():
            raise ValueError(
                "The target contains non-positive values; a log-linear model "
                "requires a strictly positive target."
            )
        self.model.fit(self.X_train, np.log(self.y_train))
        # A refit invalidates any cached predictions.
        self.y_pred = self.y_log_pred = None

    def predict(self):
        """Predict the test set; store log-scale and back-transformed values."""
        self.y_log_pred = self.model.predict(self.X_test)
        self.y_pred = np.exp(self.y_log_pred)

    def _ensure_predictions(self):
        """Compute test-set predictions if none are cached."""
        if getattr(self, "y_pred", None) is None:
            self.predict()

    def evaluate(self):
        """Print R², RMSE (original scale) and the fitted coefficients."""
        self._ensure_predictions()
        r2 = r2_score(self.y_test, self.y_pred)
        # sqrt(MSE) instead of the `squared=False` keyword, which newer
        # scikit-learn releases deprecate and then remove.
        rmse = float(np.sqrt(mean_squared_error(self.y_test, self.y_pred)))
        print("R²:", round(r2, 3))
        print("RMSE:", round(rmse, 3))
        print("\nModel Coefficients:")
        for feature, coef in zip(self.X.columns, self.model.coef_):
            print(f"{feature}: {round(coef, 4)}")

    # ------------------------------------------------------------------ #
    # Regression diagnostics
    # ------------------------------------------------------------------ #
    def plot_residuals(self):
        """Scatter the original-scale residuals against the predictions."""
        self._ensure_predictions()
        residuals = self.y_test - self.y_pred
        _, ax = plt.subplots()
        ax.scatter(self.y_pred, residuals)
        ax.axhline(0, color='red', linestyle='--')
        ax.set_xlabel("Predicted Values")
        ax.set_ylabel("Residuals")
        ax.set_title("Residuals vs Predicted Values")
        ax.grid(True)
        plt.show()

    def plot_predictions_vs_actuals(self):
        """Scatter predictions against actual values with the identity line."""
        self._ensure_predictions()
        lower, upper = self.y_test.min(), self.y_test.max()
        _, ax = plt.subplots()
        ax.scatter(self.y_test, self.y_pred, alpha=0.7)
        ax.plot([lower, upper], [lower, upper], 'r--')
        ax.set_xlabel("Actual Values")
        ax.set_ylabel("Predicted Values")
        ax.set_title("Predicted vs Actual")
        ax.grid(True)
        plt.show()

    def plot_qqplot(self):
        """Draw a normal Q-Q plot of the original-scale residuals."""
        self._ensure_predictions()
        residuals = self.y_test - self.y_pred
        # fit=True standardises the residuals (fitted loc/scale) so that the
        # 45-degree reference line is a meaningful comparison.
        figure = sm.qqplot(residuals, line='45', fit=True)
        axis = figure.axes[0]
        axis.set_title("Q-Q Plot of Residuals")
        axis.grid(True)
        plt.show()

    # ------------------------------------------------------------------ #
    # Persistence
    # ------------------------------------------------------------------ #
    def save_model(self, filename="log_linear_model.pkl"):
        """Serialise the estimator to ``filename`` with joblib."""
        joblib.dump(self.model, filename)
        print(f"Model saved to {filename}")

    def load_model(self, filename="log_linear_model.pkl"):
        """Replace the estimator with one loaded from ``filename``.

        NOTE(security): joblib files are pickle-based; only load files from a
        trusted source.
        """
        self.model = joblib.load(filename)
        # Cached predictions came from the previous estimator.
        self.y_pred = self.y_log_pred = None
        print(f"Model loaded from {filename}")

    def get_feature_importance(self, plot=True):
        """Return absolute coefficients sorted ascending, optionally plotted.

        Returns:
            A ``pandas.Series`` indexed by feature name, or ``None`` when the
            model is not fitted or no feature matrix has been loaded.
        """
        if self.X is None or not hasattr(self.model, "coef_"):
            return None
        importance = pd.Series(self.model.coef_, index=self.X.columns)
        importance = importance.abs().sort_values(ascending=True)
        if plot:
            _, ax = plt.subplots()
            importance.plot(kind='barh', ax=ax)
            ax.set_title("Feature Importance (Absolute Coefficients)")
            ax.grid(True)
            plt.show()
        return importance

    # ------------------------------------------------------------------ #
    # Time-series views
    # ------------------------------------------------------------------ #
    @classmethod
    def _resample_mean(cls, series, freq):
        """Resample ``series`` to ``freq`` means, retrying with the alias spelling."""
        try:
            return series.resample(freq).mean()
        except ValueError as original_error:
            alternate = cls._FREQ_ALIASES.get(freq) if isinstance(freq, str) else None
            if alternate is None:
                raise
            try:
                return series.resample(alternate).mean()
            except ValueError:
                # The alias did not help: surface the original problem.
                raise original_error

    def _resampled_series(self, date_column, target_column, freq):
        """Return ``target_column`` averaged per ``freq`` on a datetime index.

        Rows whose date cannot be parsed are discarded.
        """
        frame = getattr(self, "df", None)
        full = getattr(self, "df_full", None)
        needs_full = frame is None or date_column not in frame.columns
        if needs_full and full is not None:
            # The date column was dropped for the regression; use the
            # un-dropped frame instead.
            frame = full
        if frame is None:
            raise RuntimeError(
                "No data loaded: call load_and_prepare_data() first."
            )
        data = frame[[date_column, target_column]].copy()
        data[date_column] = pd.to_datetime(data[date_column], errors="coerce")
        data = data.dropna(subset=[date_column]).set_index(date_column)
        return self._resample_mean(data[target_column], freq)

    def plot_time_series(self, date_column="Date", target_column="TempMoyenne", freq="M"):
        """Plot the per-``freq`` mean of ``target_column`` over time."""
        ts = self._resampled_series(date_column, target_column, freq)
        _, ax = plt.subplots(figsize=(12, 4))
        ts.plot(ax=ax, title=f"{target_column} over time")
        ax.set_xlabel("Date")
        ax.set_ylabel(target_column)
        ax.grid(True)
        plt.show()

    def plot_moving_average(self, date_column="Date", target_column="TempMoyenne", window=3, freq="M"):
        """Plot the resampled series together with its rolling mean.

        Args:
            date_column: Column holding the dates.
            target_column: Column to average.
            window: Number of resampled periods in the rolling window.
            freq: Resampling frequency (monthly by default).
        """
        ts = self._resampled_series(date_column, target_column, freq)
        ma = ts.rolling(window=window).mean()
        monthly = freq in ("M", "ME")
        series_label = "Monthly Avg" if monthly else f"{freq} Avg"
        average_label = (
            f"{window}-Month Moving Avg" if monthly else f"{window}-Period Moving Avg"
        )
        _, ax = plt.subplots(figsize=(12, 4))
        ts.plot(ax=ax, label=series_label)
        ma.plot(ax=ax, label=average_label)
        ax.set_title(f"{target_column} with Moving Average")
        ax.set_xlabel("Date")
        ax.set_ylabel(target_column)
        ax.legend()
        ax.grid(True)
        plt.show()

    def plot_seasonal_decomposition(self, date_column="Date", target_column="TempMoyenne", freq="M", model="additive", period=12):
        """Plot the statsmodels seasonal decomposition of the resampled series.

        Args:
            date_column: Column holding the dates.
            target_column: Column to decompose.
            freq: Resampling frequency.
            model: Decomposition type passed to ``seasonal_decompose``.
            period: Observations per seasonal cycle (12 suits monthly data;
                adjust it when ``freq`` changes).
        """
        ts = self._resampled_series(date_column, target_column, freq)
        decomposition = seasonal_decompose(ts, model=model, period=period)
        figure = decomposition.plot()
        figure.suptitle("Seasonal Decomposition", fontsize=16)
        figure.tight_layout()
        plt.show()

In [3]:
# Build the model wrapper, then run the pipeline: load -> train -> predict -> evaluate.
DATA_PATH = "./data/climat_haiti.csv"
TARGET_COLUMN = "TempMoyenne"

model = LogLinearModel(DATA_PATH)
# The "Date" column is excluded from the regression frame.
model.load_and_prepare_data(drop_columns=["Date"], target_column=TARGET_COLUMN)
model.train()
model.predict()
model.evaluate()

FileNotFoundError: [Errno 2] No such file or directory: './data/climat_haiti.csv'

In [None]:
# Regression diagnostics: render each diagnostic figure in turn, then show
# the coefficient-based feature importance as the cell's result.
diagnostic_plots = (
    model.plot_predictions_vs_actuals,
    model.plot_residuals,
    model.plot_qqplot,
)
for draw_plot in diagnostic_plots:
    draw_plot()
model.get_feature_importance()

In [None]:
# Time-series views of the target: resampled series, moving average, and
# seasonal decomposition. All three share the same date/target columns.
series_columns = {"date_column": "Date", "target_column": "TempMoyenne"}

model.plot_time_series(freq="M", **series_columns)
model.plot_moving_average(window=6, **series_columns)
model.plot_seasonal_decomposition(freq="M", model="additive", **series_columns)