In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

# Assignment: Linear and logistic regression
The assignment consists of constructing two separate models for predicting the real estate prices in the dataset: one with linear and one with logistic regression.

1. Linear regression model: construct a linear regression model for predicting the continuous target variable "Y house price of unit area" in the dataset.

2. Logistic regression model: convert the target variable into a binary-valued one according to whether the original target value is above or below the average house price of unit area (within the training set samples), and construct a binary classifier for predicting its value with logistic regression.

Both models should be validated, with appropriate metrics presented and discussed.

In [28]:
# Fetch the real estate valuation dataset (repository id 477).
# fetch_ucirepo is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
real_estate_valuation = fetch_ucirepo(id=477)

# Features and target (as pandas DataFrames)
X = real_estate_valuation.data.features
y = real_estate_valuation.data.targets

# First look at the raw data bundle (ids, features, ...) returned by the fetch
print(real_estate_valuation.data)



{'ids':       No
0      1
1      2
2      3
3      4
4      5
..   ...
409  410
410  411
411  412
412  413
413  414

[414 rows x 1 columns], 'features':      X1 transaction date  X2 house age  \
0               2012.917          32.0   
1               2012.917          19.5   
2               2013.583          13.3   
3               2013.500          13.3   
4               2012.833           5.0   
..                   ...           ...   
409             2013.000          13.7   
410             2012.667           5.6   
411             2013.250          18.8   
412             2013.000           8.1   
413             2013.500           6.5   

     X3 distance to the nearest MRT station  X4 number of convenience stores  \
0                                  84.87882                               10   
1                                 306.59470                                9   
2                                 561.98450                                5   
3                     

In [35]:
# Structural summary (index range, columns, non-null counts, dtypes):
# feature frame first, then the target frame.
for frame in (X, y):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 6 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   X1 transaction date                     414 non-null    float64
 1   X2 house age                            414 non-null    float64
 2   X3 distance to the nearest MRT station  414 non-null    float64
 3   X4 number of convenience stores         414 non-null    int64  
 4   X5 latitude                             414 non-null    float64
 5   X6 longitude                            414 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 19.5 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 1 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Y house price of unit area  414 non-null    float64
dtypes: float64(1)

## Business Understanding

###### The goal:
- The goal of this analysis is to create models that can predict house prices per unit area using different property features, such as:

    - House age  
    - Distance to the nearest MRT station  
    - Number of convenience stores nearby  
    - Latitude and longitude of the property  
    - Transaction date 

###### Requirements & Limits:

- Data: The dataset has 414 records describing real estate transactions and their house prices per unit area.
- Feature processing: Some preprocessing may be needed, such as scaling values, handling outliers, or transforming variables.
- Model choice: Linear regression will be used to predict the continuous price, and logistic regression will be used to classify whether a price is above or below the training-set average.

###### Expected Results:

- A model that can predict house prices based on property details.
- Insights into which factors impact house prices the most.
  

## Data Understanding
 
- We use a historical market data set of real estate valuations collected from Sindian Dist., New Taipei City, Taiwan.
- The dataset consists of 414 rows and 8 columns (a row number, six features, and the target) describing real estate transactions.
- Available data:
    - Transaction date
    - house age
    - distance to the nearest MRT station
    - number of convenience stores
    - latitude
    - longitude
    - house price of unit area

- Data types: the features are five float64 columns and one int64 column (number of convenience stores); the target is float64
- value distributions:
   use .describe() to show the value distributions, 
   - for example, house age:
      - Range: 0 to 43.8 years
      - mean: 17.71 years
      - std Dev: 11.39
      - Interpretation: Houses vary widely in age, from brand-new (0 years) to more than 40 years old.
- No missing values
- Latitude & Longitude: The range seems reasonable, but mapping the coordinates could reveal inconsistencies.


In [42]:
# Get descriptive statistics
describe_x = X.describe()
describe_y = y.describe()

# Join both summaries
summary = describe_x.join(describe_y)

# Display result
print(summary)


       X1 transaction date  X2 house age  \
count           414.000000    414.000000   
mean           2013.148971     17.712560   
std               0.281967     11.392485   
min            2012.667000      0.000000   
25%            2012.917000      9.025000   
50%            2013.167000     16.100000   
75%            2013.417000     28.150000   
max            2013.583000     43.800000   

       X3 distance to the nearest MRT station  \
count                              414.000000   
mean                              1083.885689   
std                               1262.109595   
min                                 23.382840   
25%                                289.324800   
50%                                492.231300   
75%                               1454.279000   
max                               6488.021000   

       X4 number of convenience stores  X5 latitude  X6 longitude  \
count                       414.000000   414.000000    414.000000   
mean                   

In [36]:
# NOTE(review): this repeats the earlier X.info() / y.info() cell;
# consider keeping only one of the two.
for dataset in [X, y]:
    dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 6 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   X1 transaction date                     414 non-null    float64
 1   X2 house age                            414 non-null    float64
 2   X3 distance to the nearest MRT station  414 non-null    float64
 3   X4 number of convenience stores         414 non-null    int64  
 4   X5 latitude                             414 non-null    float64
 5   X6 longitude                            414 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 19.5 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 1 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Y house price of unit area  414 non-null    float64
dtypes: float64(1)

In [32]:
# Column dtypes of the features, followed by the dtype of the target
print(X.dtypes, y.dtypes, sep="\n")

X1 transaction date                       float64
X2 house age                              float64
X3 distance to the nearest MRT station    float64
X4 number of convenience stores             int64
X5 latitude                               float64
X6 longitude                              float64
dtype: object
Y house price of unit area    float64
dtype: object


## Data Preparation
- The third phase is to preprocess the data.
- This includes cleaning the data, transforming the data, and selecting the relevant features.
- These steps should be documented in such detail that they can be reproduced later.

In [8]:
# Names of all variables the repository metadata lists for this dataset
variable_names = real_estate_valuation.variables["name"]
print(variable_names)



0                                        No
1                       X1 transaction date
2                              X2 house age
3    X3 distance to the nearest MRT station
4           X4 number of convenience stores
5                               X5 latitude
6                              X6 longitude
7                Y house price of unit area
Name: name, dtype: object


## Modeling
- The fourth phase is to choose a machine learning method and train the model.
- This phase also includes the validation of the model.
- Documentation needs include:
  - Which method was used, which parameters were used, and what was the performance of the model?

## Evaluation
- The fifth phase is to evaluate the model.
- How well does the model perform?
- Does it meet the business requirements?

## Deployment
- The final phase is to deploy the model.
- How will the model be used in practice?
- How will the results be communicated?
- This phase may involve creating a recommendation of how to use the model in practice, or what to do next.