## **Predicting House Prices with Linear Regression**

#### Loading the Data

In [4]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the real-estate CSV into a DataFrame
file_path = "../real_estate.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as a quick sanity check
# of the column names and the values they hold
data.head(n=5)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


#### Selecting Features and a Target Variable

In [5]:
# Predictors: the X2-X6 columns (the row number and X1 transaction date are left out)
features = data.loc[
    :,
    [
        'X2 house age',
        'X3 distance to the nearest MRT station',
        'X4 number of convenience stores',
        'X5 latitude',
        'X6 longitude',
    ],
]

# Response: the house price per unit area
target = data.loc[:, 'Y house price of unit area']

# Show the first rows of the predictors, then of the response
print(features.head(), target.head(), sep="\n")

   X2 house age  X3 distance to the nearest MRT station  \
0          32.0                                84.87882   
1          19.5                               306.59470   
2          13.3                               561.98450   
3          13.3                               561.98450   
4           5.0                               390.56840   

   X4 number of convenience stores  X5 latitude  X6 longitude  
0                               10     24.98298     121.54024  
1                                9     24.98034     121.53951  
2                                5     24.98746     121.54391  
3                                5     24.98746     121.54391  
4                                5     24.97937     121.54245  
0    37.9
1    42.2
2    47.3
3    54.8
4    43.1
Name: Y house price of unit area, dtype: float64


#### Splitting the Data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition: feature matrices first, then targets
print(*(part.shape for part in (X_train, X_test)))
print(*(part.shape for part in (y_train, y_test)))

(331, 5) (83, 5)
(331,) (83,)


#### Training the Linear Regression Model

In [7]:
from sklearn.linear_model import LinearRegression

# Build the linear regression estimator and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Show the learned coefficients (one per feature column)
print("Model Coefficients:", model.coef_)

Model Coefficients: [-2.70593236e-01 -4.55249601e-03  1.10512079e+00  2.36092831e+02
 -2.39036942e+01]


#### Evaluating the Model

In [8]:
from sklearn.metrics import root_mean_squared_error, r2_score

# Use the fitted model to predict the target for the held-out test rows
y_pred = model.predict(X_test)

# Calculate the Root Mean Squared Error (not the plain MSE) and R-squared
# by comparing the test-set predictions with the true test-set targets
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Root Mean Squared Error: 7.387891796775484
R-squared: 0.6746481382828156


## **Diagnosing Diabetes Using Different Classification Models**

#### Loading the Data

In [9]:
# Read the diabetes CSV into a DataFrame
file_path = "../diabetes.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as a quick sanity check
# of the column names and the values they hold
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Cleaning Up the Data

In [10]:
# Keep only rows where 'Insulin', 'SkinThickness' and 'BloodPressure' are all non-zero.
# NOTE(review): a 0 in these columns presumably marks a missing measurement
# rather than a real reading — confirm against the dataset description.
df_cleaned = data[(data['Insulin'] != 0) & (data['SkinThickness'] != 0) & (data['BloodPressure'] != 0)]

# Check the shape of the data before and after cleaning
print("Original dataset shape:", data.shape)
print("Cleaned dataset shape:", df_cleaned.shape)

# The following cells read from `data`, so point it at the cleaned frame.
# Caveat: re-running this cell filters the already-cleaned frame, so the
# "Original" shape printed above is only meaningful on a fresh run.
data = df_cleaned

# Display the cleaned data. This has to be the last expression in the cell:
# a bare expression anywhere else is evaluated but its value is never shown.
df_cleaned.head()

Original dataset shape: (768, 9)
Cleaned dataset shape: (394, 9)


#### Selecting Features and a Target Variable

In [11]:
# Predictors: four of the measurement columns
features = data.loc[:, ['Glucose', 'BloodPressure', 'BMI', 'Age']]

# Response: the 'Outcome' label column
target = data.loc[:, 'Outcome']

# Show the first rows of the predictors, then of the response
print(features.head(), target.head(), sep="\n")

    Glucose  BloodPressure   BMI  Age
3        89             66  28.1   21
4       137             40  43.1   33
6        78             50  31.0   26
8       197             70  30.5   53
13      189             60  30.1   59
3     0
4     1
6     1
8     1
13    1
Name: Outcome, dtype: int64


#### Splitting the Data

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition: feature matrices first, then targets
print(*(part.shape for part in (X_train, X_test)))
print(*(part.shape for part in (y_train, y_test)))

(315, 4) (79, 4)
(315,) (79,)


#### Training and Evaluating K-Nearest Neighbors (KNN) Classifier

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Build a K-Nearest Neighbors classifier and fit it on the training split.
# n_neighbors=5 is a starting point and can be adjusted.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict a label for every held-out test row
y_pred_knn = knn.predict(X_test)

# Score the predictions against the true test labels
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
confusion_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)

# Report the accuracy, then the confusion matrix on its own lines
print("KNN Accuracy:", accuracy_knn)
print("KNN Confusion Matrix:\n", confusion_knn)

KNN Accuracy: 0.6835443037974683
KNN Confusion Matrix:
 [[43 11]
 [14 11]]


#### Training and Evaluating Decision Tree Classifier

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Build a Decision Tree classifier (seeded so the fitted tree is repeatable)
# and fit it on the training split. fit() returns the estimator itself,
# so construction and fitting can be chained.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict a label for every held-out test row
y_pred_dt = dt.predict(X_test)

# Score the predictions against the true test labels
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
confusion_dt = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)

# Report the accuracy, then the confusion matrix on its own lines
print("Decision Tree Accuracy:", accuracy_dt)
print("Decision Tree Confusion Matrix:\n", confusion_dt)

Decision Tree Accuracy: 0.7341772151898734
Decision Tree Confusion Matrix:
 [[45  9]
 [12 13]]


#### Comparing KNN and Decision Tree Models

In [15]:
# Side-by-side summary of the two classifiers:
# both accuracy scores on one line (KNN first, Decision Tree second) ...
print(
    "KNN Accuracy vs Decision Tree Accuracy:",
    accuracy_knn,
    accuracy_dt,
)

# ... followed by each model's confusion matrix
print(
    "KNN Confusion Matrix:\n",
    confusion_knn,
)
print(
    "Decision Tree Confusion Matrix:\n",
    confusion_dt,
)

KNN Accuracy vs Decision Tree Accuracy: 0.6835443037974683 0.7341772151898734
KNN Confusion Matrix:
 [[43 11]
 [14 11]]
Decision Tree Confusion Matrix:
 [[45  9]
 [12 13]]
