Import required libraries for data manipulation and modeling.

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### Read the dataset using Pandas

In [3]:
data = pd.read_csv('HousingData.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [4]:
data.describe()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,486.0,486.0,486.0,486.0,506.0,506.0,486.0,506.0,506.0,506.0,506.0,506.0,486.0,506.0
mean,3.611874,11.211934,11.083992,0.069959,0.554695,6.284634,68.518519,3.795043,9.549407,408.237154,18.455534,356.674032,12.715432,22.532806
std,8.720192,23.388876,6.835896,0.25534,0.115878,0.702617,27.999513,2.10571,8.707259,168.537116,2.164946,91.294864,7.155871,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.0819,0.0,5.19,0.0,0.449,5.8855,45.175,2.100175,4.0,279.0,17.4,375.3775,7.125,17.025
50%,0.253715,0.0,9.69,0.0,0.538,6.2085,76.8,3.20745,5.0,330.0,19.05,391.44,11.43,21.2
75%,3.560263,12.5,18.1,0.0,0.624,6.6235,93.975,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      486 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    486 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


we have verified our dataset and now we can see that we have some missing values and we should handle them
initially i will just drop them but eventually i will try to Imputing Missing Values.

In [6]:
# data_cleaned = data.fillna(d.mean())

In [7]:


# Replace 'NA' with NaN
data.replace('NA', pd.NA, inplace=True)

# Check for missing values again after replacing 'NA' with NaN
missing_values = data.isnull().sum()
print("Columns with missing values:")
print(missing_values[missing_values > 0])

# Drop rows with any missing values
# data_cleaned = data.dropna()
data = data.fillna(data.mean())
# Check dataset information after cleaning
print("\nDataset Info After Cleaning:")
data.info()
# Check for missing values again
missing_values = data.isnull().sum()
print("Columns with missing values:")
print(missing_values[missing_values > 0])
data_types = data.dtypes


Columns with missing values:
CRIM     20
ZN       20
INDUS    20
CHAS     20
AGE      20
LSTAT    20
dtype: int64

Dataset Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB
Columns with missing values:
Series([], dtype: int64)


the next step is to calculate correlations between features and the target column

In [8]:
correlation_matrix = data.corr()
target_correlation = correlation_matrix["MEDV"].abs().sort_values(ascending=False)


In [9]:
print(target_correlation)

MEDV       1.000000
LSTAT      0.721975
RM         0.695360
PTRATIO    0.507787
INDUS      0.478657
TAX        0.468536
NOX        0.427321
RAD        0.381626
AGE        0.380223
CRIM       0.379695
ZN         0.365943
B          0.333461
DIS        0.249929
CHAS       0.179882
Name: MEDV, dtype: float64


### now we can study the correlation between features
and explain the business point of view 
firslty lets see what is MEDV
MEDV -  represents the typical price of homes owned by residents in a specific area

LSTAT(Percent Lower Status of the Population) - cause more people in a neighborhood have lower incomes or jobs with lower statuses, it means that area might not have enough money to have nice things like parks, good schools or stores, it also can mean that there could be more problems like crime, because of that houses in these areas might not be as valuable compared to places where more people have higher incomes or better jobs.

RM (Average Number of Rooms) - more rooms imply larger house, appealing to families with highear income. larger house tend to have higher values influencing median home values. 

PTRATIO (Pupil-Teacher Ratio) - Lower pupil-teacher ratios are linked to better-funded schools and more personalized education. This suggests that the cost of education is higher in these areas, where people with higher incomes tend to live.

Now we will create a second set of data with the columns that have an absolute correlation between 0.5 and 0.8 with the target column

In [10]:
selected_features = target_correlation[(target_correlation >= 0.3) & (target_correlation < 0.8)].index.tolist()
subset_data = data[selected_features + ["MEDV"]]


Split the data into 2 sub-sets using the train_test_split function from sklearn.

In [54]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the entire dataset and transform it
data = scaler.fit_transform(data)

X = data[:, :-1]  
y = data[:, -1] 

# Split the scaled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (404, 13)
X_test shape: (102, 13)
y_train shape: (404,)
y_test shape: (102,)


Now we will Train a sklearn Linear Regression model on the data provided to you.

In [62]:
model_sk = LinearRegression()
model_sk.fit(X_train, y_train)



Train a from-scratch implementation of Linear Regression on the train sub-set.

In [69]:
class Linearregression:

    def __init__(self, learning_rate=0.0001, n_iters=100000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            y_predicted = np.dot(X, self.weights) + self.bias
           
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db


    def predict(self, X):
        y_approximated = np.dot(X, self.weights) + self.bias
        return y_approximated

In [70]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
model = Linearregression()
  
model.fit( X_train, y_train )


In [80]:
from sklearn.metrics import r2_score

# Assuming model_sk is your scikit-learn Linear Regression model
model_sk_predictions = model_sk.predict(X_test)
r2_sk = model_sk.score(X_test, y_test)
print(f"Scikit-learn Linear Regression R-squared score: {r2_sk}")

# Assuming model_custom is your custom Linear Regression model
model_custom = Linearregression()
model_custom.fit(X_train, y_train)
model_custom_predictions = model_custom.predict(X_test)
r2_custom = r2_score(y_test, model_custom_predictions)
print(f"Custom Linear Regression R-squared score: {r2_custom}")


Scikit-learn Linear Regression R-squared score: 0.6245520716630016
Custom Linear Regression R-squared score: 0.6245187041915258


In [85]:
# Split the scaled data into train and test sets
subset_data = scaler.fit_transform(subset_data)

X = subset_data[:, :-1]  
y = subset_data[:, -1] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (404, 3)
X_test shape: (102, 3)
y_train shape: (404,)
y_test shape: (102,)


In [86]:
model_sk = LinearRegression()
model_sk.fit(X_train, y_train)

In [87]:
from sklearn.preprocessing import StandardScaler
model = Linearregression()
  
model.fit( X_train, y_train )

In [88]:

# Assuming model_sk is your scikit-learn Linear Regression model
model_sk_predictions = model_sk.predict(X_test)
r2_sk = model_sk.score(X_test, y_test)
print(f"Scikit-learn Linear Regression R-squared score: {r2_sk}")

# Assuming model_custom is your custom Linear Regression model
model_custom = Linearregression()
model_custom.fit(X_train, y_train)
model_custom_predictions = model_custom.predict(X_test)
r2_custom = r2_score(y_test, model_custom_predictions)
print(f"Custom Linear Regression R-squared score: {r2_custom}")

Scikit-learn Linear Regression R-squared score: 0.6245520716630014
Custom Linear Regression R-squared score: 0.6245187041915258


The similarity in R-squared scores across different subsets of data and models suggests that the subset containing columns highly correlated with the target variable retains significant predictive power. However, it's also worth noting that while these columns might have a strong relationship with the target, other factors or interactions not considered in these subsets could contribute to model improvement.

Additionally, the consistency in model performance across different implementations (Scikit-learn vs. custom implementation) indicates the robustness of the linear regression approach and the predictive power of the selected features
