# import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder , RobustScaler , StandardScaler, MinMaxScaler , OneHotEncoder , PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

# read and explore data

In [2]:
# Load dataset
data = pd.read_csv(r'C:\Users\Mahmoud\Downloads\python practical\regression_data.csv' , header=None)  

# Display the first few rows of the
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [4]:
data.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
47,48,20,FV,84.0,11096,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2007,WD,Normal,249700
236,237,20,RL,65.0,8773,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,185500
1070,1071,20,RL,72.0,10152,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2007,WD,Normal,135000
394,395,50,RL,60.0,10134,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2007,WD,Normal,109000
1350,1351,90,RL,91.0,11643,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2009,WD,Normal,200000


In [5]:
data.shape

(1460, 81)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1460 non-null   int64  
 1   1       1460 non-null   int64  
 2   2       1460 non-null   object 
 3   3       1201 non-null   float64
 4   4       1460 non-null   int64  
 5   5       1460 non-null   object 
 6   6       91 non-null     object 
 7   7       1460 non-null   object 
 8   8       1460 non-null   object 
 9   9       1460 non-null   object 
 10  10      1460 non-null   object 
 11  11      1460 non-null   object 
 12  12      1460 non-null   object 
 13  13      1460 non-null   object 
 14  14      1460 non-null   object 
 15  15      1460 non-null   object 
 16  16      1460 non-null   object 
 17  17      1460 non-null   int64  
 18  18      1460 non-null   int64  
 19  19      1460 non-null   int64  
 20  20      1460 non-null   int64  
 21  21      1460 non-null   object 
 22  

In [7]:
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

len(numerical_cols), len(categorical_cols)

(38, 43)

In [8]:
data.describe()

Unnamed: 0,0,1,3,4,17,18,19,20,26,34,...,66,67,68,69,70,71,75,76,77,80
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [9]:
data.describe(include='object')

Unnamed: 0,2,5,6,7,8,9,10,11,12,13,...,58,60,63,64,65,72,73,74,78,79
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


# check on duplicates

In [10]:
data.duplicated().sum()

np.int64(0)

# check and handle for missing values

In [11]:
data.isnull().sum()

0       0
1       0
2       0
3     259
4       0
     ... 
76      0
77      0
78      0
79      0
80      0
Length: 81, dtype: int64

In [12]:
data.isnull().sum().sum()

np.int64(7829)

In [13]:
# get cols with missing values
missing_cols = data.columns[data.isnull().any()].tolist()
len(missing_cols) , missing_cols

(19,
 [3, 6, 25, 26, 30, 31, 32, 33, 35, 42, 57, 58, 59, 60, 63, 64, 72, 73, 74])

In [14]:
for col in data.columns:
  if data[col].isnull().sum() > 0:
    print(col,"------>",data[col].isnull().sum()," -----> ", round(data[col].isnull().sum()/data.shape[0]*100,2), "%","------>",data[col].dtype)

3 ------> 259  ----->  17.74 % ------> float64
6 ------> 1369  ----->  93.77 % ------> object
25 ------> 872  ----->  59.73 % ------> object
26 ------> 8  ----->  0.55 % ------> float64
30 ------> 37  ----->  2.53 % ------> object
31 ------> 37  ----->  2.53 % ------> object
32 ------> 38  ----->  2.6 % ------> object
33 ------> 37  ----->  2.53 % ------> object
35 ------> 38  ----->  2.6 % ------> object
42 ------> 1  ----->  0.07 % ------> object
57 ------> 690  ----->  47.26 % ------> object
58 ------> 81  ----->  5.55 % ------> object
59 ------> 81  ----->  5.55 % ------> float64
60 ------> 81  ----->  5.55 % ------> object
63 ------> 81  ----->  5.55 % ------> object
64 ------> 81  ----->  5.55 % ------> object
72 ------> 1453  ----->  99.52 % ------> object
73 ------> 1179  ----->  80.75 % ------> object
74 ------> 1406  ----->  96.3 % ------> object


In [15]:
# get missung values percentage
for col in data.columns:
  # returns the percentage of missing values in each column higher than 15 %
  if round(data[col].isnull().sum()/data.shape[0]*100,2) > 15:
    print("Column ", col, " has more than 15 % missing values")

Column  3  has more than 15 % missing values
Column  6  has more than 15 % missing values
Column  25  has more than 15 % missing values
Column  57  has more than 15 % missing values
Column  72  has more than 15 % missing values
Column  73  has more than 15 % missing values
Column  74  has more than 15 % missing values


In [16]:
# cols to drop by index
cols_to_drop = [3,6,25,57,72,73,74]
data.drop(data.columns[cols_to_drop], axis=1, inplace=True)

In [17]:
# Select only numerical columns (int, float)
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Fill NaN with median for each numerical column
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

In [18]:
# Categorical: fill with mode (most frequent value)
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

In [19]:
data.isnull().sum().sum()

np.int64(0)

# check on categorical data

In [22]:
for col in categorical_cols:
    print(data[col].value_counts())
    print("#####################"*3)

2
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64
###############################################################
5
Pave    1454
Grvl       6
Name: count, dtype: int64
###############################################################
7
Reg    925
IR1    484
IR2     41
IR3     10
Name: count, dtype: int64
###############################################################
8
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: count, dtype: int64
###############################################################
9
AllPub    1459
NoSeWa       1
Name: count, dtype: int64
###############################################################
10
Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: count, dtype: int64
###############################################################
11
Gtl    1382
Mod      65
Sev      13
Name: count, dtype: int64
###############################################################
12
NAmes  

In [23]:
le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])
data.head()

Unnamed: 0,0,1,2,4,5,7,8,9,10,11,...,68,69,70,71,75,76,77,78,79,80
0,1,60,3,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000


# check on outliers and handle it

In [24]:
# Function to calculate outliers using IQR
def detect_outliers_iqr(data, feature):
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[feature] < lower_bound) | (data[feature] > upper_bound)]
    return outliers

# Check outliers in each numerical column
for col in numerical_cols:
    outliers = detect_outliers_iqr(data, col)
    print(f"{col}: {len(outliers)} outliers")

0: 0 outliers
1: 103 outliers
4: 69 outliers
17: 2 outliers
18: 125 outliers
19: 7 outliers
20: 0 outliers
26: 98 outliers
34: 7 outliers
36: 167 outliers
37: 29 outliers
38: 61 outliers
43: 20 outliers
44: 2 outliers
45: 26 outliers
46: 31 outliers
47: 1 outliers
48: 82 outliers
49: 0 outliers
50: 0 outliers
51: 35 outliers
52: 68 outliers
54: 30 outliers
56: 5 outliers
59: 1 outliers
61: 5 outliers
62: 21 outliers
66: 32 outliers
67: 77 outliers
68: 208 outliers
69: 24 outliers
70: 116 outliers
71: 7 outliers
75: 52 outliers
76: 0 outliers
77: 0 outliers
80: 61 outliers


In [25]:
def cap_outliers_iqr(data, feature):
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR  
    upper_bound = Q3 + 1.5 * IQR
    data[feature] = np.where(data[feature] < lower_bound, lower_bound,
                     np.where(data[feature] > upper_bound, upper_bound, data[feature]))
    return data

# Apply capping to all numeric features (except target)
for col in numerical_cols:
    data = cap_outliers_iqr(data, col)

In [26]:
# Check outliers in each numerical column
for col in numerical_cols:
    outliers = detect_outliers_iqr(data, col)
    print(f"{col}: {len(outliers)} outliers")

0: 0 outliers
1: 0 outliers
4: 0 outliers
17: 0 outliers
18: 0 outliers
19: 0 outliers
20: 0 outliers
26: 0 outliers
34: 0 outliers
36: 0 outliers
37: 0 outliers
38: 0 outliers
43: 0 outliers
44: 0 outliers
45: 0 outliers
46: 0 outliers
47: 0 outliers
48: 0 outliers
49: 0 outliers
50: 0 outliers
51: 0 outliers
52: 0 outliers
54: 0 outliers
56: 0 outliers
59: 0 outliers
61: 0 outliers
62: 0 outliers
66: 0 outliers
67: 0 outliers
68: 0 outliers
69: 0 outliers
70: 0 outliers
71: 0 outliers
75: 0 outliers
76: 0 outliers
77: 0 outliers
80: 0 outliers


# train test split

In [27]:
x  = data.drop(columns=[80])  # Features (all columns except target)
y = data[80]  # Target variable (assuming the first column is the target)

In [28]:
x.shape , y.shape

((1460, 73), (1460,))

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

In [30]:
x_train.shape , y_train.shape

((1051, 73), (1051,))

In [31]:
x_test.shape , y_test.shape

((292, 73), (292,))

In [32]:
x_val.shape , y_val.shape

((117, 73), (117,))

# modelling and evaluation

## base model

In [33]:
lr = LinearRegression()

lr.fit(x_train, y_train)

In [34]:
lr.score(x_train, y_train)

0.9022450648908114

In [35]:
lr.score(x_val, y_val)

0.8362383709716359

In [36]:
y_pred = lr.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2_score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("R-squared (R2 ):", r2_score)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 521844452.74529546
R-squared (R2 ): 0.8932669722226907
Mean Absolute Error (MAE): 16575.812951330416


## apply standarization or normalization

### standarization

In [37]:
std_scaler = StandardScaler()

x_train_std = std_scaler.fit_transform(x_train)
x_val_std = std_scaler.transform(x_val)
x_test_std = std_scaler.transform(x_test)

In [38]:
lr = LinearRegression()

lr.fit(x_train_std, y_train)

In [39]:
lr.score(x_train_std, y_train)

0.9022450648908114

In [40]:
lr.score(x_val_std, y_val)

0.8362383709716379

In [41]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

y_pred_std = lr.predict(x_test_std)

mse = mean_squared_error(y_test, y_pred_std)
r2_res = r2_score(y_test, y_pred_std)   # ✅ now it's the function
mae = mean_absolute_error(y_test, y_pred_std)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2_res)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 521844452.745277
R-squared (R2): 0.8932669722226945
Mean Absolute Error (MAE): 16575.81295133028


### normalization

In [42]:
mm_scaler = MinMaxScaler()

x_train_mm = mm_scaler.fit_transform(x_train)
x_val_mm = mm_scaler.transform(x_val)
x_test_mm = mm_scaler.transform(x_test)

In [43]:
lr = LinearRegression()

lr.fit(x_train_mm, y_train)

In [44]:
lr.score(x_train_mm, y_train)

0.9022450648908114

In [45]:
lr.score(x_val_mm, y_val)

0.8362383709716376

In [46]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

y_pred_mm = lr.predict(x_test_mm)

mse = mean_squared_error(y_test, y_pred_mm)
r2_res = r2_score(y_test, y_pred_mm)   
mae = mean_absolute_error(y_test, y_pred_mm)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2_res)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 521844452.74527854
R-squared (R2): 0.8932669722226941
Mean Absolute Error (MAE): 16575.8129513303


### lasso vs ridge regression

#### lasso model

In [47]:
model = Lasso(alpha=1000)
model.fit(x_train, y_train)

In [48]:
model.score(x_train, y_train)

0.8901869866321881

In [49]:
model.score(x_val, y_val)

0.8451011559346313

In [50]:
y_pred_lasso = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred_lasso)
r2_res = r2_score(y_test, y_pred_lasso)
mae = mean_absolute_error(y_test, y_pred_lasso)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2_res)    
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 452912916.7368874
R-squared (R2): 0.9073655633044071
Mean Absolute Error (MAE): 15682.851612143768


#### ridge model 

In [51]:
model = Ridge(alpha=10)
model.fit(x_train, y_train)

In [52]:
model.score(x_train, y_train)

0.9020252185632682

In [53]:
model.score(x_val, y_val)

0.8369365367833936

In [54]:
y_pred_ridge = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred_ridge)
r2_res = r2_score(y_test, y_pred_ridge)
mae = mean_absolute_error(y_test, y_pred_ridge)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2_res)    
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 513483608.3548741
R-squared (R2): 0.89497701863953
Mean Absolute Error (MAE): 16476.735684886196


### polynomial features

In [55]:
poly = PolynomialFeatures(degree=4, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_val_poly = poly.transform(x_val)
x_test_poly = poly.transform(x_test)

In [56]:
model = LinearRegression()
model.fit(x_train_poly, y_train)

In [57]:
model.score(x_train_poly, y_train)

1.0

In [58]:
model.score(x_val_poly, y_val)

-2.38305147386974

In [59]:
y_pred = model.predict(x_test_poly)

mse = mean_squared_error(y_test, y_pred)
r2_res = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2_res)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 9994476916.002394
R-squared (R2): -1.0441738466002648
Mean Absolute Error (MAE): 46945.2309980132
