## Exercise 6: Choosing the best performing model on a dataset




In [277]:
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

## Dataset File

In [278]:
# Training CSV of the house-prices dataset, read from a GitHub URL pinned to a
# specific commit so the data does not change between runs.
train_data = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/3fd7d51ffd17863598ac3f44eeefc558171a5b73/dataset/house-prices-advanced-regression-techniques/train.csv?raw=true'
df = pd.read_csv(train_data)

## Test File

In [279]:
# Test CSV of the same dataset (same pinned commit as the training file).
test_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/3fd7d51ffd17863598ac3f44eeefc558171a5b73/dataset/house-prices-advanced-regression-techniques/test.csv?raw=true'
dt = pd.read_csv(filepath_or_buffer=test_url)

In [280]:
# Summarise the test frame: column names, non-null counts and dtypes.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

## Sample Submission File

In [281]:
# Sample submission CSV (same pinned commit), read to inspect the expected output layout.
sample_submission_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/3fd7d51ffd17863598ac3f44eeefc558171a5b73/dataset/house-prices-advanced-regression-techniques/sample_submission.csv?raw=true'
sf = pd.read_csv(filepath_or_buffer=sample_submission_url)

In [282]:
# Summarise the sample submission: column names, non-null counts and dtypes.
sf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1459 non-null   int64  
 1   SalePrice  1459 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 22.9 KB


In [283]:
# Target: the sale price of every training row.
y = df['SalePrice']

# Keep the row identifiers before 'Id' is removed from the feature matrix.
train_ids = df['Id']
test_ids = dt['Id']

# Stack the training features on top of the test features so that imputation
# and one-hot encoding produce the same columns for both; 'Id' is dropped.
all_data = pd.concat([df.drop(columns=['SalePrice']), dt]).drop(columns='Id')

# Fill numeric gaps with the column median.
numeric_columns = all_data.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_columns:
    all_data[col] = all_data[col].fillna(all_data[col].median())

# Fill gaps in object (string) columns with the most frequent value.
categorical_columns = all_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# One-hot encode; drop_first removes one level per encoded column.
all_data = pd.get_dummies(all_data, drop_first=True)

# Split the stacked frame back apart: the first len(df) rows are the training
# rows, the remainder are the rows to predict for the submission.
n_train = len(df)
X = all_data.iloc[:n_train]
X_submission = all_data.iloc[n_train:]

# Hold out 20% of the labelled rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Preprocessing complete.")
print(f"Training features shape: {x_train.shape}")
print(f"Testing features shape: {x_test.shape}")


Preprocessing complete.
Training features shape: (1168, 244)
Testing features shape: (292, 244)


In [284]:
# Empty dict, presumably intended to collect per-model scores.
# NOTE(review): no visible cell in this notebook reads or writes it.
score_list = {}

## 1. Train a KNN Regressor

In [285]:
# Fit a 10-nearest-neighbours regressor on the training split, then report its
# score (R^2 for scikit-learn regressors) on the hold-out split.
knn = KNeighborsRegressor(n_neighbors=10).fit(x_train, y_train)
knn_score = knn.score(x_test, y_test)

print(f"{knn_score:.4f}")

0.6556


- Perform cross validation

In [286]:
# 5-fold cross-validated R^2 for KNN over the full labelled set.
cv_scores_knn = cross_val_score(estimator=knn, X=X, y=y, scoring='r2', cv=5)

print(f"{cv_scores_knn}")
print(f"{cv_scores_knn.mean():.4f}")

[0.6789866  0.69726968 0.62080505 0.66948637 0.59813612]
0.6529


## 2. Train an SVM Regressor

In [287]:
# Fit a linear-kernel support vector regressor on the training split, then
# report its score (R^2 for scikit-learn regressors) on the hold-out split.
svr = SVR(kernel='linear').fit(x_train, y_train)
svr_score = svr.score(x_test, y_test)

print(f"{svr_score:.4f}")

0.7899


- Perform cross validation

In [288]:
# 5-fold cross-validated R^2 for the SVR over the full labelled set.
cv_scores_svr = cross_val_score(estimator=svr, X=X, y=y, scoring='r2', cv=5)

print(f"{cv_scores_svr}")
print(f"{cv_scores_svr.mean():.4f}")

[0.80069499 0.75164891 0.78137796 0.77900434 0.55889889]
0.7343


## 3. Train a Decision Tree Regressor

In [289]:
# Fit a decision tree (seeded for reproducibility) on the training split, then
# report its score (R^2 for scikit-learn regressors) on the hold-out split.
dtr = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)
dtr_score = dtr.score(x_test, y_test)

print(f"{dtr_score:.4f}")

0.7649


- Perform cross validation

In [290]:
# 5-fold cross-validated R^2 for the decision tree over the full labelled set.
cv_scores_dtr = cross_val_score(estimator=dtr, X=X, y=y, scoring='r2', cv=5)

print(f"{cv_scores_dtr}")
print(f"{cv_scores_dtr.mean():.4f}")

[0.70996779 0.68245144 0.84307131 0.78834496 0.69698834]
0.7442


## 4. Train a Random Forest Regressor

In [291]:
# Fit a 100-tree random forest on the training split; the fixed seed keeps the
# run reproducible.
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(x_train, y_train)

# Score (R^2 for scikit-learn regressors) on the hold-out split.
rfr_score = rfr.score(x_test, y_test)
print(f"{rfr_score:.4f}")

# 5-fold cross-validation on the full labelled set, with the same settings as
# the other models. The comparison cell reads `cv_scores_rfr`; without this
# step it raises NameError on a fresh kernel. cross_val_score fits clones of
# the estimator, so `rfr` itself stays fitted on x_train.
cv_scores_rfr = cross_val_score(rfr, X, y, cv=5, scoring='r2')

print(f"{cv_scores_rfr}")
print(f"{cv_scores_rfr.mean():.4f}")

0.8921


## 5. Compare the performance of all regression models

In [296]:
# Mean 5-fold cross-validated R^2 per model, keyed by display name.
model_performance = {
    'Random Forest': cv_scores_rfr.mean(),
    'SVR': cv_scores_svr.mean(),
    'KNN Regressor': cv_scores_knn.mean(),
    'Decision Tree': cv_scores_dtr.mean()
}

# One row per model, best score first.
score_column = 'Average CV R^2 Score'
performance_df = (
    pd.DataFrame(list(model_performance.items()), columns=['Model', score_column])
    .sort_values(by=score_column, ascending=False)
)

print("Model Performance Comparison (based on Average 5-Fold CV Score):")
print(performance_df)


Model Performance Comparison (based on Average 5-Fold CV Score):
           Model  Average CV R^2 Score
0  Random Forest              0.835924
3  Decision Tree              0.744165
1            SVR              0.734325
2  KNN Regressor              0.652937


## 6. Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [293]:

# Identifiers of the test rows, needed for the submission file.
test_ids = dt['Id']

# Raw (un-imputed, un-encoded) train + test features, stacked only to report
# the combined shape. It gets its own name: rebinding `all_data` here would
# replace the preprocessed frame that X and X_submission were sliced from and
# leave the notebook state inconsistent for any cell that is re-run afterwards.
raw_combined = pd.concat(
    [df.drop(['SalePrice', 'Id'], axis=1), dt.drop('Id', axis=1)],
    ignore_index=True,
)

print("Training and test data combined.")
print("Combined data shape:", raw_combined.shape)

Training and test data combined.
Combined data shape: (2919, 79)


In [294]:
# The random forest had the highest average cross-validated R^2 in the
# comparison above, so it produces the submission predictions.
best_model = rfr
predictions = best_model.predict(X_submission)

# Two-column frame: test-row identifier and predicted sale price.
submission_df = pd.DataFrame({'Id': test_ids}).assign(SalePrice=predictions)

submission_df.to_csv('submission.csv', index=False)
print("Submission file created successfully: submission.csv")
submission_df.head()

Submission file created successfully: submission.csv


Unnamed: 0,Id,SalePrice
0,1461,127819.5
1,1462,152969.0
2,1463,179443.53
3,1464,187984.85
4,1465,205530.0
