<a href="https://colab.research.google.com/github/LamiaAlariqi/shi_task/blob/main/Task6HW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import os
import tarfile
import urllib

In [None]:

# Where the housing archive lives remotely, and where it is unpacked locally.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
HOUSING_PATH = os.path.join("datasets", "housing")

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive.
    housing_path : str
        Directory that receives the archive and its extracted contents;
        it is created if it does not exist yet.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Download only when the CSV is not already on disk, so re-running the
# notebook does not hit the network (or fail offline) every time.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()
housing = load_housing_data()

In [None]:
# Positional indices of the columns that CombinedAttributesAdder reads from
# the NumPy array it receives.
_housing_columns = housing.columns.tolist()
rooms_ix = _housing_columns.index("total_rooms")
bedrooms_ix = _housing_columns.index("total_bedrooms")
population_ix = _housing_columns.index("population")
household_ix = _housing_columns.index("households")

In [None]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array.

    The source columns are located with the module-level indices ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms-per-room column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        # Order matters to downstream consumers: rooms/household first,
        # population/household second, bedrooms/room (optional) last.
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]

In [None]:
# Display the housing DataFrame returned by load_housing_data().
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Separate the target from the training predictors.
# NOTE: from here on `housing` refers to the training predictors only, no
# longer to the full dataset.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

# Every column except the categorical one is treated as numeric.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns="ocean_proximity")
num_attribs = housing_num.columns.tolist()

In [None]:
# Numeric columns: fill gaps with the median, append the ratio features,
# then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Numeric columns go through num_pipeline; the categorical column is
# one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Display the array produced by full_pipeline.fit_transform(housing).
housing_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [None]:
# Preprocess the train and test sets

In [None]:
# One-hot encode `ocean_proximity` and build the model-ready train/test frames.
train_set_cat = train_set[["ocean_proximity"]]
test_set_cat = test_set[["ocean_proximity"]]

# Fit the encoder on the training data only and reuse it for the test data, so
# both frames get the same one-hot columns in the same order. (Fitting a second
# encoder on the test set can silently produce a different column layout.)
ohe_train = OneHotEncoder()
df_ohe_train_cat = pd.DataFrame(ohe_train.fit_transform(train_set_cat).toarray())
df_ohe_test_cat = pd.DataFrame(ohe_train.transform(test_set_cat).toarray())

# Rebind rather than drop in place, so the frames returned by train_test_split
# are not mutated. reset_index gives a 0..n-1 index that lines up with the
# one-hot frames without hard-coding the row counts.
train_set = train_set.drop(columns="ocean_proximity").reset_index(drop=True)
test_set = test_set.drop(columns="ocean_proximity").reset_index(drop=True)

# Fill missing numeric values with medians learned from the training set only;
# estimators such as LinearRegression cannot be fitted on NaN.
train_medians = train_set.median(numeric_only=True)
train_set_full = train_set.fillna(train_medians).join(df_ohe_train_cat)
test_set_full = test_set.fillna(train_medians).join(df_ohe_test_cat)

# NOTE(review): both frames still contain `median_house_value`. It is only
# removed from train_set_full right before the grid search, so every model
# fitted on train_set_full before that point sees the target among its inputs.

Trained using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the full training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=train_set_full.to_numpy(), y=housing_labels)

Trying this code out on a few data

In [None]:
# Take the first five test rows together with their own labels. The labels
# must come from the test set too, not from the training labels.
# `.copy()` makes some_data an independent frame, so filling values in it later
# neither warns nor touches test_set_full.
some_data = test_set_full.iloc[:5].copy()
some_labels = test_set["median_house_value"].iloc[:5]


In [None]:
# Count the missing values in each column of the sample rows.
some_data.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        5
population            0
households            0
median_income         0
median_house_value    0
0                     0
1                     0
2                     0
3                     0
4                     0
dtype: int64

In [None]:
# Rebind instead of filling in place: an in-place fill on a frame that was
# sliced from another one triggers pandas' SettingWithCopyWarning.
some_data = some_data.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  some_data.fillna(0,inplace=True)


In [None]:
# Predict on the sample rows and show the result.
predictions = lin_reg.predict(some_data.to_numpy())
print(predictions)

[ 47700.  45800. 500001. 218600. 278000.]


measure this regression model’s RMSE on the whole training set

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Score the linear model on the same rows it was trained on.
housing_predictions = lin_reg.predict(train_set_full.to_numpy())
mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
rmse = np.sqrt(mse)

print("RMSE on training set:", rmse)

RMSE on training set: 6.515974711844028e-11


judge on the RMSE result for this model:

The RMSE is essentially zero (about 6.5e-11), not high. This does not mean the model is accurate: the training frame still contains the `median_house_value` column, so the model simply reads the target back from its own inputs (target leakage). The score only becomes meaningful once that column is removed from the features.

In [None]:
#Let’s train a Decision Tree Regressor model

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fixed seed so the fitted tree is the same on every run.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X=train_set_full.to_numpy(), y=housing_labels)

Now evaluate the model on the training set
using Scikit-Learn’s mean_squared_error() function:

In [None]:
# Training-set RMSE of the decision tree.
housing_predictions = tree_reg.predict(train_set_full.to_numpy())
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

Explain this result:
An RMSE of exactly 0.0 on the training set means the model has memorized the training data, i.e. it suffers from overfitting. Here it is also helped by the target column still being present among the features.

In [None]:
# Evaluation using cross-validation:
# split the training set into 10 distinct folds, then train and evaluate the
# Decision Tree model on each of them.
from sklearn.model_selection import cross_val_score

# Make every column label a string (the one-hot columns carry integer labels).
train_set_full.columns = train_set_full.columns.astype(str)
scores = cross_val_score(
    estimator=tree_reg,
    X=train_set_full,
    y=housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Calculate the mean and standard deviation of the RMSE scores

In [None]:
# The scorer returns negated MSEs; flip the sign before taking the root.
rmse_scores = np.sqrt(-scores)
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()
print(mean_rmse)
print(std_rmse)

145.37412837767604
71.67629955871152


Repeat the same steps to compute the same scores for the Linear Regression model

notice the difference between the results of the two models

In [None]:
# 10-fold cross-validation of the linear model, summarized the same way as
# for the decision tree.
scores_lin = cross_val_score(
    estimator=lin_reg,
    X=train_set_full,
    y=housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
rmse_scores = np.sqrt(-scores_lin)
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()
print(mean_rmse)
print(std_rmse)

5.63018433705339e-11
2.3897247399865758e-11


Let’s train one last model the RandomForestRegressor.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same seed as the decision tree) so its scores and the model
# saved later are reproducible from run to run.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(train_set_full, housing_labels)

repeat the same steps to compute the same scores its Mean and Standard deviation for the Random Forest model

In [None]:
# Make every column label a string before handing the frame to scikit-learn.
train_set_full.columns = train_set_full.columns.astype(str)

# 10-fold cross-validation of the random forest.
scores = cross_val_score(
    estimator=forest_reg,
    X=train_set_full,
    y=housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
rmse_scores = np.sqrt(-scores)

# Mean and standard deviation of the per-fold RMSE.
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print("Mean RMSE:", mean_rmse)
print("Standard Deviation of RMSE:", std_rmse)

Mean RMSE: 77.53373133285115
Standard Deviation of RMSE: 64.58628952662657


Save every model you experiment with

In [None]:
import joblib
# Persist each fitted model to its own .pkl file (relative paths, so they land
# in the current working directory).
joblib.dump(forest_reg, 'random_forest_model.pkl')
joblib.dump(lin_reg, 'linear_regression_model.pkl')
joblib.dump(tree_reg, 'decision_tree_model.pkl')

['decision_tree_model.pkl']

In [None]:
# Call the method: without the parentheses the cell only shows the bound
# method object instead of the hyperparameter dictionary.
forest_reg.get_params()

In [None]:
# Count the missing values in each column of the training frame.
train_set.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

now you have a shortlist of promising models. You now need to
fine-tune them!
Fine-Tune Your Model
1- Grid Search
evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to try; every combination is evaluated
# (3 * 3 * 3 * 3 = 81 candidates).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the forest so candidates differ only by their hyperparameters and the
# search gives the same ranking on every run.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validated exhaustive search, scored by negated MSE.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available CPU cores
    verbose=2  # Increase verbosity
)

In [None]:
# Remove the target from the feature frame before the grid search.
# Rebinding with errors="ignore" keeps the cell idempotent: an in-place drop
# raises KeyError when the cell is run a second time.
train_set_full = train_set_full.drop(columns=["median_house_value"], errors="ignore")

In [None]:
# Run the grid search: every candidate in param_grid is fitted with 5-fold
# cross-validation on the training features.
grid_search.fit(train_set_full, housing_labels)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [None]:
# Show the cross-validated score of every hyperparameter combination.
cv_results = grid_search.cv_results_
mean_test_scores = cv_results["mean_test_score"]

# Scores are negated MSEs, so flip the sign before taking the root.
for candidate_idx, candidate_params in enumerate(cv_results["params"]):
    print("Mean RMSE:", np.sqrt(-mean_test_scores[candidate_idx]))
    print("Hyperparameters:", candidate_params)
    print("---")

Mean RMSE: 49348.165034415215
Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
---
Mean RMSE: 49134.25456081725
Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
---
Mean RMSE: 49152.93402472013
Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
---
Mean RMSE: 49451.27761103618
Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
---
Mean RMSE: 49220.67291471452
Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
---
Mean RMSE: 49212.9460425303
Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
---
Mean RMSE: 49589.822466248734
Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
---
Mean RMSE: 49655.877113815455
Hy

Analyze the Best Models and Their Errors
1-indicate the relative importance of each attribute

In [None]:
# Pull the feature importances out of the best estimator found by the search.
best_model = grid_search.best_estimator_
feature_names = train_set_full.columns.tolist()
importances = best_model.feature_importances_
# argsort is ascending; reverse it so the most important feature comes first.
sorted_indices = np.argsort(importances)[::-1]

display these importance scores next to their corresponding attribute names:

In [None]:
# List each feature next to its importance, most important first.
for feature_idx in sorted_indices:
    print(feature_names[feature_idx], ":", importances[feature_idx])

median_income : 0.4984689095333678
1 : 0.14547070838959686
longitude : 0.10615038609567272
latitude : 0.10162091862179835
housing_median_age : 0.0511305287252178
population : 0.029587578030545635
total_rooms : 0.021226141371798873
total_bedrooms : 0.02046493322811258
households : 0.015656349991654796
4 : 0.006038352832212843
0 : 0.0033859969895288656
3 : 0.0007509742918454006
2 : 4.8221898647210375e-05


In [None]:
# Re-create the split from the raw dataset. `housing` was rebound earlier to the
# training predictors (without `median_house_value`), so splitting it here would
# neither reproduce the original test set nor contain the target column.
train_set, test_set = train_test_split(load_housing_data(), test_size=0.2, random_state=42)
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


Now is the time to evaluate the final model on the test set.
Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set and run your full_pipeline to transform the data

In [None]:
# Split the test set into its target column and the remaining predictors.
test_set_labels = test_set["median_house_value"].copy()
test_set_predictors = test_set.drop(columns="median_house_value")

In [None]:
# Count the missing values in each test predictor column.
test_set_predictors.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [None]:
# Replace missing values with 0.
# NOTE(review): the numeric pipeline imputes with the median, not 0; confirm
# that a 0 fill is really what is wanted for the test set.
test_set_predictors = test_set_predictors.fillna(0)

In [None]:
# Run the already-fitted preprocessing pipeline on the test predictors
# (transform only; nothing is re-fitted on the test data).
transformed_test_set = full_pipeline.transform(test_set_predictors)

In [None]:
# One-hot encode the test categories with the encoder that was fitted on the
# training data (`ohe_train`). Re-fitting a new encoder on the test set can
# produce a different column layout than the one the model was trained on.
test_set_cat = test_set_predictors[['ocean_proximity']]
ohe_test_cat = ohe_train.transform(test_set_cat)
df_ohe_test_cat = pd.DataFrame(ohe_test_cat.toarray())

# reset_index gives the 0..n-1 index needed to join with the one-hot frame,
# without hard-coding the number of test rows.
test_set_predictors = test_set_predictors.drop(columns='ocean_proximity').reset_index(drop=True)
test_set_predictors_full = test_set_predictors.join(df_ohe_test_cat)

In [None]:
# Display the test predictors joined with their one-hot columns.
test_set_predictors_full

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,0,1,2,3,4
0,-119.01,36.06,25.0,1505.0,0.0,1392.0,359.0,1.6812,0.0,1.0,0.0,0.0,0.0
1,-119.46,35.14,30.0,2943.0,0.0,1565.0,584.0,2.5313,0.0,1.0,0.0,0.0,0.0
2,-122.44,37.80,52.0,3830.0,0.0,1310.0,963.0,3.4801,0.0,0.0,0.0,1.0,0.0
3,-118.72,34.28,17.0,3051.0,0.0,1705.0,495.0,5.7376,1.0,0.0,0.0,0.0,0.0
4,-121.93,36.62,34.0,2351.0,0.0,1063.0,428.0,3.7250,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4123,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,1.0,0.0,0.0,0.0,0.0
4124,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,0.0,0.0,0.0,0.0,1.0
4125,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,1.0,0.0,0.0,0.0,0.0
4126,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,0.0,1.0,0.0,0.0,0.0


In [None]:
# (rows, columns) of the final test feature frame.
test_set_predictors_full.shape

(4128, 13)

-evaluate the final model on the test set

In [None]:
from sklearn.metrics import mean_squared_error

# Score the best grid-search estimator on the held-out test set.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(test_set_predictors_full.to_numpy())
final_mse = mean_squared_error(y_true=test_set_labels.to_numpy(), y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse



48884.20412744111

compute a 95% confidence interval for the generalization error
using scipy.stats.t.interval():

In [None]:
import scipy.stats as stats

# Per-sample squared errors of the final model on the test set. The interval
# is built around their mean (the MSE) and converted to RMSE units at the end,
# so it describes the model instead of a hard-coded placeholder value.
squared_errors = (final_predictions - test_set_labels.values) ** 2

# Point estimate of the generalization error (test RMSE).
generalization_error = np.sqrt(squared_errors.mean())

# Sample size and degrees of freedom of the t distribution.
sample_size = len(squared_errors)
degrees_of_freedom = sample_size - 1

# Standard error of the mean squared error.
standard_error = stats.sem(squared_errors)

# 95% interval for the MSE, then the square root to express it as an RMSE.
confidence_interval = np.sqrt(
    stats.t.interval(
        0.95,
        degrees_of_freedom,
        loc=squared_errors.mean(),
        scale=standard_error,
    )
)

# Print the confidence interval
print("95% Confidence Interval:", confidence_interval)

95% Confidence Interval: (0.09694855438230848, 0.10305144561769153)
