<a href="https://colab.research.google.com/github/IlyasNasirov/notebook/blob/main/Normativ3.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Location of the housing CSV read by this notebook.
DATA_URL = 'https://raw.githubusercontent.com/IlyasNasirov/datasets/refs/heads/main/housing.csv'

# Load the raw table and print its column/dtype/null summary.
df = pd.read_csv(DATA_URL)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Fill the missing `total_bedrooms` entries with the column mean.
# NOTE(review): this mean is taken over the whole table, before the
# train/test split further down, so test rows contribute to the fill value.
# Consider learning the fill value from the training split only.
bedrooms_mean = df['total_bedrooms'].mean()
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_mean)


In [4]:
# Re-print the summary to confirm `total_bedrooms` has no nulls after the fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# Separate the prediction target from the feature columns.
TARGET = 'median_house_value'
x = df.drop(columns=TARGET)
y = df[TARGET]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Split the feature columns into two groups by dtype so that each group can
# get its own preprocessing.
# Selecting on the generic 'number' kind (rather than the exact 'float64'
# dtype) keeps integer-typed numeric columns from being left out, and the
# second group is defined as "everything non-numeric" so that no feature
# column falls outside both groups. A column that is in neither group would be
# dropped by a ColumnTransformer with its default `remainder='drop'`.
numerical_features = x.select_dtypes(include='number').columns
print(numerical_features)
string_features = x.select_dtypes(exclude='number').columns
print(string_features)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')
Index(['ocean_proximity'], dtype='object')


In [7]:
# Standardizes numeric features.
sc = StandardScaler()

# One-hot encodes categorical features. With `handle_unknown='ignore'`, a
# category that was not present when the encoder was fitted is encoded as all
# zeros instead of raising an error at transform time. This matters for rare
# categories (e.g. 'ISLAND'), which may be missing from a given training split
# while still appearing in test or new data.
ohe = OneHotEncoder(handle_unknown='ignore')

In [8]:
# Numeric columns are mean-imputed and then scaled. Keeping the imputer inside
# the pipeline means its fill values are learned during `fit`, and the fitted
# (and later persisted) model can accept rows with missing numeric values at
# prediction time instead of failing on NaN.
numeric_steps = make_pipeline(SimpleImputer(strategy='mean'), sc)

# Route each column group through its own transformer and concatenate the
# results into one feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numerical_features),
        ('cat', ohe, string_features),
    ])

In [9]:
# Linear regression estimator, created with scikit-learn's default settings.
lr = LinearRegression()

In [10]:
# Chain preprocessing and the regressor into one estimator so both are fitted
# and applied together. The step names are the ones `make_pipeline` would
# generate automatically (lower-cased class names).
model = Pipeline(steps=[
    ('columntransformer', preprocessor),
    ('linearregression', lr),
])

In [11]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(x_train, y_train)

In [12]:
# Predict the target for the held-out test split.
preds = model.predict(x_test)

In [13]:
import numpy as np

# Root-mean-squared error on the test split: taking the square root of the
# MSE brings the error back to the units of the target.
mse = mean_squared_error(y_true=y_test, y_pred=preds)
rmse = np.sqrt(mse)
print('RMSE', rmse)

RMSE 70031.41991955665


In [14]:
import joblib

# Persist the fitted pipeline (preprocessing + regressor) to disk so it can be
# reloaded later without retraining.
MODEL_PATH = 'lr_house.joblib'
joblib.dump(model, MODEL_PATH)

['lr_house.joblib']

In [15]:
# Compare actual and predicted values on the test split, row by row, together
# with the absolute error of each prediction.
abs_error = abs(y_test.to_numpy() - preds)
results = pd.DataFrame({
    'Real_price.value': y_test,
    'Predicted': preds,
    'Difference': abs_error,
})
results.head()

Unnamed: 0,Real_price.value,Predicted,Difference
20046,47700.0,64629.450798,16929.450798
3024,45800.0,134799.340836,88999.340836
15663,500001.0,266063.381391,233937.618609
20484,218600.0,278576.433468,59976.433468
9814,278000.0,273343.436715,4656.563285


In [16]:
# One hypothetical input row, with a value for every feature column used to
# train the model (same column names as the training features).
house_record = {
    'longitude': -117.03,
    'latitude': 32.71,
    'housing_median_age': 2,
    'total_rooms': 4000,
    'total_bedrooms': 1000,
    'population': 3000,
    'households': 750,
    'median_income': 5,
    'ocean_proximity': 'NEAR BAY',
}

# Wrap the single record in a list to get a one-row DataFrame.
new_house = pd.DataFrame([house_record])

In [17]:
# Reload the pipeline saved earlier in this notebook and use it to price the
# new row. NOTE(review): joblib files are pickle-based, so only load model
# files that come from a trusted source.
loaded_model = joblib.load('lr_house.joblib')
# `predict` returns one value per input row; `new_house` has a single row.
new_price = loaded_model.predict(new_house)
print('Predicted price:', new_price[0])

Predicted price: 227077.7773797955


In [18]:
# Row count per `ocean_proximity` category; the output shows how unevenly the
# categories are represented (the rarest one has only a handful of rows).
df['ocean_proximity'].value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,9136
INLAND,6551
NEAR OCEAN,2658
NEAR BAY,2290
ISLAND,5
