In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [3]:
data = pd.read_csv('/content/data.csv')

In [4]:
# Display the first few rows of the dataset
print(data.head())

                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2014-05-02 00:00:00   313000.0       3.0       1.50         1340      7912   
1  2014-05-02 00:00:00  2384000.0       5.0       2.50         3650      9050   
2  2014-05-02 00:00:00   342000.0       3.0       2.00         1930     11947   
3  2014-05-02 00:00:00   420000.0       3.0       2.25         2000      8030   
4  2014-05-02 00:00:00   550000.0       4.0       2.50         1940     10500   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     1.5           0     0          3        1340              0      1955   
1     2.0           0     4          5        3370            280      1921   
2     1.0           0     0          4        1930              0      1966   
3     1.0           0     0          4        1000           1000      1963   
4     1.0           0     0          4        1140            800      1976   

   yr_renovated                    str

In [5]:
# Preprocessing
# Convert categorical columns to numerical ones if necessary (e.g., 'location')
if 'location' in data.columns:
    label_encoder = LabelEncoder()
    data['location'] = label_encoder.fit_transform(data['location'])

In [6]:
# check for missing values and handle them
data = data.dropna() # dropping rows with missing values

In [22]:
# feature and 'price' is the target variable
X = data[['sqft_living', 'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']]
y = data['price']


In [23]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
# Initialize the Linear Regression model
model = LinearRegression()

In [25]:
# Train the model on the training data
model.fit(X_train, y_train)

In [26]:
# Predict the prices for the test data
y_pred = model.predict(X_test)

In [27]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [28]:
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 988408597251.4451
R^2 Score: 0.030825960343034642


In [32]:
print("Predicted prices:", y_pred)

Predicted prices: [ 304559.89148819  325068.35592739 1069440.74757485  540490.06343661
  379627.01331517  599273.23261755  479347.78596903  415754.1608087
  507024.18239666  530978.15835431  674454.31471552  411788.39166424
  825346.86035728  414536.58595889  367031.61003125  702417.76853025
  679473.74304248  673135.71010706 1010573.84077922  872725.34604926
 1363273.68408337  633796.16470343  629898.57459228  468920.69575344
  162135.55163342  231193.31384184  664311.41679543  877242.41188907
  257347.38793392  971335.93446069 1869490.2075607   478635.97938703
 1278583.00227083  429546.36268729  180003.03432157  337837.67753487
  781955.37956445 1006087.01202825  234653.09930637  537039.3793855
  425382.23544961  239594.58436598  433000.2477333   374608.00839796
  317674.60557239  315567.03934141  465082.89283737  571905.85379593
  824031.72060553  454251.63336287 1205545.9709939   383098.69164824
  459671.38910299  633934.81349206  291498.16623262  351183.74813146
  969770.74549557 

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

In [67]:
# Load the dataset
data = pd.read_csv('/content/data.csv')

In [68]:
# Selecting features and target variable
features = data[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
                 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
                 'city']]
target = data['price']

In [69]:
# Encoding categorical variables (city)
label_encoder = LabelEncoder()
features['city'] = label_encoder.fit_transform(features['city'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['city'] = label_encoder.fit_transform(features['city'])


In [70]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [71]:
# Initializing the decision tree regressor model
model = DecisionTreeRegressor(random_state=42)

In [72]:
# Training the model
model.fit(X_train, y_train)

In [73]:
# Making predictions
y_pred = model.predict(X_test)

In [74]:
# Evaluating the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 224769.06300766303


In [75]:
print("Predicted prices:", y_pred)

Predicted prices: [ 300000.        235000.       1411600.        572500.
  289000.        499000.        530000.        268500.
  850000.        850000.        625000.        212700.
  650000.        300000.        274950.        819000.
  533112.        806000.       1075000.        865000.
  400000.        525000.        689800.        525000.
  329445.        206000.        749000.       1256500.
  437000.        695000.        195000.        439800.
 1800000.        298450.        440000.        648475.
  740000.             0.        108333.333333  630000.
  353500.        466800.        322200.        470000.
  310000.        330000.        251555.555556  525000.
 1256500.        569000.       1555000.        500000.
  673476.818182  415000.        108333.333333  436500.
  825000.        455000.        915000.        310000.
  672500.        602000.        310000.        875000.
  883000.        300000.        566000.        565000.
  632500.        174500.        835000.        

In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


In [82]:
data = pd.read_csv('/content/data.csv')


In [83]:
# Select relevant features for the model
features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated'
]
X = data[features]
y = data['price']


In [84]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
# Initialize and train the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

In [86]:
# Predict on the test set
y_pred = gbr.predict(X_test)

In [87]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Root Mean Squared Error (RMSE): 990492.25


In [91]:
print("Predicted prices:", y_pred )

Predicted prices: [ 419121.68280367  339749.344077   1030369.00423182  472347.59907717
  369607.6085747   578511.26405625  451655.04867718  445191.9326338
  462257.17390384  497924.08518565  659430.79981877  473008.28177954
  813586.76156298  733482.79566877  354071.14708148 1046754.57307255
  716098.59334136  573878.48152728  880268.8382072   777827.44301049
  782985.31356827  590669.53877303  600333.87234758  445994.62225145
  321656.56246385  257121.30061509  558133.81382474  913896.31537446
  416150.40648267  931229.1206311  2102367.7978396   458127.75722407
 1041081.29262209  446504.41783442  291819.54767525  385106.86433938
  727730.3443142  1002164.04491031  261147.93706971  537303.27315785
  415543.26058204  319658.63319284  469534.17475085  363498.30760121
  321751.16587799  331961.35935895  391343.09056389  601972.31024793
  933288.38362131  539999.5213529  1341894.44220734  503414.68998275
  420470.57602373  523512.23335393  237921.32009369  395129.59463417
  942713.83185603

In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [93]:
# Sample data
data = {
    'area': [1500, 2500, 1800, 3000, 1200],
    'bedrooms': [3, 4, 3, 5, 2],
    'location': ['New York', 'Los Angeles', 'New York', 'Chicago', 'Chicago'],
    'price': [400000, 600000, 500000, 650000, 300000]
}


In [94]:
# Creating DataFrame
df = pd.DataFrame(data)


In [95]:
# Separating features and target
X = df[['area', 'bedrooms', 'location']]
y = df['price'] # target

In [96]:
# One-hot encoding for the 'location' feature
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[['location']]).toarray()


In [97]:
# Creating a DataFrame with encoded features
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(['location']))


In [99]:
# Concatenating with the original features
X = pd.concat([X[['area', 'bedrooms']].reset_index(drop=True), X_encoded_df], axis=1)


In [100]:
# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [101]:
# Creating and training the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [102]:
# Predicting the prices on the test set
y_pred = model.predict(X_test)


In [103]:
# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5


In [104]:
print(f"Predicted prices: {y_pred}")
print(f"RMSE: {rmse}")

Predicted prices: [524500.]
RMSE: 75500.0
