In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # For train/test splits
from sklearn.neighbors import KNeighborsClassifier  # The k-nearest neighbor classifier
from sklearn.pipeline import Pipeline  # For setting up pipeline

# Various pre-processing steps
from sklearn.preprocessing import (
    Normalizer,
    StandardScaler,
    MinMaxScaler,
    MaxAbsScaler,
    FunctionTransformer,
)
from sklearn.model_selection import GridSearchCV  # For optimization
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import statsmodels.formula.api as sm

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
avg_rent = pd.read_csv("avg_rent.csv")
dist_city_center = pd.read_csv("dist_from_city_centre.csv")

In [3]:
train_df = pd.merge(train_df, avg_rent, how="left", on="location")
train_df = pd.merge(train_df, dist_city_center, how="left", on="location")

In [4]:
test_df = pd.merge(test_df, avg_rent, on="location", how="left")
test_df = pd.merge(test_df, dist_city_center, on="location", how="left")

In [5]:
train_df.isnull().sum()

ID                   0
area_type            0
availability         0
location             1
size                14
society           4428
total_sqft           0
bath                65
balcony            504
price                0
avg_2bhk_rent     6991
dist_from_city    1026
dtype: int64

In [6]:
numerical_col = train_df.select_dtypes(include=np.number).drop(["ID"], axis=1)
category_col = train_df.select_dtypes(include="object")
print(numerical_col.columns, category_col.columns)

Index(['bath', 'balcony', 'price', 'avg_2bhk_rent', 'dist_from_city'], dtype='object') Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft'],
      dtype='object')


In [7]:
train_df["location"].fillna(method="ffill", inplace=True)
train_df["size"].fillna(method="ffill", inplace=True)
train_df["society"].fillna(method="ffill", inplace=True)
train_df["bath"].fillna(train_df["bath"].mean(), inplace=True)
train_df["balcony"].fillna(train_df["balcony"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['location'].fillna(method='ffill', inplace=True)
  train_df['location'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['size'].fillna(method='ffill',inplace=True)
  train_df['size'].fillna(method='ffill',inplace=True)
The be

In [8]:
train_df.isnull().sum()

ID                   0
area_type            0
availability         0
location             0
size                 0
society              0
total_sqft           0
bath                 0
balcony              0
price                0
avg_2bhk_rent     6991
dist_from_city    1026
dtype: int64

In [9]:
train_df["avg_2bhk_rent"].fillna(train_df["avg_2bhk_rent"].mean(), inplace=True)
train_df["dist_from_city"].fillna(train_df["dist_from_city"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['avg_2bhk_rent'].fillna(train_df['avg_2bhk_rent'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['dist_from_city'].fillna(train_df['dist_from_city'].mean(), inplace=True)


In [10]:
train_df.isnull().sum()

ID                0
area_type         0
availability      0
location          0
size              0
society           0
total_sqft        0
bath              0
balcony           0
price             0
avg_2bhk_rent     0
dist_from_city    0
dtype: int64

In [11]:
train_df.head()

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price,avg_2bhk_rent,dist_from_city
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,11500.0,19.3
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,15363.440109,34.6
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,Theanmp,1440,2.0,3.0,62.0,19750.0,12.9
3,3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,15363.440109,21.4
4,4,Super built-up Area,Ready To Move,Kothanur,2 BHK,Soiewre,1200,2.0,1.0,51.0,16375.0,11.8


In [12]:
"""bhk_size = train_df['size'].apply(lambda x:x.split(' ')[0])
train_df.insert(9,'bhk_size',bhk_size)
train_df.drop(['size'],axis=1, inplace = True)
train_df.head()"""

"bhk_size = train_df['size'].apply(lambda x:x.split(' ')[0])\ntrain_df.insert(9,'bhk_size',bhk_size)\ntrain_df.drop(['size'],axis=1, inplace = True)\ntrain_df.head()"

In [13]:
def pre_process(x):
    try:
        return float(x)
    except:
        if "-" in x:
            tokens = x.split("-")
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
        else:
            return None


train_df["total_sqft"] = train_df["total_sqft"].apply(pre_process)
train_df["total_sqft"] = pd.to_numeric(train_df["total_sqft"], errors="coerce")

In [14]:
train_df.head()

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price,avg_2bhk_rent,dist_from_city
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07,11500.0,19.3
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.0,15363.440109,34.6
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,Theanmp,1440.0,2.0,3.0,62.0,19750.0,12.9
3,3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.0,15363.440109,21.4
4,4,Super built-up Area,Ready To Move,Kothanur,2 BHK,Soiewre,1200.0,2.0,1.0,51.0,16375.0,11.8


In [15]:
X = train_df.drop(columns=["ID", "price", "society"])
y = train_df["price"]

In [16]:
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import (
    LogisticRegression,
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet,
)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [17]:
numeric_processor = Pipeline(
    steps=[
        ("imputation", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

categorical_processor = Pipeline(
    steps=[
        (
            "impotation_constant",
            SimpleImputer(fill_value="missing", strategy="constant"),
        ),
        ("OneHotEncoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [18]:
numeric_features = np.array(
    ["total_sqft", "bath", "balcony", "dist_from_city", "avg_2bhk_rent"]
)

categorical_features = ["area_type", "availability", "location", "size"]

In [19]:
pre_processing = ColumnTransformer(
    [
        ("numerical", numeric_processor, numeric_features),
        ("categorical", categorical_processor, categorical_features),
    ]
)
pre_processing

In [20]:
from sklearn.pipeline import make_pipeline

In [21]:
final_pipe_lasso = make_pipeline(pre_processing, Lasso(alpha=0.2, random_state=42))
final_pipe_lasso

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

In [23]:
final_pipe_lasso.fit(X_train, y_train)

In [24]:
final_pipe_linear = make_pipeline(pre_processing, LinearRegression())

In [25]:
final_pipe_linear.fit(X_train, y_train)
y_pred_linear = final_pipe_linear.predict(X_test)
final_pipe_linear

In [26]:
final_pipe_ridge = make_pipeline(pre_processing, Ridge(alpha=0.2, random_state=42))
final_pipe_ridge.fit(X_train, y_train)
y_pred_ridge = final_pipe_ridge.predict(X_test)
final_pipe_ridge

In [27]:
final_pipe_elastic = make_pipeline(pre_processing, ElasticNet())
final_pipe_elastic.fit(X_train, y_train)
y_Pred_elastic = final_pipe_elastic.predict(X_test)
final_pipe_elastic

In [28]:
y_pred_lasso = final_pipe_lasso.predict(X_test)

In [29]:
y_pred = final_pipe_lasso.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Validation RMSE using Lasso: {rmse}")

Validation RMSE using Lasso: 102.37119635315706


In [30]:
test_df.head()

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,avg_2bhk_rent,dist_from_city
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,,650,1.0,1.0,15875.0,6.7
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,SrncyRe,1370,2.0,1.0,,11.0
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,AjhalNa,1725,3.0,2.0,14981.0,17.3
3,3,Built-up Area,Ready To Move,Jalahalli,2 BHK,,1000,2.0,0.0,11000.0,16.6
4,4,Plot Area,Ready To Move,TC Palaya,1 Bedroom,,1350,1.0,0.0,,12.2


In [31]:
def pre_process(x):
    try:
        return float(x)
    except:
        if "-" in x:
            tokens = x.split("-")
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
        else:
            return None


test_df["total_sqft"] = train_df["total_sqft"].apply(pre_process)
test_df["total_sqft"] = pd.to_numeric(test_df["total_sqft"], errors="coerce")

In [32]:
X_test = test_df.drop(columns=["ID", "society"])
test_df["price"] = final_pipe_lasso.predict(X_test)

In [33]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print(f"Validation RMSE using Lasso: {rmse}")

Validation RMSE using Lasso: 102.37119635315706


In [34]:
submission_df = test_df[["ID", "price"]]
submission_df.to_csv("Lasso_reg_submission.csv", index=False)

In [35]:
result = pd.read_csv("Lasso_reg_submission.csv")

In [36]:
result.head()

Unnamed: 0,ID,price
0,0,55.390231
1,1,130.409241
2,2,97.828031
3,3,60.072572
4,4,82.7731


In [37]:
X_test = test_df.drop(columns=["ID", "price"])
test_df["price"] = final_pipe_linear.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_linear))
print(f"Validation RMSE using Linear: {rmse}")

Validation RMSE using Linear: 106.23918890692856


In [38]:
submission_df = test_df[["ID", "price"]]
submission_df.to_csv("Linear_reg_submission.csv", index=False)
result1 = pd.read_csv("Linear_reg_submission.csv")
result1.head()

Unnamed: 0,ID,price
0,0,184.045418
1,1,111.616473
2,2,102.766933
3,3,59.258397
4,4,32.8303


In [39]:
X_test = test_df.drop(columns=["ID", "price"])
test_df["price"] = final_pipe_elastic.predict(X_test)

rmse - np.sqrt(mean_squared_error(y_test, y_Pred_elastic))
print(f"validation of elasticnet : {rmse}")

validation of elasticnet : 106.23918890692856


In [40]:
submission_df = test_df[["ID", "price"]]
submission_df.to_csv("elastic_reg_submission.csv", index=False)

In [41]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
print(f"Validation RMSE using ridge: {rmse}")

X_test = test_df.drop(columns=["ID", "society"])
test_df["price"] = final_pipe_ridge.predict(X_test)


submission_df = test_df[["ID", "price"]]
submission_df.to_csv("ridge_reg_submission.csv", index=False)

Validation RMSE using ridge: 105.26533856224576


In [42]:
final_pipe_grad = make_pipeline(
    pre_processing, GradientBoostingRegressor(n_estimators=100, random_state=42)
)

final_pipe_grad.fit(X_train, y_train)

y_pred = final_pipe_grad.predict(X_test)
final_pipe_grad

In [43]:
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# print(f'Validation RMSE using ridge: {rmse}')

X_test = test_df.drop(columns=["ID", "society"])
test_df["price"] = final_pipe_grad.predict(X_test)


submission_df = test_df[["ID", "price"]]
submission_df.to_csv("grad_reg_submission.csv", index=False)

In [44]:
final_pipe_random = make_pipeline(
    pre_processing, RandomForestRegressor(n_estimators=100, random_state=42)
)

final_pipe_random.fit(X_train, y_train)

y_pred_random = final_pipe_random.predict(X_test)
final_pipe_random

In [46]:
X_test = test_df.drop(columns=["ID", "society"])
test_df["price"] = final_pipe_random.predict(X_test)


submission_df = test_df[["ID", "price"]]
submission_df.to_csv("random_reg_submission.csv", index=False)

In [47]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_random))
print(f"Validation RMSE using ridge: {rmse}")

ValueError: Found input variables with inconsistent numbers of samples: [1599, 2664]