In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.neighbors import KNeighborsRegressor


In [28]:
#Linear regression (including polynomial and regularized regression)

In [29]:
data_train = pd.read_csv('train_clean.csv')

In [30]:
selected_features = [col for col in data_train.columns if col not in ['price']]

X = data_train[selected_features]

In [31]:
y = data_train['price']

In [32]:
categorical_features = ['Id','name','host_response_time',
    'room_type',
    'property_type',
    'neighbourhood_cleansed']

numeric_features = [col for col in X.columns if col not in categorical_features]

numeric_transformer = Pipeline(
    steps=[
    ('impute', SimpleImputer()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('standardize', StandardScaler())
]
)


categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ('select_percentile', SelectPercentile(score_func=f_regression, percentile=50))
])

In [33]:
# Define numeric transformer
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # You can choose strategy as 'mean' or 'median'
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=1))
])

# Define categorical transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),  # numeric_features should be defined
        ('cat', categorical_transformer, categorical_features)  # categorical_features should be defined
    ]
)

# Define KNeighborsRegressor model
model = KNeighborsRegressor(n_neighbors=4, weights='distance')

# Define pipeline
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=999)

In [35]:
params = {'model__n_neighbors': 4,
 'model__weights': 'distance',
 'preprocessor__num__imputer__strategy': 'mean',
 'preprocessor__num__poly__degree': 1}

pipeline.fit(X_train, y_train)

In [39]:
#Use the optimized pipeline to get predictions for the test data
#best_estimator = grid_search.best_estimator_
# Fit the best estimator to the training data
#best_estimator.fit(X_train, y_train)
yhat = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 193.62370448765583


In [40]:
data_test = pd.read_csv('test_clean.csv')

Unnamed: 0,Id,name,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,...,has_iron,has_shampoo,has_fire_extinguisher,has_microwave,has_dishes_and_silverware,has_bed_linens,has_refrigerator,has_workspace,has_cooking_basics,bath_count
0,PSJEN,Home in San Francisco · ★4.81 · 1 bedroom · 1 ...,170765678,within an hour,1.0,1.00,1.0,2.0,1,1,...,1,1,1,1,1,1,1,1,0,1
1,PVZW7,Rental unit in South San Francisco · 1 bedroom...,107434423,within an hour,1.0,0.97,0.0,4850.0,1,1,...,0,0,0,0,0,0,0,0,0,1
2,EJLAM,Rental unit in San Francisco · 1 bedroom · 1 b...,21994,within a few hours,0.0,0.88,1.0,15.0,1,1,...,0,0,1,1,0,0,1,1,0,35
3,SDHPB,Rental unit in San Jose · ★4.58 · 1 bedroom · ...,17827419,within an hour,1.0,0.98,0.0,2.0,1,1,...,0,0,0,0,0,0,0,0,0,1
4,MJGYX,Home in South San Francisco · ★4.59 · 3 bedroo...,22009135,within an hour,1.0,0.82,0.0,1.0,1,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6542,2TAJ9,Townhouse in La Selva Beach · ★New · 3 bedroom...,62542564,within an hour,1.0,0.96,0.0,30.0,1,1,...,0,0,0,0,0,0,0,0,0,3
6543,YFHBN,Home in San Jose · 1 bedroom · 1 bed · 1 bath,57270062,within a few hours,0.0,0.00,0.0,1.0,1,0,...,0,0,0,0,0,0,0,0,0,1
6544,Z0TCA,Home in San Jose · 1 bedroom · 1 shared bath,97774930,within a few hours,0.0,0.00,0.0,1.0,1,1,...,0,0,0,0,0,0,0,0,0,1
6545,VBCBJ,Cabin in Boulder Creek · ★4.66 · 3 bedrooms · ...,244660735,within a day,1.0,0.94,0.0,1.0,1,1,...,0,0,0,0,0,0,0,0,0,1


In [41]:
predictions = pipeline.predict(data_test)

In [42]:
results = pd.DataFrame({'Id': data_test['Id'], 'price': predictions})
results

Unnamed: 0,Id,price
0,PSJEN,185.199507
1,PVZW7,183.408228
2,EJLAM,55.258974
3,SDHPB,170.387032
4,MJGYX,479.936645
...,...,...
6542,2TAJ9,667.164356
6543,YFHBN,1134.043043
6544,Z0TCA,634.199441
6545,VBCBJ,272.820443


In [43]:
results.to_csv('predictions_with_ids.csv', index=False, header=['id', 'price'])