In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt

In [2]:
# Load the raw training and test tables from the working directory.
df1 = pd.read_csv("train.csv")
df2 = pd.read_csv("test.csv")

# 'sub_area' is removed from both tables; 'row ID' is removed from the test
# table only (it is not dropped from the training table).
df1 = df1.drop(columns=['sub_area'])
df2 = df2.drop(columns=['sub_area', 'row ID'])

# One-hot encode the training table (dropping the first level of every
# categorical column) and discard exact duplicate rows. Reassigning instead of
# using inplace=True keeps the cell safe to re-run.
df1_encoded = pd.get_dummies(df1, drop_first=True)
df1_encoded = df1_encoded.drop_duplicates()

# Feature matrix and target for training.
X = df1_encoded.drop(columns=['price_doc'])
y = df1_encoded['price_doc']

# Encode the test table WITHOUT drop_first and then project it onto the
# training columns. Encoding the two tables independently with drop_first=True
# can drop a different baseline level in each table and can produce a different
# column set/order, which would silently misalign every downstream transformer.
# Reindexing keeps exactly the training columns, in training order; levels that
# never occur in the test table become all-zero columns and levels unseen in
# training are discarded.
# NOTE(review): a non-categorical training column that is absent from test.csv
# would also be filled with 0 here -- confirm both files share the same schema.
df2_encoded = pd.get_dummies(df2).reindex(columns=X.columns, fill_value=0)

In [3]:
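# Disabled preprocessing (mean imputation and min-max scaling), kept for reference.
# NOTE: if this is re-enabled, the test data must be passed through `transform`
# using the imputer/scaler fitted on the training data; the `fit_transform`
# calls on `df2_encoded` below would refit the transformers on the test set.
# imputer = SimpleImputer(strategy='mean')
# X = imputer.fit_transform(X)
# df2_encoded = imputer.fit_transform(df2_encoded)

# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X)
# df2_encoded_scaled = scaler.fit_transform(df2_encoded)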


In [4]:
# Remove low-variance columns. The filter is fitted on the training features
# only, and the same column mask is then applied to both train and test.
variance_threshold = 0.001  # columns with variance at or below this are dropped
selector = VarianceThreshold(threshold=variance_threshold)
selector.fit(X)
X_train_high_variance = selector.transform(X)
X_test_high_variance = selector.transform(df2_encoded)

In [5]:
# Reduce the variance-filtered features to 15 principal components. The
# projection is learned from the training matrix and reused as-is for the test
# matrix. random_state pins the result when PCA picks a randomized SVD solver,
# so a Restart & Run All reproduces the same components.
pca = PCA(n_components=15, random_state=0)
X_train_pca = pca.fit_transform(X_train_high_variance)
X_test_pca = pca.transform(X_test_high_variance)

# Expand the components with all degree-2 terms (squares and pairwise
# products); include_bias=False omits the constant column.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train_pca)
# Only transform the test matrix: the expansion fitted on the training data
# must be reused rather than refitted on test data.
X_test_poly = poly_features.transform(X_test_pca)


In [6]:
# Forward sequential feature selection using mlxtend's SequentialFeatureSelector
# (despite the "kbest" variable names, this is not sklearn's SelectKBest).
# Candidate subsets are scored with a LinearRegression under 3-fold
# cross-validation on negative mean squared error.
num_features = 20  # number of polynomial features to keep
selector_kbest = SequentialFeatureSelector(
    LinearRegression(),
    k_features=num_features,
    forward=True,
    scoring='neg_mean_squared_error',
    cv=3,
)
selector_kbest.fit(X_train_poly, y)
X_train_kbest = selector_kbest.transform(X_train_poly)
X_test_kbest = selector_kbest.transform(X_test_poly)


In [7]:
# Fit an ordinary least-squares regression on the selected training features.
model = LinearRegression()
model.fit(X=X_train_kbest, y=y)

In [8]:
# Disabled earlier draft of the evaluation/submission step; the next cell is the live version.
# Make predictions on the training set for evaluation (you can adjust this part as needed)
# pred_train = model.predict(X_train_kbest)

# Print RMSE on the training set for evaluation
# rmse_train = sqrt(mean_squared_error(y, pred_train))
# print("RMSE on Training Set:", rmse_train)

# Generate predictions for the test set
# pred_test = model.predict(X_test_kbest)

# print(pred_test)

# df_sample = pd.read_csv("sample_submission.csv")
# df_sample['price_doc'] = pred_test
# df_sample.to_csv("predictions_25nov.csv", index=False)

In [9]:
# In-sample check: predict on the same rows the model was trained on and
# report the root-mean-squared error. This is a training-set score, not a
# held-out estimate.
pred_train = model.predict(X_train_kbest)
rmse_train = sqrt(mean_squared_error(y_true=y, y_pred=pred_train))
print("RMSE on Training Set:", rmse_train)

# Predict the target for the test features and show the resulting array.
pred_test = model.predict(X_test_kbest)
print(pred_test)

# Load the submission template, overwrite its 'price_doc' column with the test
# predictions, and write the result out without the index column.
df_sample = pd.read_csv("sample_submission.csv").assign(price_doc=pred_test)
df_sample.to_csv("predictions_25nov.csv", index=False)


RMSE on Training Set: 14122387.247697541
[ 5165232.37270698 10493570.87437429  5554108.97216425 ...
  8089664.48004596  8089664.48004596  8089664.48004596]
