In [17]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.feature_selection import SequentialFeatureSelector
import csv

In [18]:
# Load the raw training and test data.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# The training frame decides which columns are categorical.
object_columns = df.select_dtypes(include=['object']).columns

# Pin every categorical column to the category set observed in the TRAINING
# data. Encoding train and test independently with drop_first=True lets each
# frame drop its own "first" level, so a test set that lacks the training
# baseline level would have a different level dropped and those rows would
# end up encoded as the baseline. With a shared CategoricalDtype both frames
# yield the same dummy columns and drop the same baseline; levels that only
# occur in the test set become NaN and are encoded as all zeros.
category_dtypes = {
    col: pd.CategoricalDtype(df[col].astype('category').cat.categories)
    for col in object_columns
}
df = df.astype(category_dtypes)
df_test = df_test.astype(category_dtypes)

# Use get_dummies for one-hot encoding (identical columns for both frames)
df = pd.get_dummies(df, columns=object_columns, drop_first=True)
df_test = pd.get_dummies(df_test, columns=object_columns, drop_first=True)

# Split the encoded training frame into features and target.
X = df.drop('price_doc', axis=1)
y = df['price_doc']


In [25]:
# Make sure the columns in the test set match the columns in the training set:
# columns missing from the test frame are added as zeros, extra ones are
# dropped, and the order follows the training features.
df_test = df_test.reindex(columns=X.columns, fill_value=0)

# Standardise the features before PCA. PCA looks for directions of maximum
# variance, so on raw data the components are dominated by whichever columns
# have the largest numeric range. The scaler is fitted on the training data
# only and reused for the test set so no test statistics leak into the fit.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_test_scaled = scaler.transform(df_test)

# Apply Incremental PCA
n_components = 30  # Adjust the number of components as needed
batch_size = 100  # Adjust the batch size as needed
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
X_pca = ipca.fit_transform(X_scaled)
df_test_pca = ipca.transform(df_test_scaled)

In [26]:
# Forward sequential feature selection over the PCA components: components
# are added one at a time until 15 have been chosen, with a plain linear
# regression as the estimator used to evaluate each candidate set.
reg = LinearRegression()
sfs = SequentialFeatureSelector(
    estimator=reg,
    n_features_to_select=15,
    direction='forward',
)
sfs.fit(X_pca, y)

In [27]:
# Reduce both the training and the test components to the subset chosen by
# the fitted selector.
# NOTE(review): the "top_20" suffix does not match the selector, which is
# configured with n_features_to_select=15 — confirm which number is intended.
X_top_20, df_test_top_20 = (
    sfs.transform(components) for components in (X_pca, df_test_pca)
)

In [28]:
# Fit the final linear regression using only the selected components.
reg_top_20 = LinearRegression().fit(
    X=X_top_20,
    y=y,
)


In [29]:
# Score the test set with the final model.
y_pred_test_top_20 = reg_top_20.predict(X=df_test_top_20)

In [30]:
# Export the predictions as a two-column CSV with 1-based row identifiers.
filepath = 'prediction_top_20_features_forward_selection.csv'
with open(filepath, mode='w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['row ID', 'price_doc'])  # header row
    # enumerate yields (row_id, prediction) pairs, one CSV row each
    writer.writerows(enumerate(y_pred_test_top_20, start=1))