In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
import csv



In [2]:

# Read the raw training table.
df = pd.read_csv('train.csv')

# Remember which columns have object dtype; the same column list is reused
# when the test set is encoded.
object_columns = df.select_dtypes(include='object').columns

# One-hot encode the object columns, keeping k-1 indicator columns per
# k-level column (the first level is dropped).
df = pd.get_dummies(df, columns=object_columns, drop_first=True)

# Separate the regression target from the predictor columns.
y = df['price_doc']
X = df.drop(columns='price_doc')

In [3]:

df_test = pd.read_csv('test.csv')

# One-hot encode the same categorical columns as the training set.
# drop_first is deliberately NOT used here: it removes whichever level sorts
# first *in this frame*, which can differ from the level dropped during
# training when a category is missing from the test data. That would silently
# encode a real category as the baseline. Instead every level is expanded and
# the reindex below keeps exactly the training columns.
df_test = pd.get_dummies(df_test, columns=object_columns)

# Align with the training feature columns: indicator columns the test set
# never produced are added and filled with 0, and columns the training set
# does not have (including the level dropped during training) are discarded.
df_test = df_test.reindex(columns=X.columns, fill_value=0)

In [4]:
# Standardize the features before PCA. PCA centers its input but does not
# rescale it, so on raw data the components are driven by the columns with the
# largest numeric variance rather than by structure in the data. The scaler is
# fitted on the training features only and then reused for the test set so
# both are transformed identically.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_test_scaled = scaler.transform(df_test)

# Apply Incremental PCA
n_components = 10  # Adjust the number of components as needed
batch_size = 50  # Adjust the batch size as needed
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
X_pca = ipca.fit_transform(X_scaled)
df_test_pca = ipca.transform(df_test_scaled)

In [5]:

# Regression on the PCA projection. It is not used by the cells visible here;
# it is kept in case other cells rely on `reg`.
reg = LinearRegression().fit(X_pca, y)

# Rank the original features by their contribution to the retained principal
# components and keep the 20 strongest.
#
# Each feature is scored by its absolute loading on every component, weighted
# by that component's share of explained variance, summed over components.
# Absolute values are used because a large negative loading is as important as
# a large positive one. Scoring per feature (instead of taking the top 20 of
# every component and flattening) guarantees at most 20 distinct columns.
n_top_features = 20
component_weights = ipca.explained_variance_ratio_[:, None]
feature_scores = pd.Series(
    (abs(ipca.components_) * component_weights).sum(axis=0),
    index=X.columns,
)
top_20_columns = feature_scores.nlargest(n_top_features).index

# Select the top 20 features for both the training and test sets
X_top_20 = X[top_20_columns]
df_test_top_20 = df_test[top_20_columns]


In [6]:
# Fit a linear regression on the selected feature columns, then predict the
# target for the test rows.
reg_top_20 = LinearRegression()
reg_top_20.fit(X_top_20, y)
y_pred_test_top_20 = reg_top_20.predict(df_test_top_20)

# Write the predictions to CSV: a header line followed by one
# (row ID, price_doc) line per prediction, with row IDs counting up from 1.
filepath = 'prediction_top_20_features_pca.csv'
with open(filepath, mode='w', newline='') as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(['row ID', 'price_doc'])
    csv_writer.writerows(
        [row_id, predicted_price]
        for row_id, predicted_price in enumerate(y_pred_test_top_20, start=1)
    )