In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adamax
from keras.callbacks import EarlyStopping

# Load data
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Separate features and target
X = df.drop('price_doc', axis=1)
y = df['price_doc']

# Combine train and test for consistent preprocessing
combined_data = pd.concat([X, df_test], axis=0)

# One-hot encoding for categorical variables
object_cols = combined_data.select_dtypes(include=['object']).columns
combined_data = pd.get_dummies(combined_data, columns=object_cols, drop_first=True)

# Variance filter to remove low-variance features
variance_filter = VarianceThreshold(threshold=0.01)
combined_data = pd.DataFrame(variance_filter.fit_transform(combined_data), columns=combined_data.columns[variance_filter.get_support()])

# Split combined_data back into train and test
X = combined_data[:len(X)]
df_test = combined_data[len(X):]

# Feature scaling
scaler = RobustScaler()
X = scaler.fit_transform(X)
df_test = scaler.transform(df_test)

MemoryError: Unable to allocate 3.72 GiB for an array with shape (1926, 259296) and data type float64

In [None]:
# Dimensionality reduction with PCA
pca = PCA(n_components=15)
X = pca.fit_transform(X)
df_test = pca.transform(df_test)

# Polynomial features
poly = PolynomialFeatures(degree=3)
X = poly.fit_transform(X)
df_test = poly.transform(df_test)


In [None]:
# Model
n_features = X.shape[1]
model = Sequential()
model.add(Dense(150, input_dim=n_features, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(80, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(35, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))
model.compile(optimizer=Adamax(learning_rate=0.01), loss='mean_squared_error')



In [None]:
# Model training
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
model.fit(X, y, epochs=100, batch_size=256, callbacks=[early_stopping])

# Prediction
y_pred = model.predict(df_test)

# Save predictions to a CSV file
result_df = pd.DataFrame({'row ID': range(1, len(y_pred) + 1), 'price_doc': y_pred.flatten()})
result_df.to_csv('neural_29nov_3_improved.csv', index=False)