In [52]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import csv

df = pd.read_csv('train.csv') 
object_columns = df.select_dtypes(include=['object']).columns

# Use get_dummies for one-hot encoding
df = pd.get_dummies(df, columns=object_columns, drop_first=True)

X = df.drop('price_doc', axis=1)
y = df['price_doc']

In [53]:
df_test = pd.read_csv('test.csv')
# Apply the same one-hot encoding to the test set
df_test = pd.get_dummies(df_test, columns=object_columns, drop_first=True)

df_test = df_test.drop(['row ID'], axis=1)

# Make sure the columns in the test set match the columns in the training set
df_test = df_test.reindex(columns=X.columns, fill_value=0)

reg = LinearRegression().fit(X, y)

In [54]:
# Get feature importances
feature_importances = reg.coef_

# Get the column names of the top 20 features
top_20_columns = X.columns[feature_importances.argsort()[-20:][::-1]]
top_20_columns

Index(['sub_area_Poselenie Krasonpahorskoe', 'sub_area_Poselenie ePrvomajskoe',
       'sub_area_Poselenie Vosrkesenskoe', 'sub_area_Tropaervo-Nikulino',
       'sub_area_Vsotochnoe Degunino', 'sub_area_Severnoe Tushnio',
       'sub_area_Poselenie Vnkuovskoe', 'sub_area_Poeslenie Filimonkovskoe',
       'sub_area_Birjulveo Vostochnoe', 'sub_area_Chertanoov Severnoe',
       'sub_area_Zjablkiovo', 'sub_area_Poselenie Filiomnkovskoe',
       'sub_area_Poselenei Novofedorovskoe', 'sub_area_Psoelenie Desjonovskoe',
       'sub_area_Prospekt Vernadskoog', 'sub_area_Zapadnoe Degunion',
       'sub_area_Poeslenie Klenovskoe', 'sub_area_Jraoslavskoe',
       'sub_area_Poselenie Pevromajskoe', 'sub_area_Poseleni eVoronovskoe'],
      dtype='object')

In [55]:
# Select the top 20 features for both the training and test sets
X_top_20 = X[top_20_columns]
df_test_top_20 = df_test[top_20_columns]



In [56]:
# Train the model on the top 20 features
reg_top_20 = LinearRegression().fit(X_top_20, y)
y_pred_test_top_20 = reg_top_20.predict(df_test_top_20)

In [57]:
filepath = 'prediction_top_20_features.csv'
with open(filepath, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['row ID', 'price_doc'])  # Add column headers
    for c, i in enumerate(y_pred_test_top_20, start=1):
        writer.writerow([c, i])