In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import f_regression
import statsmodels.api as sm
import csv
import numpy as np

# --- Training data: load, merge rare districts, encode, scale ---
df = pd.read_csv('train.csv')

# Count rows per 'sub_area'; every area that appears fewer than 1000 times
# is folded into a single 'others' bucket.
category = df['sub_area'].value_counts()
others_col = [area for area, count in category.items() if count < 1000]
df['sub_area'] = df['sub_area'].replace(others_col, 'others')

# Dummy-encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, drop_first=True)

# 'price_doc' is the target; every remaining column is a feature.
y = df['price_doc']
X = df.drop(columns='price_doc')

# Fit the scaler on the training features and keep it for reuse on the
# test set. The transform returns a NumPy array, so X carries no column
# labels from this point on.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)



In [5]:
import pandas as pd

# Rebuild the scaled feature matrix as a DataFrame. No column names are
# passed, so the columns get the default integer labels 0..n-1; a label is
# therefore just the column's position in the scaled matrix.
X = pd.DataFrame(X)

# Fit an OLS model with an intercept term so every feature gets a p-value.
X_ols = sm.add_constant(X)
ols_model = sm.OLS(y, X_ols).fit()

# Remove the intercept by label instead of by position: a positional
# `[1:]` slice would silently discard the first real feature whenever
# add_constant does not prepend a 'const' column.
p_values = ols_model.pvalues.drop(labels='const', errors='ignore')

# Keep the features with the smallest p-values.
N_TOP_FEATURES = 30
top_features_names = list(p_values.sort_values().head(N_TOP_FEATURES).index)

# Slice those columns out of the scaled training features.
X_selected = X.loc[:, top_features_names]



In [6]:
# Display the feature columns kept by the p-value filter.
X_selected

Unnamed: 0,354,351,344,359,348,401,389,343,256,355,...,321,363,110,144,208,304,282,147,266,102
0,-0.076097,-0.076206,-0.076462,-0.077079,-0.077115,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,-0.414391,-0.407479,-0.335876,-0.07617,-0.076133,-0.675899,-0.07525,-0.419930
1,-0.076097,-0.076206,-0.076462,-0.077079,-0.077115,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,-0.589901,-0.407479,-0.251902,-0.07617,-0.076133,0.254320,-0.07525,-0.606944
2,-0.076097,-0.076206,-0.076462,-0.077079,-0.077115,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,-0.530935,-0.407479,-0.503823,-0.07617,-0.076133,-0.790128,-0.07525,-0.586413
3,-0.076097,-0.076206,-0.076462,-0.077079,-0.077115,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,-0.653523,0.636795,0.587833,-0.07617,-0.076133,-0.869655,-0.07525,-0.515697
4,-0.076097,-0.076206,-0.076462,-0.077079,-0.077115,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,-0.568333,-0.407479,0.587833,-0.07617,-0.076133,-1.030635,-0.07525,-0.774598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181502,-0.076097,-0.076206,-0.076462,-0.077079,12.967592,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,2.003325,-0.407479,-1.175611,-0.07617,-0.076133,2.425634,-0.07525,2.017905
181503,-0.076097,-0.076206,-0.076462,-0.077079,12.967592,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,2.003325,-0.407479,-1.175611,-0.07617,-0.076133,2.425634,-0.07525,2.017905
181504,-0.076097,-0.076206,-0.076462,-0.077079,12.967592,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,2.003325,-0.407479,-1.175611,-0.07617,-0.076133,2.425634,-0.07525,2.017905
181505,-0.076097,-0.076206,-0.076462,-0.077079,12.967592,-0.428284,-0.078121,-0.075545,-0.531788,-0.078264,...,-0.075913,-0.077799,2.003325,-0.407479,-1.175611,-0.07617,-0.076133,2.425634,-0.07525,2.017905


In [8]:
# Expand the selected features into polynomial terms up to `degree`.
degree = 2
poly = PolynomialFeatures(degree=degree).fit(X_selected)
X_poly = poly.transform(X_selected)

# Ordinary least squares on the expanded feature matrix. fit() returns the
# estimator itself; assigning it keeps the cell free of output.
reg = LinearRegression()
reg = reg.fit(X_poly, y)

In [14]:
# Load the test rows and drop the 'row ID' column before encoding.
df_test = pd.read_csv('test.csv')
df_test = df_test.drop(columns=['row ID'])

# Fold the same rare 'sub_area' values that were merged for training into
# 'others' (others_col was computed from the training data).
df_test['sub_area'] = df_test['sub_area'].replace(others_col, 'others')

# Dummy-encode the test set and force it into the exact column layout the
# scaler was fitted on. Encoding the test frame on its own with
# drop_first=True can yield different columns, a different column order, or
# a different dropped baseline level than training. Encoding every level and
# reindexing to the training columns avoids all three: levels missing from
# the test data become all-zero columns, and levels the training data never
# produced are discarded.
train_columns = getattr(scaler, 'feature_names_in_', None)
if train_columns is not None:
    df_test = pd.get_dummies(df_test).reindex(columns=train_columns, fill_value=0)
else:
    # The fitted scaler exposes no feature names (e.g. an older
    # scikit-learn), so fall back to encoding the test frame on its own.
    df_test = pd.get_dummies(df_test, drop_first=True)

# Scale with the scaler that was fitted on the training features; it is
# deliberately reused, not refitted, so both sets share one scale.
df_test = scaler.transform(df_test)

# top_features_names holds the integer column labels chosen by the p-value
# filter; they are used here as positional indices into the scaled array.
X_test_selected = df_test[:, top_features_names]

# Apply the polynomial expansion that was fitted on the training features.
df_test_poly = poly.transform(X_test_selected)


In [15]:
# Score the test rows with the fitted polynomial regression.
y_pred_test = reg.predict(df_test_poly)

# Write a header followed by one "row ID, price_doc" line per prediction.
# IDs are generated as 1..N in prediction order.
# NOTE(review): assumes this matches the 'row ID' values in test.csv -- confirm.
filepath = 'prediction_pvalue_filter_poly.csv'
with open(filepath, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['row ID', 'price_doc'])
    writer.writerows(enumerate(y_pred_test, start=1))