In [35]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA


In [42]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

numeric_cols = [col for col in train_df.select_dtypes(include=[np.number]).columns if col != 'pSat_Pa']

X_scaled = StandardScaler().fit_transform(train_df[numeric_cols])
test_df_scaled = StandardScaler().fit_transform(test_df[numeric_cols])

pca = PCA(n_components=0.97)
X_pca = pca.fit_transform(X_scaled)
test_pca = pca.transform(test_df_scaled)

X_pca = sm.add_constant(X_pca)

train_df['log_pSat_Pa'] = np.log10(train_df['pSat_Pa'])

X_train, X_test, y_train, y_test = train_test_split(X_pca, train_df['log_pSat_Pa'], test_size=0.2, random_state=42)

glm_model = sm.GLM(y_train, X_train, family=sm.families.Gaussian())
glm_results = glm_model.fit()

glm_predictions = glm_results.predict(X_test)

glm_r2 = r2_score(y_test, glm_predictions)
print("GLM R^2 Score: ", glm_r2)

glm_test_predictions = glm_results.predict(sm.add_constant(test_pca))

glm_submission = pd.DataFrame({'Id': test_df['Id'], 'target': glm_test_predictions})

glm_submission.to_csv('glm_submission.csv', index=False)

GLM R^2 Score:  0.6923539014404261
