In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Load the training and testing datasets
train_dt = pd.read_csv('train.csv')
test_dt = pd.read_csv('test.csv')



In [2]:
# Remove ids, the only feature not needed
train_dt = train_dt.drop('Id', axis=1)

original_ids = test_dt['Id']
test_dt = test_dt.drop('Id', axis=1)

# X contains features, and y contains the target variable
X = train_dt.drop('SalePrice', axis=1)
y = train_dt['SalePrice']

# Identify the columns present in X but not in test_dt
extra_columns = set(X.columns) - set(test_dt.columns)

# Drop the extra columns from train_dt
X = X.drop(extra_columns, axis=1)

print("Number of columns for X:", X.shape[1])
print("Number of columns for test_dt:", test_dt.shape[1])


Number of columns for X: 79
Number of columns for test_dt: 79


In [3]:
# Combine train and test datasets for on-hot encoding
combined_data = pd.concat([X, test_dt], axis=0, ignore_index=True)

# Identify columns with non-numerical values in the combined dataset
categorical_columns = combined_data.select_dtypes(include=['object']).columns

# Apply one-hot encoding to the combined dataset
combined_data = pd.get_dummies(combined_data, columns=categorical_columns, prefix=categorical_columns)

# Split the combined data back into train and test datasets
X = combined_data.iloc[:len(X)]
test_dt = combined_data.iloc[len(train_dt):]

print("Number of columns for X:", X.shape[1])
print("Number of columns for test_dt:", test_dt.shape[1])

Number of columns for X: 287
Number of columns for test_dt: 287


In [4]:
# Identify columns with missing values from combined dataset
missing_columns = combined_data.columns[combined_data.isna().any()].tolist()

# Make deep copies of the DataFrames
X_copy = X.copy(deep=True)
test_dt_copy = test_dt.copy(deep=True)

# Replace null values with mean
imputer = SimpleImputer(strategy='mean') 


X_copy.loc[:, missing_columns] = imputer.fit_transform(X_copy.loc[:, missing_columns])
test_dt_copy.loc[:, missing_columns] = imputer.transform(test_dt_copy.loc[:, missing_columns])

X = X_copy
test_dt = test_dt_copy

print("Number of columns for X:", X.shape[1])
print("Number of columns for test_dt:", test_dt.shape[1])

Number of columns for X: 287
Number of columns for test_dt: 287


In [5]:
#Scale data before training
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
test_dt_scaled = scaler.fit_transform(test_dt)

# Convert the scaled NumPy array back to a Pandas DataFrame
X = pd.DataFrame(X_scaled, columns=X.columns)
test_dt = pd.DataFrame(test_dt_scaled, columns=test_dt.columns)

#Show first few rows of X to visualize data
X.head(10)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.235294,0.150685,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.0,0.173281,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.235294,0.160959,0.046507,0.666667,0.5,0.934783,0.866667,0.10125,0.086109,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.294118,0.133562,0.038561,0.666667,0.5,0.311594,0.333333,0.0,0.038271,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.235294,0.215753,0.060576,0.777778,0.5,0.927536,0.833333,0.21875,0.116052,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.176471,0.219178,0.059899,0.444444,0.5,0.876812,0.75,0.0,0.129695,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,0.0,0.184932,0.041057,0.777778,0.5,0.956522,0.916667,0.11625,0.242558,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.235294,0.167979,0.04245,0.666667,0.625,0.731884,0.383333,0.15,0.152197,0.02171,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.176471,0.10274,0.022529,0.666667,0.5,0.427536,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.099315,0.028605,0.444444,0.625,0.485507,0.0,0.0,0.15078,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Split training dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)


# Create a Logistic Regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Make predictions on the validation set
val_predictions = model.predict(X_val)


# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_val, val_predictions)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_val, val_predictions)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2)
r2 = r2_score(y_val, val_predictions)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared (R2): {r2}')

Mean Absolute Error: 31211.714611872147
Mean Squared Error: 2746825850.5547943
Root Mean Squared Error: 52410.169342931855
R-squared (R2): 0.6063644392498031


In [8]:
result = model.predict(test_dt)

# Create a DataFrame with the predictions
result_df = pd.DataFrame({'Id': original_ids, 'SalePrice': result})

# Save the DataFrame to a CSV file in the same directory
result_df.to_csv('result.csv', index=False)

result_df.head(4)

Unnamed: 0,Id,SalePrice
0,1461,125000
1,1462,110000
2,1463,192000
3,1464,184000
