In [518]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold

from sklearn.preprocessing import StandardScaler

In [499]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [500]:
train['MSSubClass'] = train['MSSubClass'].astype('category').cat.codes
test['MSSubClass'] = test['MSSubClass'].astype('category').cat.codes

train['OverallQual'] = train['OverallQual'].astype('category').cat.codes
test['OverallQual'] = test['OverallQual'].astype('category').cat.codes

train['OverallCond'] = train['OverallCond'].astype('category').cat.codes
test['OverallCond'] = test['OverallCond'].astype('category').cat.codes

train.drop(['Id'], axis=1, inplace=True)

In [501]:
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].mean())
train['Alley'] = train['Alley'].fillna('None')
train['MasVnrType'] = train['MasVnrType'].fillna('None')
train['MasVnrArea'] = train['MasVnrArea'].fillna(train['MasVnrArea'].mean())
train['BsmtQual'] = train['BsmtQual'].fillna('None')
train['BsmtCond'] = train['BsmtCond'].fillna('None')
train['BsmtExposure'] = train['BsmtExposure'].fillna('None')
train['BsmtFinType1'] = train['BsmtFinType1'].fillna('None')
train['BsmtFinType2'] = train['BsmtFinType2'].fillna('None')
train['Electrical'] = train['Electrical'].fillna('SBrkr')
train['FireplaceQu'] = train['FireplaceQu'].fillna('None')
train['GarageType'] = train['GarageType'].fillna('None')
train['GarageFinish'] = train['GarageFinish'].fillna('None')
train['GarageQual'] = train['GarageQual'].fillna('None')
train['GarageCond'] = train['GarageCond'].fillna('None')
train['PoolQC'] = train['PoolQC'].fillna('None')
train['Fence'] = train['Fence'].fillna('None')
train['MiscFeature'] = train['MiscFeature'].fillna('None')
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['GarageYrBlt'].mean())

In [502]:
test['MSZoning'] = test['MSZoning'].fillna('RL')
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].mean())
test['Alley'] = test['Alley'].fillna('None')
test['Utilities'] = test['Utilities'].fillna('AllPub')
test['Exterior1st'] = test['Exterior1st'].fillna('VinylSd')
test['Exterior2nd'] = test['Exterior2nd'].fillna('VinylSd')
test['MasVnrType'] = test['MasVnrType'].fillna('None')
test['MasVnrArea'] = test['MasVnrArea'].fillna(test['MasVnrArea'].mean())
test['BsmtQual'] = test['BsmtQual'].fillna('None')
test['BsmtCond'] = test['BsmtCond'].fillna('None')
test['BsmtExposure'] = test['BsmtExposure'].fillna('None')
test['BsmtFinType1'] = test['BsmtFinType1'].fillna('None')
test['BsmtFinType2'] = test['BsmtFinType2'].fillna('None')
test['BsmtFinSF1'] = test['BsmtFinSF1'].fillna(test['BsmtFinSF1'].mean())
test['BsmtFinSF2'] = test['BsmtFinSF2'].fillna(test['BsmtFinSF2'].mean())
test['BsmtUnfSF'] = test['BsmtUnfSF'].fillna(test['BsmtUnfSF'].mean())
test['TotalBsmtSF'] = test['TotalBsmtSF'].fillna(test['TotalBsmtSF'].mean())
test['BsmtFullBath'] = test['BsmtFullBath'].fillna(test['BsmtFullBath'].mean())
test['BsmtHalfBath'] = test['BsmtHalfBath'].fillna(test['BsmtHalfBath'].mean())
test['KitchenQual'] = test['KitchenQual'].fillna('TA')
test['Functional'] = test['Functional'].fillna('Typ')
test['FireplaceQu'] = test['FireplaceQu'].fillna('None')
test['GarageType'] = test['GarageType'].fillna('None')
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean())
test['GarageFinish'] = test['GarageFinish'].fillna('None')
test['GarageCars'] = test['GarageCars'].fillna(test['GarageCars'].mean())
test['GarageArea'] = test['GarageArea'].fillna(test['GarageArea'].mean())
test['GarageQual'] = test['GarageQual'].fillna('None')
test['GarageCond'] = test['GarageCond'].fillna('None')
test['PoolQC'] = test['PoolQC'].fillna('None')
test['Fence'] = test['Fence'].fillna('None')
test['MiscFeature'] = test['MiscFeature'].fillna('None')
test['SaleType'] = test['SaleType'].fillna('Oth')

In [503]:
columns = list(train.columns.values)

objcols = []
for column in columns:
    if train[column].dtype == 'int64' or train[column].dtype == 'float64':
        objcols.append(column)
        
objcols.remove('SalePrice')

In [504]:
for objcol in objcols:
    train[objcol] = (train[objcol] - train[objcol].mean())/train[objcol].std()
    test[objcol] = (test[objcol] - test[objcol].mean())/test[objcol].std()

In [505]:
columns = list(train.columns.values)

objcols = []
for i in columns:
    if train[i].dtype == 'object':
        objcols.append(i)

In [506]:
for objcol in objcols:
    train[objcol] = train[objcol].astype('category').cat.codes
    test[objcol] = test[objcol].astype('category').cat.codes

In [507]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int8   
 1   MSZoning       1460 non-null   int8   
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   float64
 4   Street         1460 non-null   int8   
 5   Alley          1460 non-null   int8   
 6   LotShape       1460 non-null   int8   
 7   LandContour    1460 non-null   int8   
 8   Utilities      1460 non-null   int8   
 9   LotConfig      1460 non-null   int8   
 10  LandSlope      1460 non-null   int8   
 11  Neighborhood   1460 non-null   int8   
 12  Condition1     1460 non-null   int8   
 13  Condition2     1460 non-null   int8   
 14  BldgType       1460 non-null   int8   
 15  HouseStyle     1460 non-null   int8   
 16  OverallQual    1460 non-null   int8   
 17  OverallCond    1460 non-null   int8   
 18  YearBuil

In [508]:
X_train = train.drop(['SalePrice'], axis=1).copy()
y_train = train['SalePrice']
X_test = test.drop(['Id'], axis=1).copy()

X_train.shape, y_train.shape, X_test.shape

((1460, 79), (1460,), (1459, 79))

In [509]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)
 
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
acc_log

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


95.62

In [514]:
random_forest = RandomForestClassifier(n_estimators=300)
random_forest.fit(X_train, y_train)
randomforest_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)

acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
acc_random_forest

100.0

In [512]:
submission_1 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": randomforest_pred
    })
submission_1.to_csv("./submission_1.csv",index=False)

In [515]:
submission_2 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": logreg_preds
    })
submission_2.to_csv("./submission_2.csv",index=False)

In [521]:
n_folds = 10
def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

rmse_cv(logreg).mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

51641.78154924248