In [1]:
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Notebook-wide settings; `warnings` is imported with the other modules in the first cell.
pd.set_option('display.max_columns', 100)  # render up to 100 columns of a frame
# Silences every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and library
# deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

----

In [3]:
# Read the raw training and testing tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('training_set.csv', 'testing_set.csv')
)

FileNotFoundError: [Errno 2] No such file or directory: 'training_set.csv'

In [None]:
 data = train_data.copy()

In [None]:
# Preview the first rows of the working training frame.
data.head()

In [None]:
# Column dtypes and non-null counts of the working training frame.
data.info()

In [None]:
# Split column names by dtype: object-typed columns are treated as
# categorical (`cat`), everything else as numeric (`num`).
is_object = data.dtypes == 'O'
cat = is_object[is_object].index.tolist()
num = is_object[~is_object].index.tolist()

In [None]:
# Numeric columns with at most 15 distinct values are treated as discrete.
unique_counts = data[num].nunique()
d_feat = unique_counts[unique_counts <= 15].index.tolist()

In [None]:
# Treat the discrete numeric columns as categorical too.  Only names that are
# not already present are appended, so re-running this cell does not
# duplicate entries in `cat`.
cat = cat + [col for col in d_feat if col not in cat]
len(cat)

In [None]:
# Impute missing values: categorical columns get the literal label 'NA',
# every other column gets its own median.
for col in data.columns:
    fill_value = 'NA' if col in cat else data[col].median()
    data[col] = data[col].fillna(fill_value)

In [None]:
# Summary statistics of the training frame after imputation.
data.describe()

----

## Categorical Features

In [None]:
# Number of columns treated as categorical (object columns plus discrete numeric ones).
len(cat)

In [None]:
# One row per categorical column: value counts on the left, SalePrice
# distribution per category on the right.  The grid size is derived from
# len(cat) instead of a hard-coded 58 rows, so the cell keeps working when
# the number of categorical columns changes.
n_rows = max(len(cat), 1)
figure = plt.figure(figsize=(15, 7.25 * n_rows))  # ~7.25 in per row (about 420 in for 58 rows)
plt.subplots_adjust(wspace=0.2, hspace=0.2)

for row, col in enumerate(cat):
    # Left panel: how many rows fall into each category.
    plt.subplot(n_rows, 2, 2 * row + 1)
    plt.xticks(rotation=90)
    ax = sns.countplot(data=data, x=col, color='purple')
    ax.bar_label(ax.containers[0])

    # Right panel: SalePrice spread within each category.
    plt.subplot(n_rows, 2, 2 * row + 2)
    plt.xticks(rotation=90)
    ax = sns.boxplot(data=data, x=col, y='SalePrice', color='purple')

# Discrete Numerical Features

In [None]:
# Recompute the discrete numeric columns (at most 15 distinct values) on the imputed frame.
unique_counts = data[num].nunique()
d_feat = unique_counts[unique_counts <= 15].index.tolist()

In [None]:
# Show the names of the discrete numeric columns.
d_feat

In [None]:
# One row per discrete numeric column: value counts on the left, SalePrice
# distribution per value on the right.  The grid is sized from len(d_feat)
# rather than a hard-coded 15 rows.
n_rows = max(len(d_feat), 1)
figure = plt.figure(figsize=(20, 6 * n_rows))  # 6 in per row (90 in for 15 rows)
plt.subplots_adjust(wspace=0.2, hspace=0.2)

for row, col in enumerate(d_feat):
    # Left panel: frequency of each distinct value.
    plt.subplot(n_rows, 2, 2 * row + 1)
    plt.xticks(rotation=90)
    ax = sns.countplot(data=data, x=col, color='purple')
    ax.bar_label(ax.containers[0])

    # Right panel: SalePrice spread for each distinct value.
    plt.subplot(n_rows, 2, 2 * row + 2)
    plt.xticks(rotation=90)
    ax = sns.boxplot(data=data, x=col, y='SalePrice', color='purple')

# Continuous Numerical Features

In [None]:
# Numeric columns with more than 15 distinct values are treated as continuous.
unique_counts = data[num].nunique()
c_feat = unique_counts[unique_counts > 15].index.tolist()

In [None]:
# Number of continuous numeric columns.
len(c_feat)

In [None]:
def pp(a,b,c):
    """Scatter three columns of the global `data` frame against 'SalePrice'.

    Parameters
    ----------
    a, b, c : str
        Column names in `data` to place on the x-axes, left to right.

    Draws one row of three scatter panels and shows the figure; returns None.
    """
    # `height` is the current name of the per-panel size argument; the
    # original `size=` keyword is the deprecated spelling.
    sns.pairplot(data=data, x_vars=[a, b, c], y_vars=['SalePrice'],
                 height=4, aspect=1, kind='scatter')
    plt.show()

In [None]:
# Scatter the continuous features against SalePrice, three per figure.
# Index 0 is skipped, exactly as the original hard-coded ranges did
# (NOTE(review): presumably that is the 'Id' column — confirm).  The bounds
# are derived from len(c_feat) so only complete triples are plotted and a
# different number of features no longer raises IndexError.
for start in range(1, len(c_feat) - 2, 3):
    first, second, third = c_feat[start:start + 3]
    pp(first, second, third)
    
    

In [None]:
# Histogram of every continuous feature on a three-column grid.  The number
# of rows is computed from len(c_feat) instead of assuming exactly 23
# features on an 8x3 grid.
n_cols = 3
n_rows = max(1, -(-len(c_feat) // n_cols))  # ceiling division
figure = plt.figure(figsize=(15, 3.75 * n_rows))  # 3.75 in per row (30 in for 8 rows)
plt.subplots_adjust(wspace=0.2, hspace=0.4)

for position, col in enumerate(c_feat, start=1):
    plt.subplot(n_rows, n_cols, position)
    data[col].hist()
    plt.title(col)
    plt.xticks(rotation=90)

# Timeline Data

In [None]:
# Columns whose name mentions a year ('Yr' or 'Year').
yr_feat = [col for col in data.columns if any(tag in col for tag in ('Yr', 'Year'))]
yr_feat

In [None]:
# Express the three year columns as "years before the sale" (YrSold - year).
# An explicit copy is taken so the assignments below write to an independent
# frame instead of a slice of `data` (which triggers SettingWithCopyWarning).
age_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
dum = data[age_cols + ['YrSold', 'SalePrice']].copy()
for col in age_cols:
    dum[col] = dum['YrSold'] - dum[col]

In [None]:
# Display the frame of year differences alongside YrSold and SalePrice.
dum

In [None]:
# Scatter SalePrice against each of the first three year-derived columns of `dum`.
for age_col in yr_feat[:3]:
    sns.scatterplot(x=age_col, y='SalePrice', data=dum, color='purple')
    plt.show()
    
    

----

## Categorical Feature Encoding

### Defining Rare Categories

In [None]:
# Collapse infrequent labels into 'rare_var'.
# Despite its name, `n_rare` collects the labels that are NOT rare: those
# covering more than 1% of the rows of some column.
# NOTE(review): the list is shared by all columns and grows as the loop
# proceeds, so the isin() test also keeps a label that was frequent in an
# earlier column even if it is rare in the current one.  Likely unintended,
# but the test-set cell reuses this flat list, so it is left as is here.
n_rare = []

for col in cat:
    share = data.groupby(by=col)['SalePrice'].count().div(len(data))
    n_rare.extend(share.index[share > 0.01])
    data[col] = np.where(data[col].isin(n_rare), data[col], 'rare_var')

In [None]:
# Inspect the categorical columns after rare-label replacement.
data[cat]

### Labeling Categorical Data

In [None]:
# List all column names of the working frame.
data.columns

In [None]:
# Ordinal-encode each categorical column: labels are ranked by their mean
# SalePrice (lowest mean -> 0) and the per-column mapping is kept in
# `dic_labels` for later use.
dic_labels = {}
for col in cat:
    ranked = data.groupby(by=col)['SalePrice'].mean().sort_values().index
    dic_labels[col] = {label: code for code, label in enumerate(ranked)}
    data[col] = data[col].map(dic_labels[col])

In [None]:
# Flat list of every distinct encoded value, column by column.
train_unique = [code for col in cat for code in data[col].unique()]
    

In [None]:
# Show the label -> code mapping built for each categorical column.
dic_labels

In [None]:
# Display the frame after categorical encoding.
data

## Normalizing Continuous Data

In [None]:
# Continuous columns eligible for a log transform: everything except 'Id'
# that contains no zero value.
norm_feat = [col for col in c_feat if col != 'Id' and not (data[col] == 0).any()]
norm_feat

In [None]:
# Replace each selected column by its natural logarithm.
# Running this cell twice applies the log twice.
data = data.assign(**{col: np.log(data[col]) for col in norm_feat})

In [None]:
# Display the frame after the log transform.
data

----

# Feature Scaling

In [None]:
# Min-max scaler with default settings; fitted on the training features below.
scaler = MinMaxScaler()

In [None]:
# Every column except the identifier and the target gets scaled.
excluded = {'Id', 'SalePrice'}
feature_scale = [col for col in data.columns if col not in excluded]
len(feature_scale)

In [None]:
# Learn the scaling parameters from the training feature columns.
scaler.fit(data[feature_scale])

In [None]:
# Rebuild the training frame: untouched Id/SalePrice next to the scaled features.
scaled_features = pd.DataFrame(scaler.transform(data[feature_scale]), columns=feature_scale)
id_and_target = data[['Id', 'SalePrice']].reset_index(drop=True)
tr_data = pd.concat([id_and_target, scaled_features], axis=1)

In [None]:
# Display the scaled training frame.
tr_data

----

# Feature Selection

In [None]:
# Separate predictors from the target; 'Id' is dropped from the predictors.
x_train = tr_data.drop(columns=['Id', 'SalePrice'])
y_train = tr_data['SalePrice']

In [None]:
# Feature selector wrapping a Lasso regressor with alpha = 0.01.
model = SelectFromModel(Lasso(alpha = 0.01))

In [None]:
# Fit the selector on the scaled training predictors and the target.
model.fit(x_train,y_train)

In [None]:
# Boolean mask over x_train's columns marking the features the selector keeps.
model.get_support()

In [None]:
# Names of the columns retained by the fitted selector.
selected_feat = x_train.columns[model.get_support()]
selected_feat

In [None]:
# Number of retained features.
len(selected_feat)

In [None]:
# Restrict the training matrix to the selected columns and show its shape.
x_train = x_train[selected_feat]
x_train.shape

In [None]:
# Dtypes and non-null counts of the reduced training matrix.
x_train.info()

# Model Training and Evaluation

In [None]:
# Ridge regressor with default settings; alpha is tuned by the grid search below.
ridge = Ridge()

In [None]:
# Candidate regularisation strengths shared by the Ridge and Lasso searches.
# Listed once each and in ascending order (the original grid contained the
# value 1 twice, which only repeated an identical fit).
hyperparameters = {'alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.1, 0.5, 1, 2, 3, 5, 7, 10]}

In [None]:
# 5-fold grid search over alpha for Ridge, scored by negated mean squared error.
RR = GridSearchCV(ridge, hyperparameters, scoring = 'neg_mean_squared_error', cv = 5) 

In [None]:
# Run the Ridge grid search on the selected training features.
RR.fit(x_train,y_train)

In [None]:
# Report the winning alpha and its score.  The score comes from the
# 'neg_mean_squared_error' scorer, so it is a negated MSE (closer to 0 is better).
print("Best value for lambda : ",RR.best_params_)
print("Best score for cost function: ", RR.best_score_)

In [None]:
# Lasso regressor with default settings; alpha is tuned by the grid search below.
lasso = Lasso()

In [None]:
# 5-fold grid search over the same alpha grid for Lasso, scored by negated MSE.
ls = GridSearchCV(lasso, hyperparameters, scoring = 'neg_mean_squared_error', cv = 5 )

In [None]:
# Run the Lasso grid search on the selected training features.
ls.fit(x_train,y_train)

In [None]:
# Report the winning alpha and its (negated MSE) score for the Lasso search.
print("Best value for lambda : ",ls.best_params_)
print("Best score for cost function: ", ls.best_score_)

In [None]:
# In-sample predictions: both searches predict on the same x_train they were fitted on.
predict_ridge = RR.predict(x_train)
predict_lasso = ls.predict(x_train)

In [None]:
# Distribution of in-sample residuals (target minus prediction) for both models.
# `histplot(..., kde=True, stat='density')` replaces the deprecated
# `distplot` and matches the histplot calls used for the test predictions.
figure = plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.histplot(y_train - predict_ridge, kde=True, stat='density')
plt.title('Ridge residuals')
plt.subplot(1, 2, 2)
sns.histplot(y_train - predict_lasso, kde=True, stat='density')
plt.title('Lasso residuals')

 ----

# Test Data

In [None]:
# Work on a copy so the raw test frame stays untouched.
df = test_data.copy()

In [None]:
# Preview the first rows of the test frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
df.info()

In [None]:
# Missing-value count per test column.
df.isnull().sum()

In [None]:
# Display the test frame before any preprocessing.
df

In [None]:
# Impute the test set with the same rules as the training set: categorical
# columns get the literal label 'NA', other columns get a median.
# The median comes from the raw training frame (`train_data`) so that test
# rows are imputed with statistics learned on training data, not with the
# test set's own distribution.  A column missing from `train_data` falls
# back to its own test-set median.
for col in df.columns:
    if col in cat:
        df[col] = df[col].fillna('NA')
    else:
        median_source = train_data if col in train_data.columns else df
        df[col] = df[col].fillna(median_source[col].median())

In [None]:
# Replace labels that are not in `n_rare` (the labels judged frequent on the
# training set) with 'rare_var'.
# NOTE(review): `n_rare` is a single flat list for all columns, so a label is
# kept here if it was frequent in ANY training column, not necessarily this one.
for col in cat:
    is_known = df[col].isin(n_rare)
    df[col] = np.where(is_known, df[col], 'rare_var')

In [None]:
# Preview the test frame after imputation and rare-label replacement.
df.head()

In [None]:
# Make sure every mapping can encode the imputed label 'NA': where it is
# missing, give it the code of the last-inserted label plus one.
for mapping in dic_labels.values():
    if 'NA' in mapping:
        continue
    mapping['NA'] = list(mapping.values())[-1] + 1

In [None]:
# Hand-written additions to the per-column label -> code mappings.
# NOTE(review): presumably these are labels that appear in the test set but
# are not keys of the training-derived mapping — confirm against the data.
# The codes are hard-coded literals; they do not follow the mean-SalePrice
# ordering used when `dic_labels` was built, and the numeric labels are
# written as strings ('12', '4', ...).
dic_labels['ExterQual']['Fa'] = 5
dic_labels['Street']['Grvl'] = 3
dic_labels['TotRmsAbvGrd']['12'] = 12
dic_labels['FullBath']['4'] = 4
dic_labels['Exterior2nd']['CBlock'] = 12
dic_labels['Exterior2nd']['Stone'] = 13
dic_labels['RoofStyle']['Shed'] = 1
dic_labels['ExterCond']['Po'] = 6
dic_labels['HeatingQC']['Po'] = 6
dic_labels['Exterior1st']['CBlock'] = 12
dic_labels['Fireplaces']['4'] = 5
dic_labels['BsmtCond']['Po'] = 5
dic_labels['OverallQual']['2'] = 10
dic_labels['OverallQual']['1'] = 11
# NOTE(review): for OverallCond the code for '2' (10) is higher than for '1' (9),
# the reverse of the OverallQual pair above — confirm this is intended.
dic_labels['OverallCond']['2'] = 10
dic_labels['OverallCond']['1'] = 9
dic_labels['GarageCars']['5'] = 6

In [None]:
# Show the mappings after the 'NA' codes and manual additions.
dic_labels

In [None]:
# Turn these three count columns into strings and remove every '.0'
# substring (e.g. '1.0' -> '1') before they are mapped through `dic_labels`.
for count_col in ('BsmtFullBath', 'BsmtHalfBath', 'GarageCars'):
    df[count_col] = df[count_col].astype(str).str.replace('.0', '', regex=False)

In [None]:
# Encode each categorical test column with the mapping learned on the
# training set.  Labels that are not keys of the mapping become NaN.
# Running this cell twice maps the already-encoded values again.
for col in cat:
    codes = dic_labels[col]
    df[col] = df[col].map(codes)

In [None]:
# Cast the three encoded count columns.
# NOTE(review): BsmtFullBath is cast to int while the other two are cast to
# str; every selected test column is cast to float in a later cell, so the
# mix may be harmless, but confirm whether int was intended for all three.
df['BsmtFullBath'] = df['BsmtFullBath'].astype(int)
df['BsmtHalfBath'] = df['BsmtHalfBath'].astype(str)
df['GarageCars'] = df['GarageCars'].astype(str)

In [None]:
# Display the test frame after categorical encoding.
df

In [None]:
# Apply the same log transform as on the training set to every log-transformed
# feature that exists in the test frame.  Selecting by column presence
# (instead of the hard-coded slice norm_feat[0:7]) skips columns the test
# set lacks (NOTE(review): presumably only the target 'SalePrice' — confirm)
# wherever they sit in the list, and does not silently drop features when
# `norm_feat` has a different length.
for col in norm_feat:
    if col in df.columns:
        df[col] = np.log(df[col])

In [None]:
# Display the test frame after the log transform.
df

# Feature Selection for Testing Data

In [None]:
# Build the test design matrix the same way the training one was built:
# pass all scaled feature columns through the MinMaxScaler fitted on the
# training data, then keep only the selected features.  The models were
# trained on scaled inputs, so feeding them unscaled test columns (as the
# original `df[selected_feat]` did) puts the predictors on the wrong scale.
# The float cast makes string-typed code columns numeric before scaling.
x_test = pd.DataFrame(
    scaler.transform(df[feature_scale].astype(float)),
    columns=feature_scale,
    index=df.index,
)[selected_feat]
x_test

In [None]:
# Cast every selected test column to float in one step.  Rebinding `x_test`
# to the converted frame avoids assigning column-by-column into what may be
# a slice of `df` (the source of SettingWithCopyWarning).
x_test = x_test.astype(float)

In [None]:
# Missing-value count per selected test column (NaN here means an unmapped label or failed cast).
x_test.isnull().sum()

In [None]:
# Predict on the prepared test features with both fitted grid searches.
ridgeregressor = RR.predict(x_test)
lassoregressor = ls.predict(x_test)

In [None]:
# Side-by-side histograms (with KDE) of the Ridge and Lasso test predictions.
figure, (ridge_ax, lasso_ax) = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(ridgeregressor, kde=True, ax=ridge_ax)
sns.histplot(lassoregressor, kde=True, ax=lasso_ax)

In [None]:
# Start the output table from the test set's Id column.
result = pd.DataFrame(df['Id'])

In [None]:
# Store the Lasso predictions.  When 'SalePrice' is in `norm_feat`, the
# training target was replaced by its natural log before fitting, so the
# model outputs log-prices; exponentiate to report values on the original
# SalePrice scale.
prediction = lassoregressor
if 'SalePrice' in norm_feat:
    prediction = np.exp(prediction)
result['Prediction'] = prediction

In [None]:
# Display the Id / Prediction table.
result

In [None]:
# Write the table as CSV to a file named 'Result' (no extension), without the index column.
result.to_csv('Result', index = False)