In [1]:
# Cleaning Data

# Create target object and call it y

# Create X

# Split into validation and training data

# Specify Model

# Fit Model

# Make validation predictions and calculate mean absolute error

# Show a prediction result and real data.

### Importing libraries and Reading the data.

In [2]:
# Importing the libraries
import pandas as pd
import numpy as np

In [3]:
# Impoirting the dataset
house_prices_dataset_path = "./data/house-prices-advanced-regression-techniques/sample_submission.csv"
house_prices_test_path = "./data/house-prices-advanced-regression-techniques/test.csv"
house_prices_train_path = "./data/house-prices-advanced-regression-techniques/train.csv"
house_data_discription_path = "./data/house-prices-advanced-regression-techniques/data_description.txt"

house_prices = pd.read_csv(house_prices_dataset_path)
house_prices_test = pd.read_csv(house_prices_test_path)
house_prices_train = pd.read_csv(house_prices_test_path)

In [4]:
# Exporting the data (name of columns) to .xlsx file to more convenient work.
df = pd.DataFrame(house_prices_train.columns)
df.to_excel('./auxiliary_files/show_columns.xlsx', sheet_name='Column Names')

### Classes

In [5]:
class Nan_counter:
    """Counting NaN(true) and NaN(false) values"""  
    def __init__(self, data_set):
        self.data = data_set

    def is_nan(self, col_name):
        """returning number of the NaN cells. In view True and False:
        True     1352
        False     107
        Name: column_name, dtype: int64"""
        return pd.isnull(self.data[self.col_name]).value_counts()
    
    def missing_values(self, accuracy):
        """accuracy in percentage
        #.missing_values(.01) - .01 is an accuracy in percent. 
        IOW, takes all values that less then .01 percent."""
        i = self.data.isnull().sum()
        j = len(self.data)
        y = round(i/j, 2)  
        return dict(y[y <= accuracy])

In [6]:
class Nan_columns:
    """
    - Finding the column(s) in dataframe that stored need number of NaN values,
    - Create a list with those columns"""
    def __init__(self, data_set):
        self.data = data_set
        
    def min_nan_number_in_column(self, min_required_nan):
        """finding the columns that stroed NaN where each column stored less or equal then # NaN values."""
        columns_of_dataset = self.data.columns.tolist()
        best_columns = []

        for i in columns_of_dataset:
            nan_sum = self.data[f"{i}"].isna().sum()            
            if nan_sum <= min_required_nan:
                best_columns.append(i)
        
        return best_columns
    
    def bad_column(self, val):
        """Select only those columns from the dataset with too many NaN rows.
           - `val` is percentage of bad rows"""
        columns_of_dataset = self.data.columns.tolist()
        bad_columns = []

        for i in columns_of_dataset:
            nan_sum = self.data[f"{i}"].isna().sum()            
            if val <= (nan_sum / len(self.data[f"{i}"])):
                bad_columns.append(i)
        
        return bad_columns

In [7]:
class Nan_string:
    """Show the string(s) that stored NaN value(s)"""
    def __init__(self, dataframe):
        self.df = dataframe
        
    def show_nan_string_in_dataframe(self):
        """Return only those strings that contain NaN values"""
        # creating and filling a dictionary (#str: col_name)
        nan_str_col = dict([(x, self.df.columns[y]) for x, y in zip(*np.where(self.df.isna()))])
        
        # takes only keys from dictionary
        keys = list(nan_str_col.keys())

        return self.df.loc[keys]
    
    def show_nan_string_in_column(self, col_name):
        count = 0 
        missing_str = []
        for i in house_prices_test[col_name]: 
            if i != i:
                missing_str.append(house_prices_test[col_name].index[count])
            count += 1
        return missing_str
        
    def show_nan_dict(self):
        """Return only number of the string and a name of the column that string(s) storing NaN values"""
        return dict([(x, self.df.columns[y]) for x, y in zip(*np.where(self.df.isna()))])
    


In [8]:
class Missing_column:
    """Differece between two lists(dataset and custom_sample."""
    def find_miss_col(self, col_dataframe, col_custom_sample):
        diff = set(col_dataframe.columns) - set(col_custom_sample)
        return diff

### Review Data (Understanding the Data)

In [9]:
# Counting the unique values inside column.
print(house_prices_test['LandContour'].value_counts(sort=False))

Bnk      54
Low      24
Lvl    1311
HLS      70
Name: LandContour, dtype: int64


In [10]:
# reviewing all columns and find number NaN strings inside.
# - takes all columns that NaN values(number of strings) are less than 1 percent.
x = Nan_counter(house_prices_test).missing_values(.01)

# Converting all got result in a list.
features_01 = list(x)

print(len(features_01))
print(features_01)

64
['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']


In [71]:
# Select all columns which storing less or equal than # NaN strings.
features_02 = Nan_columns(house_prices_test).min_nan_number_in_column(4)

print(len(features_02))
print(features_02)

62
['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']


In [107]:
# Select only those columns from the dataset with too many(greater than 30 percent) NaN rows.
bad_data = Nan_columns(house_prices_test[features_02]).bad_column(.3)

print(len(bad_data))
print(bad_data)

0
[]


In [108]:
# finding the missing values columns from dataset to restructure them to improve the forcast accuracy.
i = Missing_column().find_miss_col(house_prices_test, bad_data)
missing_columns = i

print(house_prices_test[missing_columns].isnull().sum())
print(house_prices_test[missing_columns].shape)

LotShape            0
LotArea             0
MSSubClass          0
EnclosedPorch       0
Electrical          0
                 ... 
3SsnPorch           0
ExterQual           0
PoolQC           1456
MoSold              0
MiscVal             0
Length: 80, dtype: int64
(1459, 80)


In [14]:
# Show str_num and col_name that storing NaN values.
show_nan_string = Nan_string(house_prices_test[missing_columns])

In [15]:
# Observing those strings in DataFrame that storing NaN values. String evaluation, visually analysis.
missing_val_list = pd.DataFrame(show_nan_string.show_nan_string_in_dataframe())
missing_val_list = missing_val_list.rename_axis('ID', index='columns')
missing_val_list

Unnamed: 0_level_0,BsmtFinSF2,KitchenQual,BsmtFinType1,GarageYrBlt,GarageQual,MasVnrType,BsmtFinSF1,BsmtCond,SaleType,GarageCars,...,GarageCond,GarageArea,LotFrontage,MiscFeature,BsmtUnfSF,Alley,FireplaceQu,MasVnrArea,PoolQC,Functional
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,144.0,TA,Rec,1961.0,TA,,468.0,TA,WD,1.0,...,TA,730.0,80.0,,270.0,,,0.0,,Typ
1,0.0,Gd,ALQ,1958.0,TA,BrkFace,923.0,TA,WD,1.0,...,TA,312.0,81.0,Gar2,406.0,,,108.0,,Typ
2,0.0,TA,GLQ,1997.0,TA,,791.0,TA,WD,2.0,...,TA,482.0,74.0,,137.0,,TA,0.0,,Typ
3,0.0,Gd,GLQ,1998.0,TA,BrkFace,602.0,TA,WD,2.0,...,TA,470.0,78.0,,324.0,,Gd,20.0,,Typ
4,0.0,Gd,ALQ,1992.0,TA,,263.0,TA,WD,2.0,...,TA,506.0,43.0,,1017.0,,,0.0,,Typ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,TA,Unf,,,,0.0,TA,WD,0.0,...,,0.0,21.0,,546.0,,,0.0,,Typ
1455,0.0,TA,Rec,1970.0,TA,,252.0,TA,WD,1.0,...,TA,286.0,21.0,,294.0,,,0.0,,Typ
1456,0.0,TA,ALQ,1960.0,TA,,1224.0,TA,WD,2.0,...,TA,576.0,160.0,,0.0,,TA,0.0,,Typ
1457,0.0,TA,GLQ,,,,337.0,TA,WD,0.0,...,,0.0,62.0,Shed,575.0,,,0.0,,Typ


### Recognize, Fill or drop data

In [16]:
list_num_of_nan_str = Nan_string(missing_val_list).show_nan_string_in_column('MSZoning')
list_num_of_nan_str

[455, 756, 790, 1444]

In [17]:
missing_val_list.loc[454:456][['MSZoning']]

Unnamed: 0_level_0,MSZoning
ID,Unnamed: 1_level_1
454,RM
455,
456,RL


In [18]:
# Remove bad columns.
missing_val_list = missing_val_list.drop(columns=bad_data)
missing_val_list

Unnamed: 0_level_0,BsmtFinSF2,KitchenQual,BsmtFinType1,GarageYrBlt,GarageQual,MasVnrType,BsmtFinSF1,BsmtCond,SaleType,GarageCars,...,TotalBsmtSF,BsmtFinType2,BsmtHalfBath,GarageType,GarageCond,GarageArea,LotFrontage,BsmtUnfSF,MasVnrArea,Functional
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,144.0,TA,Rec,1961.0,TA,,468.0,TA,WD,1.0,...,882.0,LwQ,0.0,Attchd,TA,730.0,80.0,270.0,0.0,Typ
1,0.0,Gd,ALQ,1958.0,TA,BrkFace,923.0,TA,WD,1.0,...,1329.0,Unf,0.0,Attchd,TA,312.0,81.0,406.0,108.0,Typ
2,0.0,TA,GLQ,1997.0,TA,,791.0,TA,WD,2.0,...,928.0,Unf,0.0,Attchd,TA,482.0,74.0,137.0,0.0,Typ
3,0.0,Gd,GLQ,1998.0,TA,BrkFace,602.0,TA,WD,2.0,...,926.0,Unf,0.0,Attchd,TA,470.0,78.0,324.0,20.0,Typ
4,0.0,Gd,ALQ,1992.0,TA,,263.0,TA,WD,2.0,...,1280.0,Unf,0.0,Attchd,TA,506.0,43.0,1017.0,0.0,Typ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,TA,Unf,,,,0.0,TA,WD,0.0,...,546.0,Unf,0.0,,,0.0,21.0,546.0,0.0,Typ
1455,0.0,TA,Rec,1970.0,TA,,252.0,TA,WD,1.0,...,546.0,Unf,0.0,CarPort,TA,286.0,21.0,294.0,0.0,Typ
1456,0.0,TA,ALQ,1960.0,TA,,1224.0,TA,WD,2.0,...,1224.0,Unf,0.0,Detchd,TA,576.0,160.0,0.0,0.0,Typ
1457,0.0,TA,GLQ,,,,337.0,TA,WD,0.0,...,912.0,Unf,1.0,,,0.0,62.0,575.0,0.0,Typ


In [19]:
missing_val_list.isnull().sum()
list(missing_val_list.columns)
missing_val_list = missing_val_list.drop(columns=[
    'GarageQual',
    'GarageCars',
    'GarageType',
    'GarageArea', 
    'GarageFinish',
    'GarageCond',
    'GarageYrBlt'])
missing_val_list

Unnamed: 0_level_0,BsmtFinSF2,KitchenQual,BsmtFinType1,MasVnrType,BsmtFinSF1,BsmtCond,SaleType,Exterior2nd,BsmtFullBath,MSZoning,...,BsmtExposure,Exterior1st,Utilities,TotalBsmtSF,BsmtFinType2,BsmtHalfBath,LotFrontage,BsmtUnfSF,MasVnrArea,Functional
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,144.0,TA,Rec,,468.0,TA,WD,VinylSd,0.0,RH,...,No,VinylSd,AllPub,882.0,LwQ,0.0,80.0,270.0,0.0,Typ
1,0.0,Gd,ALQ,BrkFace,923.0,TA,WD,Wd Sdng,0.0,RL,...,No,Wd Sdng,AllPub,1329.0,Unf,0.0,81.0,406.0,108.0,Typ
2,0.0,TA,GLQ,,791.0,TA,WD,VinylSd,0.0,RL,...,No,VinylSd,AllPub,928.0,Unf,0.0,74.0,137.0,0.0,Typ
3,0.0,Gd,GLQ,BrkFace,602.0,TA,WD,VinylSd,0.0,RL,...,No,VinylSd,AllPub,926.0,Unf,0.0,78.0,324.0,20.0,Typ
4,0.0,Gd,ALQ,,263.0,TA,WD,HdBoard,0.0,RL,...,No,HdBoard,AllPub,1280.0,Unf,0.0,43.0,1017.0,0.0,Typ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,TA,Unf,,0.0,TA,WD,CmentBd,0.0,RM,...,No,CemntBd,AllPub,546.0,Unf,0.0,21.0,546.0,0.0,Typ
1455,0.0,TA,Rec,,252.0,TA,WD,CmentBd,0.0,RM,...,No,CemntBd,AllPub,546.0,Unf,0.0,21.0,294.0,0.0,Typ
1456,0.0,TA,ALQ,,1224.0,TA,WD,VinylSd,1.0,RL,...,No,VinylSd,AllPub,1224.0,Unf,0.0,160.0,0.0,0.0,Typ
1457,0.0,TA,GLQ,,337.0,TA,WD,Wd Shng,0.0,RL,...,Av,HdBoard,AllPub,912.0,Unf,1.0,62.0,575.0,0.0,Typ


In [20]:
# Fill the column that storing greater than 4 missing values
issue_last = missing_val_list.isnull().sum()
issue_last = list(issue_last[issue_last>4].index)
issue_last

one_val_list = pd.DataFrame(house_prices_test[issue_last])
one_val_list = one_val_list.rename_axis('ID', index='columns')
# one_val_list = one_val_list.dropna()
one_val_list

Unnamed: 0_level_0,BsmtFinType1,MasVnrType,BsmtCond,BsmtQual,BsmtExposure,BsmtFinType2,LotFrontage,MasVnrArea
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Rec,,TA,TA,No,LwQ,80.0,0.0
1,ALQ,BrkFace,TA,TA,No,Unf,81.0,108.0
2,GLQ,,TA,Gd,No,Unf,74.0,0.0
3,GLQ,BrkFace,TA,TA,No,Unf,78.0,20.0
4,ALQ,,TA,Gd,No,Unf,43.0,0.0
...,...,...,...,...,...,...,...,...
1454,Unf,,TA,TA,No,Unf,21.0,0.0
1455,Rec,,TA,TA,No,Unf,21.0,0.0
1456,ALQ,,TA,TA,No,Unf,160.0,0.0
1457,GLQ,,TA,Gd,Av,Unf,62.0,0.0


In [21]:
# Concatanation several dataframes.

# clear_data = pd.concat([one_val_list, multi_val_list], axis=1)
# clear_data

# clear_data = pd.merge(missing_val_list,one_val_list,on='ID')
# clear_data

### Hand Work

In [62]:
one_val_list.isnull().sum()

BsmtFinType1    0
MasVnrType      0
BsmtCond        0
BsmtQual        0
BsmtExposure    0
BsmtFinType2    0
LotFrontage     0
MasVnrArea      0
dtype: int64

In [23]:
# one_val_list[col_name]=one_val_list[col_name].fillna('None')

# one_val_list[col_name]=one_val_list[col_name].fillna(one_val_list[col_name].mean())

In [24]:
col_name = 'BsmtQual'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna('NA')

The unique elements:

TA    634
Gd    591
Ex    137
Fa     53
Name: BsmtQual, dtype: int64

The Null values: 44


In [25]:
col_name = 'BsmtFinType1'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna('NA')

The unique elements:

GLQ    431
Unf    421
ALQ    209
Rec    155
BLQ    121
LwQ     80
Name: BsmtFinType1, dtype: int64

The Null values: 42


In [26]:
col_name = 'MasVnrArea'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna(one_val_list[col_name].mean())

The unique elements:

0.0      877
176.0     10
144.0      9
120.0      8
216.0      8
        ... 
634.0      1
177.0      1
615.0      1
549.0      1
442.0      1
Name: MasVnrArea, Length: 303, dtype: int64

The Null values: 15


In [27]:
col_name = 'BsmtExposure'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna('NA')

The unique elements:

No    951
Av    197
Gd    142
Mn    125
Name: BsmtExposure, dtype: int64

The Null values: 44


In [28]:
col_name = 'BsmtFinType2'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna('NA')

The unique elements:

Unf    1237
Rec      51
LwQ      41
BLQ      35
ALQ      33
GLQ      20
Name: BsmtFinType2, dtype: int64

The Null values: 42


In [29]:
col_name = 'MasVnrType'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna('NA')

The unique elements:

None       878
BrkFace    434
Stone      121
BrkCmn      10
Name: MasVnrType, dtype: int64

The Null values: 16


In [30]:
col_name = 'LotFrontage'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna(one_val_list[col_name].mean())

The unique elements:

60.0     133
80.0      68
70.0      63
50.0      60
75.0      52
        ... 
22.0       1
136.0      1
149.0      1
31.0       1
131.0      1
Name: LotFrontage, Length: 115, dtype: int64

The Null values: 227


In [31]:
col_name = 'BsmtExposure'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna('NA')

The unique elements:

No    951
Av    197
Gd    142
Mn    125
NA     44
Name: BsmtExposure, dtype: int64

The Null values: 0


In [32]:
col_name = 'BsmtCond'
print("The unique elements:\n")
print(one_val_list[col_name].value_counts())
print("\nThe Null values:", one_val_list[col_name].isnull().sum())
one_val_list[col_name]=one_val_list[col_name].fillna('NA')

The unique elements:

TA    1295
Fa      59
Gd      57
Po       3
Name: BsmtCond, dtype: int64

The Null values: 45


In [33]:
# Droping old(non clear) values.
clear_data = missing_val_list.drop(columns=list(one_val_list))
clear_data

Unnamed: 0_level_0,BsmtFinSF2,KitchenQual,BsmtFinSF1,SaleType,Exterior2nd,BsmtFullBath,MSZoning,Exterior1st,Utilities,TotalBsmtSF,BsmtHalfBath,BsmtUnfSF,Functional
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,144.0,TA,468.0,WD,VinylSd,0.0,RH,VinylSd,AllPub,882.0,0.0,270.0,Typ
1,0.0,Gd,923.0,WD,Wd Sdng,0.0,RL,Wd Sdng,AllPub,1329.0,0.0,406.0,Typ
2,0.0,TA,791.0,WD,VinylSd,0.0,RL,VinylSd,AllPub,928.0,0.0,137.0,Typ
3,0.0,Gd,602.0,WD,VinylSd,0.0,RL,VinylSd,AllPub,926.0,0.0,324.0,Typ
4,0.0,Gd,263.0,WD,HdBoard,0.0,RL,HdBoard,AllPub,1280.0,0.0,1017.0,Typ
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,TA,0.0,WD,CmentBd,0.0,RM,CemntBd,AllPub,546.0,0.0,546.0,Typ
1455,0.0,TA,252.0,WD,CmentBd,0.0,RM,CemntBd,AllPub,546.0,0.0,294.0,Typ
1456,0.0,TA,1224.0,WD,VinylSd,1.0,RL,VinylSd,AllPub,1224.0,0.0,0.0,Typ
1457,0.0,TA,337.0,WD,Wd Shng,0.0,RL,HdBoard,AllPub,912.0,1.0,575.0,Typ


In [34]:
# Concatenating dataframes in single, clear dataframe.
clear_data = pd.concat([clear_data, one_val_list], axis=1)
clear_data = clear_data.dropna()
clear_data

Unnamed: 0_level_0,BsmtFinSF2,KitchenQual,BsmtFinSF1,SaleType,Exterior2nd,BsmtFullBath,MSZoning,Exterior1st,Utilities,TotalBsmtSF,...,BsmtUnfSF,Functional,BsmtFinType1,MasVnrType,BsmtCond,BsmtQual,BsmtExposure,BsmtFinType2,LotFrontage,MasVnrArea
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,144.0,TA,468.0,WD,VinylSd,0.0,RH,VinylSd,AllPub,882.0,...,270.0,Typ,Rec,,TA,TA,No,LwQ,80.0,0.0
1,0.0,Gd,923.0,WD,Wd Sdng,0.0,RL,Wd Sdng,AllPub,1329.0,...,406.0,Typ,ALQ,BrkFace,TA,TA,No,Unf,81.0,108.0
2,0.0,TA,791.0,WD,VinylSd,0.0,RL,VinylSd,AllPub,928.0,...,137.0,Typ,GLQ,,TA,Gd,No,Unf,74.0,0.0
3,0.0,Gd,602.0,WD,VinylSd,0.0,RL,VinylSd,AllPub,926.0,...,324.0,Typ,GLQ,BrkFace,TA,TA,No,Unf,78.0,20.0
4,0.0,Gd,263.0,WD,HdBoard,0.0,RL,HdBoard,AllPub,1280.0,...,1017.0,Typ,ALQ,,TA,Gd,No,Unf,43.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,TA,0.0,WD,CmentBd,0.0,RM,CemntBd,AllPub,546.0,...,546.0,Typ,Unf,,TA,TA,No,Unf,21.0,0.0
1455,0.0,TA,252.0,WD,CmentBd,0.0,RM,CemntBd,AllPub,546.0,...,294.0,Typ,Rec,,TA,TA,No,Unf,21.0,0.0
1456,0.0,TA,1224.0,WD,VinylSd,1.0,RL,VinylSd,AllPub,1224.0,...,0.0,Typ,ALQ,,TA,TA,No,Unf,160.0,0.0
1457,0.0,TA,337.0,WD,Wd Shng,0.0,RL,HdBoard,AllPub,912.0,...,575.0,Typ,GLQ,,TA,Gd,Av,Unf,62.0,0.0


In [35]:
# Check bad data (NaN strings) in a new dataframe.
clear_data.isnull().sum()

BsmtFinSF2      0
KitchenQual     0
BsmtFinSF1      0
SaleType        0
Exterior2nd     0
BsmtFullBath    0
MSZoning        0
Exterior1st     0
Utilities       0
TotalBsmtSF     0
BsmtHalfBath    0
BsmtUnfSF       0
Functional      0
BsmtFinType1    0
MasVnrType      0
BsmtCond        0
BsmtQual        0
BsmtExposure    0
BsmtFinType2    0
LotFrontage     0
MasVnrArea      0
dtype: int64

In [59]:
test_col = ['Id', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'OverallQual', 'OverallCond', 'YearBuilt',
       'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

In [61]:
house_prices_train[test_col].isnull().sum()

Id                 0
MSZoning           4
LotFrontage      227
LotArea            0
Street             0
LotShape           0
LandContour        0
Utilities          2
OverallQual        0
OverallCond        0
YearBuilt          0
ExterQual          0
ExterCond          0
Foundation         0
Heating            0
HeatingQC          0
CentralAir         0
Electrical         0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
PavedDrive         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
dtype: int64