<a href="https://colab.research.google.com/github/Mark-Kinyua/Task1_ML_Group2/blob/main/group_task2_ML_ICS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
%matplotlib inline


In [None]:
# Upload helper from the google.colab package (needs a Colab runtime).
from google.colab import files
# Upload the dataset (modified_data.csv). The next cell looks the file up in
# `uploaded` under that exact name, so the uploaded file must be called
# modified_data.csv.
uploaded = files.upload()

In [None]:
# Parse the uploaded CSV bytes into a DataFrame; `df` stays as the raw,
# unmodified load.
df = pd.read_csv(io.BytesIO(uploaded['modified_data.csv']))
# Work on an explicit copy. Re-wrapping with pd.DataFrame(df) does not copy the
# underlying data, so later in-place edits of `data` could leak into `df`.
data = df.copy()

# Show loaded data
data

In [None]:
# Preview the first five rows
data.head(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

**Data Preprocessing**

In [None]:
# The Id column is not expected to be relevant to the model, so drop it.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because 'Id' is already gone.
data = data.drop(columns='Id', errors='ignore')
data.head()

In [None]:
# Column names as they stand at this point in the notebook
initial_features = data.columns.tolist()

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Missing values per column, expressed as a percentage of all rows
missing_counts = data.isna().sum()
missing_counts * 100 / len(data)

 *Looking more into the features*


In [None]:
feats = data.columns.tolist()  # column names as a plain Python list
len(feats)  # number of columns

In [None]:
# Names of the columns that contain at least one missing value
features_with_missing_data = data.columns[data.isnull().any(axis=0)].tolist()
len(features_with_missing_data)  # how many columns have gaps

List all features with missing data alongside their percentage of missing values.

In [None]:
# Print each incomplete column next to its percentage of missing rows
n_rows = len(data)
for feature in features_with_missing_data:
    pct_missing = 100 * data[feature].isna().sum() / n_rows
    print(feature, ' ', pct_missing)

In [None]:
# Drop columns in which more than half of the rows are missing.
# `thresh` is the minimum count of non-missing values a column needs to survive.
min_non_null = 0.5 * len(data)
data.dropna(axis='columns', thresh=min_non_null, inplace=True)

**Filling in missing values using mean imputation, backward filling and forward filling**

In [None]:
# Columns imputed with the column mean.
mean_fill = ['LotFrontage', 'MasVnrArea']
# Columns imputed by backward filling.
bfill_data = ['FireplaceQu']
# Every other column with gaps is forward filled.
# `features_with_missing_data` was computed before the sparse columns were
# dropped, so restrict it to columns that still exist in `data`; otherwise the
# forward-fill loop raises KeyError on the dropped columns. A list
# comprehension (rather than set arithmetic) also keeps a stable column order.
_handled_elsewhere = set(mean_fill) | set(bfill_data)
ffill_data = [
    col for col in features_with_missing_data
    if col in data.columns and col not in _handled_elsewhere
]

In [None]:
# Show which columns will be forward filled
ffill_data

In [None]:
# Replace missing values in each mean_fill column with that column's mean.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on a selected column works on a temporary object and does not update `data`
# when pandas Copy-on-Write is active.
for i in mean_fill:
    data[i] = data[i].fillna(data[i].mean())

In [None]:
# Backward filling: each gap takes the next valid value below it.
# Uses Series.bfill() and assigns back, because fillna(method=...) is
# deprecated and an in-place fill on a selected column may not reach `data`.
# Gaps at the very end of a column have nothing below them, so a forward fill
# is applied afterwards as a fallback.
for j in bfill_data:
    data[j] = data[j].bfill().ffill()

In [None]:
# Forward filling: each gap takes the last valid value above it.
# Uses Series.ffill() and assigns back, because fillna(method=...) is
# deprecated and an in-place fill on a selected column may not reach `data`.
for k in ffill_data:
    # Skip columns that were removed earlier for being mostly empty.
    if k not in data.columns:
        continue
    # Gaps at the very start of a column have nothing above them, so a
    # backward fill is applied afterwards as a fallback.
    data[k] = data[k].ffill().bfill()

In [None]:
# Re-check the per-column missing counts after imputation
data.isna().sum()

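Encoding the Dataset
Categorical (text) columns must be converted to numbers before modelling. Nominal columns are one-hot encoded so that the model does not read an order or magnitude into arbitrary category codes; ordinal columns are mapped to integer codes.
First, list all non-numeric columns.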

Extract Categorical Data

In [None]:
# Names of all object-dtype (text) columns
categorical_features = data.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Peek at the first five categorical column names
categorical_features[:5]

In [None]:
# Number of categorical columns
len(categorical_features)

In [None]:
# Categorical columns treated as nominal (no natural order); these are one-hot encoded.
nom_data = ['MSZoning', 'LandContour', 'LotConfig', 'Neighborhood', 'RoofStyle',
            'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation',
            'BsmtFinType1', 'CentralAir']
# Remaining categorical columns are treated as ordinal and integer-coded.
# List comprehensions (rather than set arithmetic) keep the column order
# stable from run to run.
ord_data = [col for col in categorical_features if col not in nom_data]
# Numeric columns are taken from the frame as it is now. The earlier
# `initial_features` snapshot still names columns that were dropped for having
# too many missing values, so using it here makes data[num_data] raise KeyError.
num_data = [col for col in data.columns if col not in categorical_features]
# Regression target, kept as a list so data[target] yields a 2-D frame.
target = ['SalePrice']

In [None]:
# Inspect the numeric columns (SalePrice included)
data.loc[:, num_data]

In [None]:
# Replace each ordinal column with integer category codes.
# NOTE(review): codes follow the sorted order of the category labels, which is
# not necessarily the real quality ranking - confirm this is acceptable.
for col in ord_data:
    as_category = data[col].astype('category')
    data[col] = as_category.cat.codes

In [None]:
# Integer-coded ordinal columns
df_ord = data.loc[:, ord_data]

Implementing one-hot encoding for the nominal data

In [None]:
# One indicator column per category of every nominal feature
df_nom = pd.get_dummies(data=data[nom_data])

Putting the data back together

In [None]:
# Numeric columns (SalePrice included)
df_num = data.loc[:, num_data]

In [None]:
# Stitch the numeric, one-hot and ordinal pieces back together side by side
joined_data = pd.concat([df_num, df_nom, df_ord], axis='columns')

In [None]:
# Preview the combined, fully numeric frame
joined_data.head(n=5)

Standardize the dataset

In [None]:
# Scaler object used below to standardize the data
scaler = StandardScaler()

In [None]:
# Feature frame: every column except the SalePrice target
df_X = joined_data.drop(columns='SalePrice')
# Plain NumPy copy of the features for scikit-learn
X = np.array(df_X)

In [None]:
# Target as a single-column frame, then as a 2-D NumPy array
df_y = data.loc[:, target]
y = np.array(df_y)

In [None]:
# (rows, feature columns) of the feature matrix
X.shape

In [None]:
# (rows, 1): the target is kept as a single-column 2-D array
y.shape

In [None]:
# Standardize the features with a dedicated scaler so the fitted feature
# statistics remain available afterwards (e.g. to transform new samples).
# Previously one scaler was fitted on X and immediately refitted on y, which
# discarded the feature statistics.
feature_scaler = StandardScaler()
X = feature_scaler.fit_transform(X)
# `scaler` ends up fitted on the target, exactly as before; `target_scaler` is
# a clearer alias, e.g. target_scaler.inverse_transform(predictions) maps
# predictions back to SalePrice units.
target_scaler = scaler
y = target_scaler.fit_transform(y)
# NOTE(review): both scalers are fitted on all rows before the train/test
# split, so test-set statistics influence training - consider fitting on the
# training split only.

***Feature Selection using L1***

In [None]:
# Cross-validated Lasso; its L1 penalty drives the coefficients of unhelpful
# features to exactly zero, which is used for feature selection below.
regressor = LassoCV()
# LassoCV expects a 1-D target. Passing the (n, 1) column vector triggers a
# DataConversionWarning, so flatten it explicitly.
regressor.fit(X, np.ravel(y))

In [None]:
# Pair every Lasso coefficient with the name of the feature it belongs to
coef_col = pd.Series(data=regressor.coef_, index=df_X.columns)

# Horizontal bar chart of the coefficients, sorted by value
plt.rcParams['figure.figsize'] = (10.0, 7.0)
sorted_coef = coef_col.sort_values()
sorted_coef.plot.barh()

# A non-zero coefficient means the L1 penalty kept that feature
is_kept = coef_col != 0
no_of_selected_features = is_kept.sum()
no_of_rejected_features = (~is_kept).sum()
total_features = no_of_selected_features + no_of_rejected_features
print("L1 selected only ",no_of_selected_features," features out of ",total_features," from the transformed dataset ")
plt.title("Feature Selection Using L1 Embedder")

***Retrieve the selected features***

In [None]:
# Keep the names of the features whose Lasso coefficient is non-zero
selected_features_list = [
    feature for feature, weight in coef_col.items() if weight != 0
]

selected_features_list

In [None]:
# Put the feature column names back onto the standardized feature matrix
standardized_df = pd.DataFrame(data=X, columns=df_X.columns)

In [None]:
# Restrict the feature matrix to the columns chosen by the Lasso step.
# From here on X is a DataFrame rather than a NumPy array.
X = standardized_df.loc[:, selected_features_list]
X

***Split the Dataset***

In [None]:
# Wrap the standardized target in a DataFrame carrying the original column name
y = pd.DataFrame(data=y, columns=df_y.columns)
y

In [None]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every metric below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# (training rows, selected features)
X_train.shape

***Train the Model***

**Build the sequential model to be trained on the data**

In [None]:
# Empty Sequential model; its architecture is defined in the following cell
model = Sequential()

In [None]:
# Rebuild the network from scratch so that re-running this cell does not stack
# a second copy of every layer onto an already populated model.
model = Sequential([
    # first hidden layer (no explicit input layer is declared here)
    Dense(78, activation='relu'),
    Dropout(0.2),

    # second hidden layer
    Dense(39, activation='relu'),
    Dropout(0.2),

    # third hidden layer
    Dense(19, activation='relu'),
    Dropout(0.2),

    # Output layer: a single linear unit. The target was standardized, so it
    # takes negative values and values above 1; a sigmoid output is confined to
    # (0, 1) and cannot represent them.
    Dense(units=1, activation='linear'),
])

# Mean squared error loss for regression, optimized with Adam
model.compile(loss='mse', optimizer='adam')

**Fit the model to the training data**

In [None]:
# Train for 100 epochs in mini-batches of 256 rows.
# The held-out test split is also passed as validation data, so its loss is
# recorded after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=100,
    validation_data=(X_test, y_test),
)

**Evaluate Model Performance**

In [None]:
# Per-epoch metrics recorded during training, one row per epoch
losses = pd.DataFrame(data=model.history.history)
losses

In [None]:
# Plot every recorded metric column against the epoch index
losses.plot()

In [None]:
# Model predictions for the held-out rows
y_hat = model.predict(X_test)

In [None]:
# Mean absolute error on the held-out rows. y_test comes from the standardized
# target, so the value is in standardized units, not original SalePrice units.
mean_absolute_error(y_test,y_hat)

In [None]:
# Mean squared error on the held-out rows, also on the standardized target scale
mean_squared_error(y_test, y_hat)

In [None]:
# Actual (x-axis) vs predicted (y-axis) target values for the held-out rows.
# Axis labels and a title make the figure readable on its own; both axes are on
# the standardized target scale.
plt.scatter(y_test, y_hat)
plt.xlabel('Actual SalePrice (standardized)')
plt.ylabel('Predicted SalePrice (standardized)')
plt.title('Predicted vs Actual Values');