# Load Data and import Dependencies

In [None]:
!pip install ucimlrepo
!pip install xlrd

In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ucimlrepo import fetch_ucirepo

from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Download the concrete compressive strength dataset (UCI repository id 165)
concrete_compressive_strength = fetch_ucirepo(id=165)

# Features and targets, provided as pandas dataframes
X = concrete_compressive_strength.data.features
y = concrete_compressive_strength.data.targets

# Print the dataset-level metadata, followed by the variable information
print(concrete_compressive_strength.metadata)
print(concrete_compressive_strength.variables)

In [None]:
# Preview only the first rows of the feature frame instead of dumping it whole
X.head()

In [None]:
# Preview only the first rows of the target frame instead of dumping it whole
y.head()

In [None]:
# Import data from the local Excel copy (path is relative to the notebook's
# working directory). This `df` is the frame used by the rest of the notebook.
df = pd.read_excel('data/Concrete_Data.xls')

# Preview the first rows rather than rendering the whole frame
df.head()

In [None]:
# Trim leading/trailing whitespace from every column header so the rename
# mappings in the following cells can match on clean names.
df.columns = df.columns.str.strip()

In [None]:
# Renaming columns with direct mapping.
# Reassign instead of using inplace=True so re-running the cell is harmless and
# the data lineage stays explicit. Headers that do not match a key exactly are
# left unchanged (DataFrame.rename ignores missing keys by default); the
# regex-based rename in the next cell picks up any that remain.
df = df.rename(columns={
    'Cement (component 1)(kg in a m^3 mixture)': 'Cement',
    'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': 'Blast Furnace Slag',
    'Fly Ash (component 3)(kg in a m^3 mixture)': 'Fly Ash',
    'Water (component 4)(kg in a m^3 mixture)': 'Water',
    'Superplasticizer (component 5)(kg in a m^3 mixture)': 'Superplasticizer',
    'Coarse Aggregate (component 6)(kg in a m^3 mixture)': 'Coarse Aggregate',
    'Fine Aggregate (component 7)(kg in a m^3 mixture)': 'Fine Aggregate',
    'Age(day)': 'Age',
    'Concrete compressive strength(MPa, megapascals)': 'Concrete compressive strength'
})

In [None]:
# Renaming columns with regex.
# Each header is replaced by the short name of the FIRST pattern (in the
# insertion order below) that matches it; headers matching no pattern are kept
# as they are. `re` is imported with the other dependencies at the top.
rename_dict = {
    r'.*Cement.*': 'Cement',
    r'.*Blast Furnace Slag.*': 'Blast Furnace Slag',
    r'.*Fly Ash.*': 'Fly Ash',
    r'.*Water.*': 'Water',
    r'.*Superplasticizer.*': 'Superplasticizer',
    r'.*Coarse Aggregate.*': 'Coarse Aggregate',
    r'.*Fine Aggregate.*': 'Fine Aggregate',
    r'.*Age.*': 'Age',
    r'.*Concrete compressive strength.*': 'Concrete compressive strength'
}

# Compile every pattern once instead of re-parsing it for each column
compiled_renames = [(re.compile(pattern), short_name) for pattern, short_name in rename_dict.items()]

df.columns = [
    next((short_name for pattern, short_name in compiled_renames if pattern.match(col)), col)
    for col in df.columns
]

In [None]:
# Check the renamed headers on a short preview instead of the full frame
df.head()

# Data Analysis and Visualization

In [None]:
# Count missing values per column; a full boolean frame cannot be inspected by eye
df.isnull().sum()

In [None]:
# Scatter every mixture feature against the target on a 3x3 grid of axes.
target_col = 'Concrete compressive strength'
feature_cols = [
    'Cement', 'Blast Furnace Slag', 'Fly Ash',            # row 0
    'Water', 'Superplasticizer', 'Coarse Aggregate',      # row 1
    'Fine Aggregate', 'Age',                              # row 2
]

# Create a figure with 3x3 subplots, setting the overall figure size to 18x15 inches
fig, axes = plt.subplots(3, 3, figsize=(18, 15))

# Set the main title for the entire figure
fig.suptitle('Understanding Concrete Compressive Strength - 3 x 3 grid of scatter plots')

# Fill the grid row by row; as before, strength is on the x-axis and the
# feature on the y-axis. Column names are passed so seaborn labels the axes.
panels = axes.ravel()
for ax, feature in zip(panels, feature_cols):
    sns.scatterplot(ax=ax, data=df, x=target_col, y=feature)

# Only eight features exist, so hide the unused ninth panel
for ax in panels[len(feature_cols):]:
    ax.set_visible(False)

In [None]:
# Every column except the target is treated as a feature
features = df.drop(columns=['Concrete compressive strength'])

In [None]:
# Pairwise correlation matrix of the feature columns
corr = features.corr()

# Render the matrix as a colour-graded table using the diverging
# 'coolwarm' colourmap
corr_styler = corr.style
corr_styler.background_gradient(cmap='coolwarm')

# Split your dataset

## Feature Engineering and Cleaning

In [None]:
# Drop rows containing missing values. DataFrame.dropna returns a NEW frame,
# so the result has to be assigned back for the cleaning to take effect.
df = df.dropna()

In [None]:
# Show the (rows, columns) size of the working frame
df.shape

In [None]:
# Model inputs: every column except the target (X and features are the same object)
features = df.drop(columns=['Concrete compressive strength'])
X = features

# Model output: the compressive strength column (y and target are the same object)
target = df['Concrete compressive strength']
y = target

### 80 - 20 split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model Selection & Engineering

In [None]:
# Baseline: ordinary least squares with no regularisation
linear = LinearRegression()
linear.fit(X_train, y_train)

# score() on a scikit-learn regressor returns the coefficient of determination
# (R^2), not a classification accuracy, so label it accordingly.
print(f"R^2 score on training {linear.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {linear.score(X_test, y_test):.4f}")

In [None]:
# Model Selection: ridge regression with alpha left at the estimator's default
ridge = Ridge(max_iter=1_000_000)
ridge.fit(X=X_train, y=y_train)

In [None]:
# `ridge` was already fitted in the previous cell, so it is only scored here.
# score() on a regressor returns R^2, not an accuracy.
print(f"R^2 score on training {ridge.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {ridge.score(X_test, y_test):.4f}")

In [None]:
# Model Selection - best fit for ridge
# Strong regularisation (alpha=100).
ridge100 = Ridge(alpha=100, max_iter=1000000)
ridge100.fit(X_train, y_train)

# The stray `ridge.fit(...)` copy-pasted here refitted the wrong model and has
# been dropped. score() on a regressor returns R^2, not an accuracy.
print(f"R^2 score on training {ridge100.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {ridge100.score(X_test, y_test):.4f}")

In [None]:
# Model Selection
# Very weak regularisation (alpha=0.0001).
ridge0001 = Ridge(alpha=0.0001, max_iter=1000000)
ridge0001.fit(X_train, y_train)

# The stray `ridge.fit(...)` copy-pasted here refitted the wrong model and has
# been dropped. score() on a regressor returns R^2, not an accuracy.
print(f"R^2 score on training {ridge0001.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {ridge0001.score(X_test, y_test):.4f}")

#### Ridge Regularization Impact vs LinearRegression

In [None]:
# Overlay the coefficient vectors of the three ridge fits and the plain linear
# fit; each entry is (coefficients, marker, legend label), drawn in this order.
ridge_series = (
    (ridge.coef_, 'v', "ridge Coefficient"),
    (ridge100.coef_, '^', "Ridge100 Coefficient"),
    (ridge0001.coef_, 'o', "Ridge0001 Coefficient"),
    (linear.coef_, 's', "Linear Coefficient"),
)
for coefs, marker, legend_text in ridge_series:
    plt.plot(coefs, marker, label=legend_text)

# Horizontal reference line at zero across the coefficient indices
plt.hlines(0, 0, len(linear.coef_))
plt.ylabel("Coefficient Magnitude")
plt.xlabel("Coefficient Index")
plt.ylim(-1, 1)
plt.legend()

In [None]:
# Lasso with alpha left at the estimator's default. Lasso is already imported
# with the other dependencies at the top, so it is not re-imported here.
lasso = Lasso(max_iter=1000000)

lasso.fit(X_train, y_train)
# score() on a regressor returns R^2, not an accuracy.
print(f"R^2 score on training {lasso.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {lasso.score(X_test, y_test):.4f}")
# Count the coefficients that were not shrunk to exactly zero
print(f"Number of features {np.sum(lasso.coef_ != 0)}")

In [None]:
# Lasso0001 - Best fit for lasso
# Very weak regularisation (alpha=0.0001).
lasso0001 = Lasso(alpha=0.0001, max_iter=1000000)
lasso0001.fit(X_train, y_train)
# score() on a regressor returns R^2, not an accuracy.
print(f"R^2 score on training {lasso0001.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {lasso0001.score(X_test, y_test):.4f}")
# Count the coefficients that were not shrunk to exactly zero
print(f"Number of features {np.sum(lasso0001.coef_ != 0)}")

In [None]:
# Lasso with stronger regularisation (alpha=10)
lasso10 = Lasso(alpha=10, max_iter=1000000)

lasso10.fit(X_train, y_train)
# score() on a regressor returns R^2, not an accuracy.
print(f"R^2 score on training {lasso10.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {lasso10.score(X_test, y_test):.4f}")
# Count the coefficients that were not shrunk to exactly zero
print(f"Number of features {np.sum(lasso10.coef_ != 0)}")

In [None]:
# Lasso100 - lowest
# Strongest regularisation tried here (alpha=100).
lasso100 = Lasso(alpha=100, max_iter=1000000)

lasso100.fit(X_train, y_train)
# score() on a regressor returns R^2, not an accuracy.
print(f"R^2 score on training {lasso100.score(X_train, y_train):.4f}")
print(f"R^2 score on testing {lasso100.score(X_test, y_test):.4f}")
# Count the coefficients that were not shrunk to exactly zero
print(f"Number of features {np.sum(lasso100.coef_ != 0)}")

#### Lasso Regularization Impact vs LinearRegression

In [None]:
# Overlay the coefficient vectors of the four lasso fits and the plain linear fit.
# The default-alpha lasso was previously mislabelled "ridge Coefficient" in the
# legend, and lasso10 / lasso100 shared the same marker; both are fixed here.
plt.plot(lasso.coef_, 'v', label="Lasso Coefficient")
plt.plot(lasso0001.coef_, '^', label="Lasso0001 Coefficient")
plt.plot(lasso10.coef_, 'o', label="Lasso10 Coefficient")
plt.plot(lasso100.coef_, 'D', label="Lasso100 Coefficient")

plt.plot(linear.coef_, 's', label="Linear Coefficient")
# Horizontal reference line at zero across the coefficient indices
plt.hlines(0, 0, len(linear.coef_))
plt.ylabel("Coefficient Magnitude")
plt.xlabel("Coefficient Index")
plt.ylim(-1, 1)
plt.legend()

# Predictions using best fit model

In [None]:
# Test-set predictions of the plain linear model, wrapped in a Series
y_pred = linear.predict(X=X_test)
results_linear = pd.Series(data=y_pred)
results_linear

In [None]:
# Test-set predictions of the alpha=100 ridge model, wrapped in a Series
y_pred_ridge = ridge100.predict(X=X_test)
results_ridge = pd.Series(data=y_pred_ridge)
results_ridge

In [None]:
# Test-set predictions of the alpha=0.0001 lasso model, wrapped in a Series
y_pred_lasso = lasso0001.predict(X=X_test)
results_lasso = pd.Series(data=y_pred_lasso)
results_lasso

In [None]:
# Put the three models' test-set predictions side by side, one column per model
df_graphs = pd.DataFrame({'Linear_pred': results_linear, 'Lasso_pred': results_lasso, 'Ridge_pred': results_ridge })

# Running total of each model's predictions over the test rows
fd = df_graphs.cumsum()

# Draw onto explicitly created axes. The previous plt.figure() followed by
# fd.plot() left an empty extra figure, because DataFrame.plot() opens its own
# figure when no `ax` is given.
fig, ax = plt.subplots()
fd.plot(ax=ax)
ax.set_title("Cumulative sum of test-set predictions by model")
ax.set_xlabel("Test row (positional index)")
ax.set_ylabel("Cumulative predicted value")
plt.show()