In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# import necessary libraries
import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the dataset. The CSV carries its exported row index as an 'Unnamed: 0'
# column; drop it so it is neither used as a model feature nor makes every row
# look unique to duplicated()/drop_duplicates(). errors='ignore' keeps the cell
# working if the file is re-exported without that column.
df = pd.read_csv('data/Concrete Compressive Strength.csv')
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age (day),Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


# Data Cleaning and Preprocessing

In [None]:
df.describe()

In [None]:
df.info()

# Checking for Missing values

In [None]:
df.isnull().sum()

In [None]:
# Visualizing missing values
missing_values = df.isnull().sum()

# Bar plot with one bar per column: x is the column name, y is the number of
# missing entries in that column.
plt.figure(figsize=(12,6))
sns.barplot(x=missing_values.index, y=missing_values.values, color="red")
plt.xticks(rotation=90)
plt.title("Missing values in Each column")
plt.ylabel("Number of Missing values")
plt.xlabel("columns")
plt.show()

# Outlier Detection

In [None]:
# One boxplot per column, laid out on a grid that is three plots wide.
cols = df.columns
n_grid_cols = 3
# Ceiling division: exactly enough rows for all columns, with no empty trailing
# row when the number of columns is a multiple of three.
n_grid_rows = -(-len(cols) // n_grid_cols)
plt.figure(figsize=(16, 20))

for i, col in enumerate(cols):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(y=df[col])
    plt.title(f"Boxplot for {col}")

plt.tight_layout()
plt.show()

In [None]:
pip install ydata-profiling

# EDA

In [None]:
# EDA using pandas profiling
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
profile.to_notebook_iframe()

#### From the observations above: there are no missing values, but outliers and duplicate rows are present, so both need to be handled.

# Correlation Analysis

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='0.2f',
    annot_kws={'size': 15},
    linewidths=2,
    linecolor='white',
    ax=ax,
)
plt.show()

# Insights from correlation matrix
1. Cement and concrete compressive strength: the correlation is 0.50, a moderate positive correlation.
2. Fly ash and superplasticizer: the correlation is 0.38, a moderate positive correlation.
3. Cement and water: the correlation is -0.08, a very weak negative correlation, suggesting there is no significant relationship between the amounts of water and cement used in the concrete mix.
4. Water and superplasticizer: the correlation is -0.66, a strong negative correlation, suggesting that mixes with more superplasticizer tend to use less water.


<!-- indi -->

In [None]:
# Pair plot
# Save through the returned PairGrid *before* plt.show(): once the figure has
# been shown, a later plt.savefig() would write out a new, empty figure.
pair_grid = sns.pairplot(df, diag_kind='kde')
pair_grid.savefig('pairplot.png')
plt.show()

# Feature Engineering

<!--  -->

In [None]:
from scipy import stats

def handle_outliers(df, threshold=5):
    """Split ``df`` into rows without and rows with z-score outliers.

    A row counts as an outlier when, in at least one numeric column, the
    absolute z-score (population standard deviation, ddof=0) of its value is
    greater than ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only numeric columns are tested; every column is kept in
        both returned frames.
    threshold : float, default 5
        Absolute z-score above which a value is treated as an outlier.

    Returns
    -------
    (df_no_outliers, outliers) : tuple of pandas.DataFrame
        Two row subsets of ``df`` that together contain every row exactly once
        and keep the original index labels.
    """
    numeric = df.select_dtypes(include='number')

    # Column-wise z-scores. Missing values stay NaN and a constant column gives
    # 0/0 = NaN; NaN never compares greater than the threshold, so neither is flagged.
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)

    # Filter whole rows with a single mask. Building the result column by column
    # would align every column to the first column's filtered index, silently
    # hiding outliers found in the other columns.
    is_outlier_row = (z_scores.abs() > threshold).any(axis=1)

    df_no_outliers = df[~is_outlier_row]
    outliers = df[is_outlier_row]
    return df_no_outliers, outliers


In [None]:
df_no_outliers, outliers = handle_outliers(df, threshold=3)
# Check for outliers
if outliers.empty:
    print("No outliers detected.")
else:
    print("Outliers detected in the following columns:")
    print(outliers)

In [None]:
# Plot boxplots of the outlier-filtered data so the figure matches its title.
# (Concatenating the removed outliers back onto the filtered frame would just
# re-plot the data the filter started from.)
plt.figure(figsize=(15, 8))
sns.boxplot(data=df_no_outliers)
plt.title("Boxplots of All Columns (with Outliers Removed)")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# remove outlier using quartiles
def handle_outliers_iqr(df, column_name, threshold=1.5):
    """Split ``df`` on IQR-based outliers of a single column.

    A row is an outlier when its ``column_name`` value lies below
    ``Q1 - threshold * IQR`` or above ``Q3 + threshold * IQR``.

    Returns
    -------
    (df_no_outliers, outliers) : tuple of pandas.DataFrame
        The rows inside the fences and the rows outside them.
    """
    values = df[column_name]

    # Quartiles and the fences derived from their spread.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - threshold * spread
    high_fence = q3 + threshold * spread

    # True for every row whose value falls outside the fences.
    flagged = (values < low_fence) | (values > high_fence)

    return df[~flagged], df[flagged]


In [None]:
# Apply the handle_outliers_iqr function to remove outliers from the 'Age (day)' column
df_no_outliers_age, outliers_age = handle_outliers_iqr(df, 'Age (day)', threshold=3)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize box plot before outlier removal
plt.figure(figsize=(8, 6))
sns.boxplot(data=df['Age (day)'])
plt.title("Boxplot of 'Age (day)' Column Before Outlier Removal")
plt.show()

# Apply the handle_outliers_iqr function to remove outliers from the 'Age (day)' column
# NOTE(review): this reassigns df_no_outliers_age / outliers_age, which the
# previous cell computed with threshold=3; here threshold=1 is used, so the
# earlier result is discarded. Confirm which threshold is intended.
# df itself is left untouched: the function returns filtered copies.
df_no_outliers_age, outliers_age = handle_outliers_iqr(df, 'Age (day)', threshold=1)

# Visualize box plot after outlier removal
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_no_outliers_age['Age (day)'])
plt.title("Boxplot of 'Age (day)' Column After Outlier Removal")
plt.show()


In [None]:
# Dropping duplicate values
df.duplicated().sum()

In [None]:
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

In [None]:
def remove_outliers(X, y, threshold=1.5):
    """Drop the rows of ``X`` (and matching entries of ``y``) that contain an IQR outlier.

    For every column of ``X`` the fences ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR`` are computed; a row is kept only if each of its
    non-missing values lies within the fences of its column.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D numpy.ndarray
        Numeric feature matrix.
    y : pandas.Series or numpy.ndarray
        Target values, one per row of ``X``.
    threshold : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    (X_kept, y_kept)
        ``X`` and ``y`` restricted to the rows that were kept.
    """
    values = np.asarray(X, dtype=float)

    # nanpercentile ignores missing values; plain percentile would return NaN
    # fences for any column containing a NaN and every row would then be dropped.
    q1 = np.nanpercentile(values, 25, axis=0)
    q3 = np.nanpercentile(values, 75, axis=0)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr

    within_fences = (values >= lower_bound) & (values <= upper_bound)
    # A missing value is not an outlier; leave it for a later imputation step
    # instead of discarding the whole row here.
    keep = (within_fences | np.isnan(values)).all(axis=1)

    return X[keep], y[keep]

In [None]:
# Dependent and Independent Variables
X = df.iloc[:, :-1]
y= df.iloc[:, -1]

In [None]:
X, y = remove_outliers(X, y)

In [None]:
X

In [None]:
y

In [None]:
# segreating numerical variables
# numerical_cols = X.select_dtypes(exclude='object').columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Importing necessary library
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Numerical Pipelines
   
    
num_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),

    ]
        )
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])

In [None]:
#  Train _test split
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30, random_state=30)

In [None]:
# Fit the preprocessor on the training split only, then apply it to both splits.
# The original row index is passed through so the transformed feature frames
# stay label-aligned with y_train / y_test (which keep their index from the split).
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
pip install xgboost

In [None]:
# Training Models
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# Fixed seed for the estimators that use randomness, so repeated runs give the
# same fitted models and scores (same value as the train/test split above).
RANDOM_STATE = 30

# Candidate regressors, keyed by the name used in the log output.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'DecisionTree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostRegressor(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'XGBoost': xgb.XGBRegressor(random_state=RANDOM_STATE)
}

# Hyperparameter grid searched for each model; keys must match `models`.
# An empty grid means the estimator is fitted once with its defaults.
param_grids = {
    'LinearRegression': {},
    'Lasso': {'alpha': [0.01, 0.1, 1, 10, 100]},
    'Ridge': {'alpha': [0.01, 0.1, 1, 10, 100]},
    'ElasticNet': {
        'alpha': [0.01, 0.1, 1, 10, 100],
        'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0]},
    'DecisionTree': {'max_depth': [None, 5, 10, 20]},
    'RandomForest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10, 20]},
    'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    'GradientBoosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 10]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 10]}
}

def evaluate_model(true, predicted):
    """Score predictions against the true targets.

    Returns
    -------
    (mae, rmse, r2_square) : tuple of float
        Mean absolute error, root mean squared error and R² score.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE itself is not returned.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

# Train and evaluate models with hyperparameter tuning.
# The winning model is chosen by its cross-validated R2 on the training data
# ('cv_score'). The held-out test split is used only to report metrics
# ('performance'), so the reported test score is not biased by having been
# used for the selection itself.
best_model_info = {'model': None, 'params': None, 'performance': None, 'cv_score': None}

for name, model in models.items():
    print(f"Training {name}...")
    param_grid = param_grids[name]
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())

    # best_estimator_ is refitted on the whole training split with the best parameters.
    best_model = grid_search.best_estimator_
    cv_r2 = grid_search.best_score_
    y_pred = best_model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"{name} Model Performance")
    print("CV R2 score", cv_r2 * 100)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square * 100)
    print('=' * 35)
    print('\n')

    if best_model_info['cv_score'] is None or cv_r2 > best_model_info['cv_score']:
        best_model_info['model'] = best_model
        best_model_info['params'] = grid_search.best_params_
        best_model_info['performance'] = r2_square
        best_model_info['cv_score'] = cv_r2

# Print the best model and its performance
best_model_name = type(best_model_info['model']).__name__
print(f"Best Model: {best_model_name}")
print("Best Parameters:", best_model_info['params'])
print("Best CV R2 Score:", best_model_info['cv_score'] * 100)
print("Best R2 Score:", best_model_info['performance'] * 100)


In [45]:
# pickle is not imported anywhere else in the notebook; without this import the
# cell fails with NameError on a fresh kernel.
import pickle

# Save the preprocessor
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

# Save the best model
with open('model.pkl', 'wb') as file:
    pickle.dump(best_model_info['model'], file)