# Data Loading and Preprocessing

In [5]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

In [6]:
# Fetch dataset id 165 from the UCI ML Repository (requires network access)
concrete = fetch_ucirepo(id=165)

# Pull the feature table and the target table out of the fetched bundle
X = concrete.data.features
y = concrete.data.targets

# Summarise the size and column layout of the feature table
n_samples, n_features = X.shape
print("Dataset Information:")
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
print("\nFeature names:", list(X.columns))

# Dump the repository-provided metadata record as-is
print("\nMetadata:")
print(concrete.metadata)

Dataset Information:
Number of samples: 1030
Number of features: 8

Feature names: ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer', 'Coarse Aggregate', 'Fine Aggregate', 'Age']

Metadata:
{'uci_id': 165, 'name': 'Concrete Compressive Strength', 'repository_url': 'https://archive.ics.uci.edu/dataset/165/concrete+compressive+strength', 'data_url': 'https://archive.ics.uci.edu/static/public/165/data.csv', 'abstract': 'Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. ', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 1030, 'num_features': 8, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Concrete compressive strength'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Sun Feb 11 2024', 'dataset_doi': '10.2443

In [7]:
# Print a preview of the features followed by summary statistics for the
# features and the target; each table is preceded by its heading line.
for heading, table in (
    ("First few rows of the dataset:", X.head()),
    ("\nBasic statistics of features:", X.describe()),
    ("\nTarget variable statistics:", y.describe()),
):
    print(heading)
    print(table)

First few rows of the dataset:
   Cement  Blast Furnace Slag  Fly Ash  Water  Superplasticizer  \
0   540.0                 0.0      0.0  162.0               2.5   
1   540.0                 0.0      0.0  162.0               2.5   
2   332.5               142.5      0.0  228.0               0.0   
3   332.5               142.5      0.0  228.0               0.0   
4   198.6               132.4      0.0  192.0               0.0   

   Coarse Aggregate  Fine Aggregate  Age  
0            1040.0           676.0   28  
1            1055.0           676.0   28  
2             932.0           594.0  270  
3             932.0           594.0  365  
4             978.4           825.5  360  

Basic statistics of features:
            Cement  Blast Furnace Slag      Fly Ash        Water  \
count  1030.000000         1030.000000  1030.000000  1030.000000   
mean    281.167864           73.895825    54.188350   181.567282   
std     104.506364           86.279342    63.997004    21.354219   
min  

In [8]:
# Import preprocessing packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib  

# Split the raw data first (70% train, 30% test). The scaler must not see the
# held-out rows: fitting it on the full dataset leaks test-set statistics
# (mean / variance) into the training features and into the saved scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training split only, then reuse that fitted transform
# for the test split. The results are wrapped back into DataFrames so the
# column names (and the row labels of each split) are preserved.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Kept so that any later cell referring to `X_scaled` still works: the full
# feature table expressed in the training-fitted scale.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (721, 8)
Testing set shape: (309, 8)


In [9]:
import os
# Output location for the persisted artifacts; make sure the folder exists
# before anything is written into it.
data_path = '../data/'
os.makedirs('../data', exist_ok=True)

# Persist the fitted scaler so the same transform can be reloaded later
scaler_file = f'{data_path}scaler.joblib'
joblib.dump(scaler, scaler_file)

# Bundle the train/test splits into one .npz archive, keyed by split name
splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
np.savez(f'{data_path}processed_data.npz', **splits)

print("Data and scaler have been saved successfully!")

Data and scaler have been saved successfully!
