# Linear Regression (Ridge)

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.feature_selection import SelectKBest, f_regression
import warnings

# Silence UserWarning only; other warning categories are still shown.
warnings.filterwarnings("ignore", category=UserWarning)

# 1. Load and explore data
data = pd.read_csv('gt_2015.csv')

# Basic EDA
print("Data Shape:", data.shape)
print("\nFirst 5 rows:\n", data.head())
print("\nData Types:\n", data.dtypes)
print("\nMissing Values:\n", data.isnull().sum())

# 2. Data Cleaning
# Handle missing values: forward-fill first, then back-fill whatever is still
# missing at the top of a column. DataFrame.fillna(method=...) is deprecated
# since pandas 2.1, so the dedicated ffill()/bfill() methods are used instead.
if data.isnull().any().any():
    data = data.ffill().bfill()

# Outlier treatment using IQR
# `data` is re-assigned inside the loop, so each column's quartiles are computed
# on the rows that survived the filters of the previous columns. Every numeric
# column is filtered, which includes the NOX target.
numeric_cols = data.select_dtypes(include=np.number).columns
for col in numeric_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    data = data[(data[col] >= Q1 - 1.5*IQR) & (data[col] <= Q3 + 1.5*IQR)]

# 3. Feature Engineering
X = data.drop(columns=['NOX'])
y = data['NOX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transform target variable
# The transformer is fitted on the training target only and then re-used for
# the test target and for mapping predictions back to the original scale.
pt = PowerTransformer(method='yeo-johnson')
y_train_trans = pt.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_trans = pt.transform(y_test.values.reshape(-1, 1)).flatten()

# Polynomial features (degree 2, no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Feature selection: keep the 20 expanded features with the best f_regression score
selector = SelectKBest(score_func=f_regression, k=20)
X_train_selected = selector.fit_transform(X_train_poly, y_train_trans)
X_test_selected = selector.transform(X_test_poly)

# 4. Model Optimization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Ridge regression with hyperparameter tuning
# NOTE(review): the polynomial expansion, selector and scaler above are fitted on
# the whole training split before the grid search, so the 5 CV folds share those
# fitted transformers. Consider wrapping the steps in a Pipeline if unbiased
# cross-validation scores matter.
ridge_params = {'alpha': np.logspace(-3, 3, 20)}
ridge_grid = GridSearchCV(Ridge(), ridge_params, cv=5, scoring='r2')
ridge_grid.fit(X_train_scaled, y_train_trans)

# Make predictions
# The model predicts in the transformed target space; map the predictions back
# to the original NOX scale before scoring them against y_test.
best_ridge = ridge_grid.best_estimator_
y_pred_ridge = pt.inverse_transform(best_ridge.predict(X_test_scaled).reshape(-1, 1)).flatten()

# Calculate metrics
r2 = r2_score(y_test, y_pred_ridge)
mse = mean_squared_error(y_test, y_pred_ridge)
mae = mean_absolute_error(y_test, y_pred_ridge)
rmse = np.sqrt(mse)

print("\nRidge Regression Results:")
print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Best Alpha: {ridge_grid.best_params_['alpha']:.4f}")


Data Shape: (7384, 11)

First 5 rows:
         AT      AP      AH    AFDP    GTEP     TIT     TAT     TEY     CDP  \
0  1.95320  1020.1  84.985  2.5304  20.116  1048.7  544.92  116.27  10.799   
1  1.21910  1020.1  87.523  2.3937  18.584  1045.5  548.50  109.18  10.347   
2  0.94915  1022.2  78.335  2.7789  22.264  1068.8  549.95  125.88  11.256   
3  1.00750  1021.7  76.942  2.8170  23.358  1075.2  549.63  132.21  11.702   
4  1.28580  1021.6  76.732  2.8377  23.483  1076.2  549.68  133.58  11.737   

       CO      NOX  
0  7.4491  113.250  
1  6.4684  112.020  
2  3.6335   88.147  
3  3.1972   87.078  
4  2.3833   82.515  

Data Types:
 AT      float64
AP      float64
AH      float64
AFDP    float64
GTEP    float64
TIT     float64
TAT     float64
TEY     float64
CDP     float64
CO      float64
NOX     float64
dtype: object

Missing Values:
 AT      0
AP      0
AH      0
AFDP    0
GTEP    0
TIT     0
TAT     0
TEY     0
CDP     0
CO      0
NOX     0
dtype: int64

Ridge Regression Res

# Decision Tree Regression

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.feature_selection import SelectKBest, f_regression
import warnings

# Silence UserWarning only; other warning categories are still shown.
warnings.filterwarnings("ignore", category=UserWarning)

# 1. Load and explore data
data = pd.read_csv('gt_2015.csv')

# 2. Data Cleaning
# Handle missing values: forward-fill first, then back-fill whatever is still
# missing at the top of a column. DataFrame.fillna(method=...) is deprecated
# since pandas 2.1, so the dedicated ffill()/bfill() methods are used instead.
if data.isnull().any().any():
    data = data.ffill().bfill()

# Outlier treatment using IQR
# `data` is re-assigned inside the loop, so each column's quartiles are computed
# on the rows that survived the filters of the previous columns. Every numeric
# column is filtered, which includes the NOX target.
numeric_cols = data.select_dtypes(include=np.number).columns
for col in numeric_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    data = data[(data[col] >= Q1 - 1.5*IQR) & (data[col] <= Q3 + 1.5*IQR)]

# 3. Feature Engineering
X = data.drop(columns=['NOX'])
y = data['NOX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transform skewed target variable
# The transformer is fitted on the training target only and then re-used for
# the test target and for mapping predictions back to the original scale.
pt = PowerTransformer(method='yeo-johnson')
y_train_trans = pt.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_trans = pt.transform(y_test.values.reshape(-1, 1)).flatten()

# Polynomial features (degree 2, no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Feature selection: keep the 20 expanded features with the best f_regression score
selector = SelectKBest(score_func=f_regression, k=20)
X_train_selected = selector.fit_transform(X_train_poly, y_train_trans)
X_test_selected = selector.transform(X_test_poly)

# 4. Decision Tree Model with Hyperparameter Tuning
# Define parameter grid (4 * 3 * 3 * 2 = 72 candidate combinations)
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize GridSearch
# NOTE(review): the polynomial expansion and selector above are fitted on the
# whole training split before the grid search, so the 5 CV folds share those
# fitted transformers.
dt_reg = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=dt_reg,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train_selected, y_train_trans)

# Best model evaluation
best_dt = grid_search.best_estimator_
y_pred_trans = best_dt.predict(X_test_selected)

# Inverse transform predictions so the metrics are reported on the original NOX scale
y_pred = pt.inverse_transform(y_pred_trans.reshape(-1, 1)).flatten()

# Calculate metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("\nDecision Tree Regression Results:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


Decision Tree Regression Results:
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10}
R² Score: 0.7587
MSE: 15.4632
RMSE: 3.9323
MAE: 2.9716


In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
import warnings

# Silence UserWarning only; other warning categories are still shown.
warnings.filterwarnings("ignore", category=UserWarning)

# 1. Load and preprocess data
data = pd.read_csv('gt_2015.csv')

# Data cleaning
# Forward-fill first, then back-fill whatever is still missing at the top of a
# column. DataFrame.fillna(method=...) is deprecated since pandas 2.1, so the
# dedicated ffill()/bfill() methods are used instead.
if data.isnull().any().any():
    data = data.ffill().bfill()

# IQR-based outlier removal. `data` is re-assigned inside the loop, so each
# column's quartiles are computed on the rows that survived the filters of the
# previous columns. Every numeric column is filtered, which includes the NOX target.
numeric_cols = data.select_dtypes(include=np.number).columns
for col in numeric_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    data = data[(data[col] >= Q1 - 1.5*IQR) & (data[col] <= Q3 + 1.5*IQR)]

# Feature engineering
X = data.drop(columns=['NOX'])
y = data['NOX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Advanced feature engineering: degree-3 polynomial expansion, then keep the
# 30 expanded features with the best f_regression score. The target is used
# untransformed in this cell.
poly = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

selector = SelectKBest(score_func=f_regression, k=30)
X_train_selected = selector.fit_transform(X_train_poly, y_train)
X_test_selected = selector.transform(X_test_poly)

# 2. Hyperparameter tuning with Random Forest
# 2 * 2 * 2 * 2 * 1 * 1 = 16 candidate combinations
param_grid = {
    'n_estimators': [300, 400],
    'max_depth': [20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'bootstrap': [False]
}

# NOTE(review): the polynomial expansion and selector above are fitted on the
# whole training split before the grid search, so the 5 CV folds share those
# fitted transformers.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_selected, y_train)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_selected)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best Parameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 400}
R² Score: 0.8688
MSE: 8.4047
RMSE: 2.8991
MAE: 2.1587


# Support Vector Regression (SVR)

In [67]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the raw data. Unlike the earlier cells, no missing-value handling or IQR
# outlier filtering is applied here, so the metrics below are computed on a
# different set of rows than the other models.
df = pd.read_csv('gt_2015.csv')

# Select the target by name instead of by position so the cell does not depend
# on NOX being the last column of the CSV (and matches the other cells).
X = df.drop(columns=['NOX']).values
y = df['NOX'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Separate scalers for features and target; both are fitted on the training
# split only.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
# StandardScaler expects 2-D input, hence reshape(-1, 1); ravel() restores 1-D.
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()

svr = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr.fit(X_train_scaled, y_train_scaled)

# Predictions come out in the scaled target space; map them back to the
# original NOX scale before computing the metrics.
y_pred_scaled = svr.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
# y_test was never scaled, so it is already on the original scale.
y_test_original = y_test

mse = mean_squared_error(y_test_original, y_pred)
mae = mean_absolute_error(y_test_original, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_original, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f"RMSE: {rmse}")
print(f'R-squared: {r2}')

Mean Squared Error: 8.503843596829473
Mean Absolute Error: 1.8997867661823016
RMSE: 2.916135044340278
R-squared: 0.9333157447350111
