In [2]:
# Data manipulation and analysis
import pandas as pd

# Statistical data visualization
import seaborn as sns

# Numerical operations
import numpy as np

# Plotting and visualizations
import matplotlib.pyplot as plt

# Statistical functions and tests
from scipy import stats

# Specific functions for normal distribution and skewness
from scipy.stats import norm, skew

# Data scaling methods
from sklearn.preprocessing import RobustScaler, StandardScaler

# Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Splitting data and hyperparameter tuning
from sklearn.model_selection import train_test_split, GridSearchCV
# Model evaluation metric
from sklearn.metrics import mean_squared_error

# Base classes for building custom transformers and models
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# Gradient boosting model for regression and classification
import xgboost as xgb

# Blanket-silence every warning category for the rest of the session, then
# report which Python interpreter this kernel is running on.
import sys
import warnings

warnings.simplefilter('ignore')

print(sys.executable)


c:\Users\Hakan Bilgisayar\Desktop\pyt\data\venv\Scripts\python.exe


In [8]:
# Labels assigned to the eight columns; the file is read without a header row.
column_name = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model Year",
    "Origin",
]

# "?" entries become NaN. Everything after a tab on a line is discarded as a
# comment (presumably the trailing car-name field -- confirm against the file).
# Fields are space-separated, with extra spaces after a delimiter skipped.
data = pd.read_csv(
    "auto-mpg.data",
    names=column_name,
    na_values="?",
    comment="\t",
    sep=" ",
    skipinitialspace=True,
)

In [10]:
# Expose the MPG column under the generic label "target" for the modelling steps.
data = data.rename(columns={"MPG": "target"})

In [11]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   target        398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB


In [None]:
#DATA AUGMENTATION WITH GAUSSIAN NOISE
import numpy as np
import pandas as pd

def add_gaussian_noise(df, noise_level=0.01, random_state=42, columns=None):
    """Return a copy of ``df`` with zero-mean Gaussian noise added to numeric columns.

    For every perturbed column the noise standard deviation is
    ``noise_level * df[col].std()``, so the perturbation scales with the
    column's own spread. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to augment.
    noise_level : float, default 0.01
        Noise standard deviation as a fraction of each column's standard
        deviation. Must be non-negative.
    random_state : int or None, default 42
        Seed for a private ``numpy.random.RandomState``. NumPy's global
        random state is left untouched, so calling this function does not
        change the random numbers drawn elsewhere in the notebook.
    columns : str or iterable of column labels, optional
        Numeric columns to perturb. Defaults to every numeric column, which
        includes integer-coded categorical columns; pass an explicit list to
        keep such columns (or the target) exact.

    Returns
    -------
    pandas.DataFrame
        Noisy copy of ``df``. Perturbed integer columns come back as float;
        non-numeric and unselected columns are returned unchanged.

    Raises
    ------
    ValueError
        If ``noise_level`` is negative, or ``columns`` names something that
        is not a numeric column of ``df``.
    """
    if noise_level < 0:
        raise ValueError("noise_level must be non-negative")

    # A private legacy generator yields the same stream that
    # np.random.seed(random_state) + np.random.normal would, without
    # reseeding the process-wide generator as a hidden side effect.
    rng = np.random.RandomState(random_state)

    noisy_df = df.copy()
    numeric_cols = list(noisy_df.select_dtypes(include=[np.number]).columns)

    if columns is not None:
        requested = [columns] if isinstance(columns, str) else list(columns)
        not_numeric = [c for c in requested if c not in numeric_cols]
        if not_numeric:
            raise ValueError(
                f"columns must be numeric columns of df; got {not_numeric!r}"
            )
        # Keep the frame's own column order so results do not depend on the
        # order in which the caller listed the columns.
        numeric_cols = [c for c in numeric_cols if c in requested]

    n_rows = noisy_df.shape[0]
    for col in numeric_cols:
        std = noisy_df[col].std()
        if not np.isfinite(std):
            # Fewer than two non-null values: the spread is undefined, and a
            # NaN noise scale would overwrite the whole column with NaN.
            continue
        noise = rng.normal(0, noise_level * std, size=n_rows)
        noisy_df[col] = noisy_df[col] + noise

    return noisy_df


In [None]:
# Build a noisy copy of the dataset (noise std = 2% of each numeric column's std).
augmented_data = add_gaussian_noise(data, noise_level=0.02)

# Stack the original rows on top of the noisy rows under a fresh 0..n-1 index.
# NOTE(review): if a train/test split is done after this step, noisy twins of
# test rows may land in the training set -- consider augmenting only the
# training split.
combined_data = pd.concat((data, augmented_data), axis=0, ignore_index=True)

# Report the shapes; the combined frame should hold twice the original rows.
print("Original:", data.shape)
print("Augmented:", augmented_data.shape)
print("Combined:", combined_data.shape)


Original: (398, 8)
Augmented: (398, 8)
Combined: (796, 8)


In [14]:
# Count the missing values in each column of the combined frame.
combined_data.isna().sum()

target           0
Cylinders        0
Displacement     0
Horsepower      12
Weight           0
Acceleration     0
Model Year       0
Origin           0
dtype: int64