In [None]:
import re
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    MinMaxScaler,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler,
)


In [None]:
# Raw dataset location; the CSV is read with ISO-8859-1 encoding.
file_path = 'laptop_price (1).csv'
df = pd.read_csv(filepath_or_buffer=file_path, encoding='ISO-8859-1')

# First look at the data: column dtypes / non-null counts, then the leading rows
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [None]:
# Drop Product (as recommended) and turn Ram / Weight into plain numbers by
# stripping their 'GB' / 'kg' unit suffixes.
df = df.drop(columns=['Product']).assign(
    Ram=lambda frame: frame['Ram'].str.replace('GB', '').astype(int),
    Weight=lambda frame: frame['Weight'].str.replace('kg', '').astype(float),
)


# Split Memory into four capacity columns: SSD, HDD, Flash Storage and Hybrid
def parse_memory(memory: str, gb_per_tb: int = 1000) -> pd.Series:
    """Parse a storage description into per-type capacities in gigabytes.

    The description is split on '+' and each part is attributed to the first
    storage type it mentions (checked in the order SSD, HDD, Flash Storage,
    Hybrid). Capacities of the same type are summed.

    Sizes given in terabytes (e.g. '1TB HDD', '1.0TB Hybrid') are converted
    to gigabytes; a number without a unit is taken as gigabytes. Parts that
    contain no number or no known storage type contribute nothing.

    Args:
        memory: Raw description, e.g. '128GB SSD +  1TB HDD'.
        gb_per_tb: Gigabytes counted per terabyte (decimal 1000 by default).

    Returns:
        A Series of four ints: [ssd, hdd, flash_storage, hybrid] in GB.
    """
    # Insertion order doubles as the matching priority and the output order.
    capacities = {'SSD': 0, 'HDD': 0, 'Flash Storage': 0, 'Hybrid': 0}
    for part in memory.split('+'):
        part = part.strip()
        # Capture the size together with its unit so that terabytes are not
        # silently read as gigabytes ('1TB' must not become 1).
        size = re.search(r'(\d+(?:\.\d+)?)\s*(TB|GB)?', part, flags=re.IGNORECASE)
        if size is None:
            continue
        gigabytes = float(size.group(1))
        unit = size.group(2)
        if unit is not None and unit.upper() == 'TB':
            gigabytes *= gb_per_tb
        for storage_type in capacities:
            if storage_type in part:
                capacities[storage_type] += int(round(gigabytes))
                break
    return pd.Series(list(capacities.values()))


# Expand Memory into one numeric column per storage type, then drop the raw text
memory_columns = [
    'Memory_SSD',
    'Memory_HDD',
    'Memory_Flash_Storage',
    'Memory_Hybrid',
]
memory_parts = df['Memory'].apply(parse_memory)
df[memory_columns] = memory_parts
df = df.drop(columns=['Memory'])


# Split ScreenResolution into several columns
def parse_screen_resolution(resolution: str) -> pd.Series:
    """Extract pixel resolution, touchscreen flag and panel type from a screen description.

    Args:
        resolution: Raw description, e.g. 'IPS Panel Retina Display 2560x1600'.

    Returns:
        A Series [resolution, touchscreen, panel] where resolution is the first
        'WIDTHxHEIGHT' token found (or 'Unknown'), touchscreen is 1 or 0, and
        panel is 'IPS' when the text contains 'IPS Panel', otherwise 'Standard'.
    """
    has_ips = 'IPS Panel' in resolution
    touch_flag = int('Touchscreen' in resolution)
    # The panel marker is removed before searching for the pixel dimensions.
    text = resolution.replace('IPS Panel', '').strip() if has_ips else resolution
    found = re.search(r'\d+x\d+', text)
    pixels = 'Unknown' if found is None else found.group(0)
    return pd.Series([pixels, touch_flag, 'IPS' if has_ips else 'Standard'])


# Replace the raw ScreenResolution text with its three parsed components
screen_columns = ['Resolution', 'Touchscreen', 'Panel_Type']
screen_parts = df['ScreenResolution'].apply(parse_screen_resolution)
df[screen_columns] = screen_parts
df = df.drop(columns=['ScreenResolution'])


def parse_cpu(cpu: str) -> pd.Series:
    """Split a CPU description into manufacturer, model and clock speed.

    Args:
        cpu: Raw description, e.g. 'Intel Core i5 7200U 2.5GHz'.

    Returns:
        A Series [manufacturer, model, ghz]: manufacturer is 'Intel', 'AMD' or
        'Other'; model is everything after the first word with the
        '<number>GHz' token removed; ghz is the clock as a float, or None when
        the text contains no 'GHz'.
    """
    manufacturer = next(
        (brand for brand in ('Intel', 'AMD') if brand in cpu),
        'Other',
    )

    # Model = the text AFTER the first word (the vendor), minus the clock token
    words_after_vendor = cpu.split()[1:]
    model = re.sub(r'\s\d+\.?\d*GHz', '', ' '.join(words_after_vendor))

    if 'GHz' in cpu:
        clock_token = re.search(r'[\d.]+GHz', cpu).group()
        ghz = float(clock_token.replace('GHz', ''))
    else:
        ghz = None

    return pd.Series([manufacturer, model, ghz])


# Replace the raw Cpu text with its parsed components
cpu_columns = ['Cpu_Manufacturer', 'Cpu_Model', 'Cpu_Performance']
df[cpu_columns] = df['Cpu'].apply(parse_cpu)
df = df.drop(columns=['Cpu'])


def _gpu_vendor(gpu: str) -> str:
    """Return the first of 'Nvidia', 'Intel', 'AMD' found in the text, else 'Other'."""
    for vendor in ('Nvidia', 'Intel', 'AMD'):
        if vendor in gpu:
            return vendor
    return 'Other'


# Only the GPU vendor is kept; the full GPU description is dropped
df['Gpu_Manufacturer'] = df['Gpu'].apply(_gpu_vendor)
df = df.drop(columns=['Gpu'])

In [None]:
# laptop_ID is removed from the feature table (presumably a row identifier — verify)
df = df.drop(columns=['laptop_ID'])

# Text columns that are replaced by integer category codes.
# NOTE(review): the encoder is fitted on the full table, before the train/test
# split, and the resulting codes impose an arbitrary order on nominal values —
# confirm this is acceptable for the linear models used below.
ordinal_cols = [
    'Company',
    'TypeName',
    'Resolution',
    'Panel_Type',
    'Cpu_Manufacturer',
    'Cpu_Model',
    'Gpu_Manufacturer',
    'OpSys',
]
encoder = OrdinalEncoder()
encoder.fit(df[ordinal_cols])
df[ordinal_cols] = encoder.transform(df[ordinal_cols])

In [None]:
target = 'Price_euros'

# Independent copies of the table: one for MinMax scaling, one for standard scaling
df_mm, df_ss = df.copy(), df.copy()

# Features / target for the unscaled, MinMax and standard-scaled variants
X, y = df.drop(columns=[target]), df[target]
X_mm, y_mm = df_mm.drop(columns=[target]), df_mm[target]
X_ss, y_ss = df_ss.drop(columns=[target]), df_ss[target]

In [None]:
# The same split settings are used for every variant
split_options = {'test_size': 0.2, 'random_state': 42}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train_mm, X_test_mm, y_train_mm, y_test_mm = train_test_split(
    X_mm, y_mm, **split_options
)
X_train_ss, X_test_ss, y_train_ss, y_test_ss = train_test_split(
    X_ss, y_ss, **split_options
)

# MinMax scaling: fitted on the training rows only, then applied to both parts
mm_scaler = MinMaxScaler().fit(X_train_mm)
X_train_mm = mm_scaler.transform(X_train_mm)
X_test_mm = mm_scaler.transform(X_test_mm)

# Standard scaling: fitted on the training rows only, then applied to both parts
ss_scaler = StandardScaler().fit(X_train_ss)
X_train_ss = ss_scaler.transform(X_train_ss)
X_test_ss = ss_scaler.transform(X_test_ss)


In [None]:
def evaluate_model(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
) -> Tuple[float, float]:
    """Fit ``model`` on the training data and report RMSE on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        ``(rmse_train, rmse_test)``.
    """
    model.fit(X_train, y_train)
    # Same metric for both splits, train first, then test
    rmse_by_split = [
        np.sqrt(mean_squared_error(y_true, model.predict(features)))
        for features, y_true in ((X_train, y_train), (X_test, y_test))
    ]
    return rmse_by_split[0], rmse_by_split[1]

In [None]:
# One entry per scaling variant: its label plus the (X_train, X_test, y_train, y_test) split
datasets = [
    {'scaller': scaler_label, 'data': split}
    for scaler_label, split in (
        ('default', (X_train, X_test, y_train, y_test)),
        ('min_max', (X_train_mm, X_test_mm, y_train_mm, y_test_mm)),
        ('standart', (X_train_ss, X_test_ss, y_train_ss, y_test_ss)),
    )
]


In [None]:
# Collected metrics: one dict per (model type, scaling variant)
results = []

# Plain linear regression on every scaling variant
for dataset in datasets:
    # Loop-local names on purpose: unpacking into X_train / X_test / y_train /
    # y_test would rebind the notebook-level unscaled split, so re-running the
    # cell that builds `datasets` would silently pick up scaled data.
    ds_X_train, ds_X_test, ds_y_train, ds_y_test = dataset['data']
    rmse_train, rmse_test = evaluate_model(
        LinearRegression(),
        ds_X_train,
        ds_X_test,
        ds_y_train,
        ds_y_test,
    )

    results.append(
        {
            'type': 'default',
            'scaller': dataset['scaller'],
            'rmse': (rmse_train, rmse_test),
        }
    )

In [None]:
# Lasso and Ridge regressions over a small grid of regularisation strengths
for dataset in datasets:
    # Loop-local names on purpose: unpacking into X_train / X_test / y_train /
    # y_test would rebind the notebook-level unscaled split.
    ds_X_train, ds_X_test, ds_y_train, ds_y_test = dataset['data']
    for alpha in [0.01, 0.1, 1, 10]:
        # Lasso first, then Ridge, so the order of `results` entries is stable
        for label, model in (
            ('Lasso', Lasso(alpha=alpha, max_iter=10000)),
            ('Ridge', Ridge(alpha=alpha)),
        ):
            rmse_train, rmse_test = evaluate_model(
                model,
                ds_X_train,
                ds_X_test,
                ds_y_train,
                ds_y_test,
            )
            results.append(
                {
                    'type': f'{label} (alpha = {alpha})',
                    'scaller': dataset['scaller'],
                    'rmse': (rmse_train, rmse_test),
                }
            )

In [None]:
# Polynomial regression: linear regression on degree-2 and degree-3 feature expansions
for dataset in datasets:
    # Loop-local names on purpose: unpacking into X_train / X_test / y_train /
    # y_test would rebind the notebook-level unscaled split.
    ds_X_train, ds_X_test, ds_y_train, ds_y_test = dataset['data']
    for degree in [2, 3]:
        poly = PolynomialFeatures(degree=degree)
        # The expansion is fitted on the training rows and reused for the test rows
        X_train_poly = poly.fit_transform(ds_X_train)
        X_test_poly = poly.transform(ds_X_test)
        rmse_train, rmse_test = evaluate_model(
            LinearRegression(),
            X_train_poly,
            X_test_poly,
            ds_y_train,
            ds_y_test,
        )
        results.append(
            {
                'type': f'Polynomial (degree = {degree})',
                'scaller': dataset['scaller'],
                'rmse': (rmse_train, rmse_test),
            }
        )


In [None]:
# Top 5 runs, ranked by test RMSE (second element of the 'rmse' pair), best first
sorted_results = list(results)
sorted_results.sort(key=lambda entry: entry['rmse'][1])
sorted_results[:5]

[{'type': 'Polynomial (degree = 2)',
  'scaller': 'min_max',
  'rmse': (np.float64(255.86950421537782), np.float64(309.891680189273))},
 {'type': 'Polynomial (degree = 2)',
  'scaller': 'default',
  'rmse': (np.float64(255.86950421537782), np.float64(335.41220079654613))},
 {'type': 'Lassp (alpha = 0.1)',
  'scaller': 'default',
  'rmse': (np.float64(357.7106019598161), np.float64(395.27288276357774))},
 {'type': 'Lassp (alpha = 0.01)',
  'scaller': 'default',
  'rmse': (np.float64(357.7094572156646), np.float64(395.2811633158302))},
 {'type': 'default',
  'scaller': 'default',
  'rmse': (np.float64(357.7094456519461), np.float64(395.2821948742746))}]

In [None]:
# The three worst runs by test RMSE, worst first (most likely overfitting)
list(reversed(sorted_results[-3:]))

[{'type': 'Polynomial (degree = 3)',
  'scaller': 'min_max',
  'rmse': (np.float64(107.31148765067896), np.float64(1921796156866.4182))},
 {'type': 'Polynomial (degree = 2)',
  'scaller': 'standart',
  'rmse': (np.float64(255.86970578408494), np.float64(228844926156.9895))},
 {'type': 'Polynomial (degree = 3)',
  'scaller': 'default',
  'rmse': (np.float64(107.31883058693037), np.float64(34964878.25135511))}]

### Выводы по применению различных регрессий, нормализации данных и выбору гиперпараметров

Наилучший алгоритм — **полиномиальная регрессия 2-й степени** с **MinMax**-нормализацией: она даёт наименьшую ошибку на тестовой выборке (RMSE Test: 309.89).