In [2]:

# Import necessary libraries for data analysis and modeling
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv"
df = pd.read_csv(url)

# Check for missing values in specific columns
missing_values = df[['RAM', 'Storage', 'Screen', 'Final Price']].isnull().sum()
print("Missing Values:\n", missing_values)

# Compute the median for RAM
median_ram = df['RAM'].median()
print("\nMedian RAM:", median_ram)

# Filling NAs with 0 and mean for comparison
fill_0 = df.fillna(0)
fill_mean = df.fillna(df.mean(numeric_only=True))
print("\nFilling NAs - With 0:", fill_0[['RAM', 'Storage', 'Screen', 'Final Price']].isnull().sum())
print("Filling NAs - With Mean:", fill_mean[['RAM', 'Storage', 'Screen', 'Final Price']].isnull().sum())

# Preprocessing: Selecting relevant features for the regression model
df_cleaned = df[['RAM', 'Storage', 'Screen', 'Final Price']].dropna()

# Splitting data into training and testing sets
X = df_cleaned[['RAM', 'Storage', 'Screen']]
y = df_cleaned['Final Price']

# Split the data into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define different regularization strengths for Ridge regression
alpha_values = [0, 0.01, 1, 10, 100]
rmse_train = []
rmse_test = []

# Loop through each regularization strength, train the model and calculate RMSE
for alpha in alpha_values:
    # Train the Ridge regression model
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    
    # Make predictions for both training and testing sets
    y_train_pred = ridge_model.predict(X_train)
    y_test_pred = ridge_model.predict(X_test)
    
    # Calculate RMSE for both sets
    rmse_train.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    rmse_test.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))

# Calculate the spread of RMSE and identify specific test RMSE values
rmse_spread = np.ptp(rmse_train)

# Print results
print("\nRMSE on Training set for different alphas:", rmse_train)
print("RMSE on Test set for different alphas:", rmse_test)
print("\nSpread of RMSE:", rmse_spread)


Missing Values:
 RAM            0
Storage        0
Screen         4
Final Price    0
dtype: int64

Median RAM: 16.0

Filling NAs - With 0: RAM            0
Storage        0
Screen         0
Final Price    0
dtype: int64
Filling NAs - With Mean: RAM            0
Storage        0
Screen         0
Final Price    0
dtype: int64

RMSE on Training set for different alphas: [np.float64(586.2867599144708), np.float64(586.286759914526), np.float64(586.2867604665585), np.float64(586.2868146541759), np.float64(586.2917966986677)]
RMSE on Test set for different alphas: [np.float64(609.1751638164043), np.float64(609.1751329303195), np.float64(609.1720771214733), np.float64(609.1444697583147), np.float64(608.8845137833619)]

Spread of RMSE: 0.005036784196818189
