In [None]:
import pandas as pd

# Read the raw laptop listings and normalise every header to snake_case
# (lower-case, spaces replaced by underscores) in one chained expression.
df = pd.read_csv('laptops.csv').rename(
    columns=lambda name: name.lower().replace(' ', '_')
)


In [None]:
# Restrict the frame to the three feature columns and the target column.
selected_columns = ['ram', 'storage', 'screen', 'final_price']
df = df[selected_columns]


In [None]:
# Number of missing entries in each remaining column (shown as the cell output).
df.isna().sum()


In [None]:
# Median of the 'ram' column over the whole dataset (shown as the cell output).
ram_median = df['ram'].median()
ram_median


In [None]:
from sklearn.model_selection import train_test_split

# One seed drives the shuffle and both splits.
SPLIT_SEED = 42

# Shuffle all rows once, keeping the result under the same name used downstream.
df = df.sample(frac=1, random_state=SPLIT_SEED)

# First carve off 20 % of the rows as the test set ...
df_train_val, df_test = train_test_split(
    df, test_size=0.2, random_state=SPLIT_SEED
)

# ... then take 25 % of the remaining 80 % (0.25 * 0.8 = 0.2 of the total) as
# validation, which leaves 60 % for training.
df_train, df_val = train_test_split(
    df_train_val, test_size=0.25, random_state=SPLIT_SEED
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

def train_and_evaluate(df_train, df_val, fill_value, fill_columns=('storage',)):
    """Fit a plain linear regression and return its RMSE on the validation set.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation frames. Each must contain the target column
        ``final_price``; every other column is used as a feature.
    fill_value : scalar
        Value substituted for missing entries of ``fill_columns`` in both frames.
    fill_columns : str or iterable of str, default ('storage',)
        Feature column(s) whose missing values are replaced by ``fill_value``.
        The default reproduces the original behaviour of imputing only
        ``'storage'``.

    Returns
    -------
    float
        Root mean squared error of the model's predictions on ``df_val``.

    Raises
    ------
    ValueError
        If any feature column still contains missing values after imputation.
    """
    target = 'final_price'
    columns_to_fill = [fill_columns] if isinstance(fill_columns, str) else list(fill_columns)

    # drop() returns new frames, so the imputation below never touches the
    # DataFrames owned by the caller.
    X_train = df_train.drop(columns=[target])
    X_val = df_val.drop(columns=[target])
    y_train = df_train[target]
    y_val = df_val[target]

    # Impute the requested columns with the same value in both splits.
    for column in columns_to_fill:
        X_train[column] = X_train[column].fillna(fill_value)
        X_val[column] = X_val[column].fillna(fill_value)

    # scikit-learn's linear models refuse NaN input; fail here instead, with a
    # message that names the columns that still need imputing.
    for split_name, features in (('training', X_train), ('validation', X_val)):
        nan_columns = features.columns[features.isnull().any()].tolist()
        if nan_columns:
            raise ValueError(
                f"{split_name} features still contain missing values in "
                f"{nan_columns}; add them to fill_columns or impute them first"
            )

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse

# Compare two ways of filling missing 'storage' values on the validation set.
# The mean is computed from the training split only.
mean_storage = df_train['storage'].mean()

# Option 1: fill with 0
rmse_0 = train_and_evaluate(df_train, df_val, fill_value=0)

# Option 2: fill with the training mean
rmse_mean = train_and_evaluate(df_train, df_val, fill_value=mean_storage)

print(f"RMSE with 0: {rmse_0:.2f}")
print(f"RMSE with mean: {rmse_mean:.2f}")


In [None]:
from sklearn.linear_model import Ridge

def train_and_evaluate_ridge(df_train, df_val, fill_value, r, fill_columns=('storage',)):
    """Fit a ridge regression and return its RMSE on the validation set.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation frames. Each must contain the target column
        ``final_price``; every other column is used as a feature.
    fill_value : scalar
        Value substituted for missing entries of ``fill_columns`` in both frames.
    r : float
        Regularisation strength, passed to ``Ridge`` as ``alpha``.
        NOTE(review): the scikit-learn documentation advises against
        ``alpha=0`` with ``Ridge`` for numerical reasons -- confirm that the
        ``r=0`` case is intended rather than ``LinearRegression``.
    fill_columns : str or iterable of str, default ('storage',)
        Feature column(s) whose missing values are replaced by ``fill_value``.
        The default reproduces the original behaviour of imputing only
        ``'storage'``.

    Returns
    -------
    float
        Root mean squared error of the model's predictions on ``df_val``.

    Raises
    ------
    ValueError
        If any feature column still contains missing values after imputation.
    """
    target = 'final_price'
    columns_to_fill = [fill_columns] if isinstance(fill_columns, str) else list(fill_columns)

    # drop() returns new frames, so the imputation below never touches the
    # DataFrames owned by the caller.
    X_train = df_train.drop(columns=[target])
    X_val = df_val.drop(columns=[target])
    y_train = df_train[target]
    y_val = df_val[target]

    # Impute the requested columns with the same value in both splits.
    for column in columns_to_fill:
        X_train[column] = X_train[column].fillna(fill_value)
        X_val[column] = X_val[column].fillna(fill_value)

    # scikit-learn's linear models refuse NaN input; fail here instead, with a
    # message that names the columns that still need imputing.
    for split_name, features in (('training', X_train), ('validation', X_val)):
        nan_columns = features.columns[features.isnull().any()].tolist()
        if nan_columns:
            raise ValueError(
                f"{split_name} features still contain missing values in "
                f"{nan_columns}; add them to fill_columns or impute them first"
            )

    model = Ridge(alpha=r)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse

# Sweep the regularisation strength, imputing missing 'storage' values with 0,
# and record the validation RMSE obtained for each setting.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

rmse_results = {}
for r in r_values:
    rmse_results[r] = train_and_evaluate_ridge(df_train, df_val, fill_value=0, r=r)

# Report the scores in the same order the values were tried.
for r, rmse in rmse_results.items():
    print(f"RMSE for r={r}: {rmse:.2f}")


In [None]:
# Repeat the 60/20/20 split with ten different seeds (0-9) and measure how much
# the validation RMSE of the zero-imputed linear model varies between splits.
seeds = range(10)
rmse_scores = []

for seed in seeds:
    # NOTE: these assignments rebind the notebook-level df_train / df_val /
    # df_test, so once the loop finishes they hold the split for the last
    # seed (9), not the seed-42 split created earlier in the notebook.
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
    df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=seed)
    rmse = train_and_evaluate(df_train, df_val, 0)
    rmse_scores.append(rmse)

# np.std defaults to ddof=0, i.e. the population standard deviation.
std_rmse = np.std(rmse_scores)
print(f"Standard deviation of RMSE: {std_rmse:.3f}")


In [None]:
# Final evaluation: fit on train + validation and score once on the test set.
#
# The split is made explicitly with seed 9 rather than relying on the
# df_train / df_val / df_test variables left behind by the seed-stability loop
# above (whose last iteration also uses seed 9). The cell therefore gives the
# same result no matter which other cells ran before it.
FINAL_SPLIT_SEED = 9

# The 80 % side of this split holds exactly the rows that a further
# train/validation split would divide up, so it is the combined
# train + validation set.
df_full_train, df_test = train_test_split(
    df, test_size=0.2, random_state=FINAL_SPLIT_SEED
)

rmse_test = train_and_evaluate_ridge(df_full_train, df_test, 0, 0.001)
print(f"RMSE on test set: {rmse_test:.2f}")
