In [3]:
import pandas as pd

# Load the dataset to inspect it
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)

# Normalizing the column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Selecting only the relevant columns
selected_columns = ['ram', 'storage', 'screen', 'final_price']
df_selected = df[selected_columns]

# Check for missing values
missing_values = df_selected.isnull().sum()

missing_values


ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [5]:
# Calculating the median for the 'ram' column
ram_median = df_selected['ram'].median()

ram_median


16.0

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Define a variable for the seed value
seed_value = 42

# Shuffle the dataset using the seed value
df_shuffled = df_selected.sample(frac=1, random_state=seed_value).reset_index(drop=True)

# Split the data into 60% train, 20% val, 20% test using the same seed
train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=seed_value)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed_value)


# Fill missing values
# Option 1: Fill missing 'screen' values with 0
train_df_fill_0 = train_df.copy()
val_df_fill_0 = val_df.copy()
train_df_fill_0['screen'] = train_df_fill_0['screen'].fillna(0)
val_df_fill_0['screen'] = val_df_fill_0['screen'].fillna(0)

# Option 2: Fill missing 'screen' values with the mean of 'screen' from training data
mean_screen_train = train_df['screen'].mean()
train_df_fill_mean = train_df.copy()
val_df_fill_mean = val_df.copy()
train_df_fill_mean['screen'] = train_df_fill_mean['screen'].fillna(mean_screen_train)
val_df_fill_mean['screen'] = val_df_fill_mean['screen'].fillna(mean_screen_train)

# Features and target
features = ['ram', 'storage', 'screen']
target = 'final_price'

# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Train linear regression model for Option 1 (fill 0)
lr_fill_0 = LinearRegression()
lr_fill_0.fit(train_df_fill_0[features], train_df_fill_0[target])

# Predict on validation set for Option 1
val_predictions_fill_0 = lr_fill_0.predict(val_df_fill_0[features])

# Calculate RMSE for Option 1
rmse_fill_0 = rmse(val_df_fill_0[target], val_predictions_fill_0)

# Train linear regression model for Option 2 (fill mean)
lr_fill_mean = LinearRegression()
lr_fill_mean.fit(train_df_fill_mean[features], train_df_fill_mean[target])

# Predict on validation set for Option 2
val_predictions_fill_mean = lr_fill_mean.predict(val_df_fill_mean[features])

# Calculate RMSE for Option 2
rmse_fill_mean = rmse(val_df_fill_mean[target], val_predictions_fill_mean)

# Print the RMSE values for both options
print(f"RMSE (fill 0): {rmse_fill_0}")
print(f"RMSE (fill mean): {rmse_fill_mean}")


RMSE (fill 0): 675.0844456600812
RMSE (fill mean): 675.1574065408337


In [7]:
# For fun going to have a look at a variety of other random seeds, to see how that impacts using fill 0 or fill mean

# Re-import 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset to reinitialize it
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)

# Normalizing the column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Selecting only the relevant columns
df_selected = df[['ram', 'storage', 'screen', 'final_price']]

# Re-run the code to calculate RMSE for multiple seed values
# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Store results for different seed values
seed_values = range(1, 50, 2)  # A couple of dozen random seeds
results = []

# Loop over different seed values
for seed_value in seed_values:
    # Shuffle the dataset using the current seed value
    df_shuffled = df_selected.sample(frac=1, random_state=seed_value).reset_index(drop=True)

    # Split the data into 60% train, 20% val, 20% test using the same seed
    train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=seed_value)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed_value)

    # Fill missing values
    # Option 1: Fill missing 'screen' values with 0
    train_df_fill_0 = train_df.copy()
    val_df_fill_0 = val_df.copy()
    train_df_fill_0['screen'] = train_df_fill_0['screen'].fillna(0)
    val_df_fill_0['screen'] = val_df_fill_0['screen'].fillna(0)

    # Option 2: Fill missing 'screen' values with the mean of 'screen' from training data
    mean_screen_train = train_df['screen'].mean()
    train_df_fill_mean = train_df.copy()
    val_df_fill_mean = val_df.copy()
    train_df_fill_mean['screen'] = train_df_fill_mean['screen'].fillna(mean_screen_train)
    val_df_fill_mean['screen'] = val_df_fill_mean['screen'].fillna(mean_screen_train)

    # Features and target
    features = ['ram', 'storage', 'screen']
    target = 'final_price'

    # Train linear regression model for Option 1 (fill 0)
    lr_fill_0 = LinearRegression()
    lr_fill_0.fit(train_df_fill_0[features], train_df_fill_0[target])

    # Predict on validation set for Option 1
    val_predictions_fill_0 = lr_fill_0.predict(val_df_fill_0[features])

    # Calculate RMSE for Option 1
    rmse_fill_0 = rmse(val_df_fill_0[target], val_predictions_fill_0)

    # Train linear regression model for Option 2 (fill mean)
    lr_fill_mean = LinearRegression()
    lr_fill_mean.fit(train_df_fill_mean[features], train_df_fill_mean[target])

    # Predict on validation set for Option 2
    val_predictions_fill_mean = lr_fill_mean.predict(val_df_fill_mean[features])

    # Calculate RMSE for Option 2
    rmse_fill_mean = rmse(val_df_fill_mean[target], val_predictions_fill_mean)

    # Append the results for this seed value
    results.append((seed_value, rmse_fill_0, rmse_fill_mean))

# Convert results to a dataframe for easier visualization
results_df = pd.DataFrame(results, columns=['Seed', 'RMSE (Fill 0)', 'RMSE (Fill Mean)'])

# Add an additional column to indicate which RMSE value was smaller for each seed
results_df['Smaller RMSE'] = np.where(results_df['RMSE (Fill 0)'] < results_df['RMSE (Fill Mean)'], 'Fill 0', 'Fill Mean')

# Display the dataframe with the new column
print(results_df)



    Seed  RMSE (Fill 0)  RMSE (Fill Mean) Smaller RMSE
0      1     618.609576        618.553720    Fill Mean
1      3     627.303952        628.254618       Fill 0
2      5     558.340740        558.370248       Fill 0
3      7     562.837982        564.586930       Fill 0
4      9     564.537017        565.981336       Fill 0
5     11     569.376639        571.365598       Fill 0
6     13     604.098106        604.103566       Fill 0
7     15     656.937306        657.396853       Fill 0
8     17     550.107579        550.485878       Fill 0
9     19     585.399182        585.639629       Fill 0
10    21     593.489307        593.604221       Fill 0
11    23     636.888549        638.034440       Fill 0
12    25     607.753345        607.575297    Fill Mean
13    27     619.017633        621.268841       Fill 0
14    29     536.492130        539.331065       Fill 0
15    31     586.137853        586.131145    Fill Mean
16    33     587.942269        589.150009       Fill 0
17    35  

In [9]:
from sklearn.linear_model import Ridge

# List of regularization strengths (r values)
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Store results for different r values
ridge_results = []

# Loop over different r values
for r in r_values:
    # Train Ridge regression model (regularized linear regression) with current r value
    ridge_model = Ridge(alpha=r)
    ridge_model.fit(train_df_fill_0[features], train_df_fill_0[target])

    # Predict on validation set
    val_predictions_ridge = ridge_model.predict(val_df_fill_0[features])

    # Calculate RMSE for the Ridge model
    rmse_ridge = rmse(val_df_fill_0[target], val_predictions_ridge)

    # Append the results for this r value
    ridge_results.append((r, round(rmse_ridge, 2)))

# Convert results to a dataframe for easier visualization
ridge_results_df = pd.DataFrame(ridge_results, columns=['r', 'RMSE'])

# Display the results
ridge_results_df


Unnamed: 0,r,RMSE
0,0.0,581.28
1,0.01,581.28
2,0.1,581.28
3,1.0,581.28
4,5.0,581.29
5,10.0,581.3
6,100.0,581.43


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Load the dataset to reinitialize it
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)

# Normalize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Select relevant columns
df_selected = df[['ram', 'storage', 'screen', 'final_price']]

# Initialize variables
seeds = range(10)  # Seeds from 0 to 9
rmse_scores = []

# Loop through different seeds
for seed in seeds:
    # Shuffle the dataset with the current seed
    df_shuffled = df_selected.sample(frac=1, random_state=seed).reset_index(drop=True)
    
    # Split the data into 60% train, 20% val, 20% test
    train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

    # Fill missing 'screen' values with 0
    train_df['screen'] = train_df['screen'].fillna(0)
    val_df['screen'] = val_df['screen'].fillna(0)

    # Features and target
    features = ['ram', 'storage', 'screen']
    target = 'final_price'

    # Train linear regression model
    lr = LinearRegression()
    lr.fit(train_df[features], train_df[target])

    # Predict on validation set
    val_predictions = lr.predict(val_df[features])

    # Calculate RMSE and store the score
    score = np.sqrt(mean_squared_error(val_df[target], val_predictions))
    rmse_scores.append(score)

# Calculate the standard deviation of the RMSE scores
std_rmse = np.std(rmse_scores)

# Round the standard deviation to 3 decimal places
std_rmse_rounded = round(std_rmse, 3)

print(f"Standard Deviation of RMSE: {std_rmse_rounded}")


Standard Deviation of RMSE: 23.888


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset to reinitialize it
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)

# Normalize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Select relevant columns
df_selected = df[['ram', 'storage', 'screen', 'final_price']]

# Initialize variables
seeds = range(10)  # Seeds from 0 to 9
rmse_scores = []

# Loop through different seeds
for seed in seeds:
    # Shuffle the dataset with the current seed
    df_shuffled = df_selected.sample(frac=1, random_state=seed).reset_index(drop=True)
    
    # Split the data into 60% train, 20% val, 20% test
    train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

    # Fill missing 'screen' values with 0
    train_df['screen'] = train_df['screen'].fillna(0)
    val_df['screen'] = val_df['screen'].fillna(0)

    # Features and target
    features = ['ram', 'storage', 'screen']  # Exclude 'final_price' from features
    target = 'final_price'

    # Train linear regression model
    lr = LinearRegression()
    lr.fit(train_df[features], train_df[target])

    # Predict on validation set
    val_predictions = lr.predict(val_df[features])

    # Calculate RMSE and store the score
    score = np.sqrt(mean_squared_error(val_df[target], val_predictions))
    rmse_scores.append(score)

# Calculate the standard deviation of the RMSE scores
std_rmse = np.std(rmse_scores)

# Round the standard deviation to 3 decimal places
std_rmse_rounded = round(std_rmse, 3)

# Print RMSE scores and standard deviation
print(f"RMSE scores: {rmse_scores}")
print(f"Standard Deviation of RMSE: {std_rmse_rounded}")

RMSE scores: [614.8454332579494, 618.6095758134323, 597.3567548889578, 627.3039515368521, 575.7847095764524, 558.3407402558894, 595.6817784944298, 562.8379817455949, 575.4870096345064, 564.5370166730077]
Standard Deviation of RMSE: 23.888


In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset to reinitialize it
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)

# Normalize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Select relevant columns
df_selected = df[['ram', 'storage', 'screen', 'final_price']]

# Initialize variables
seeds = range(10)  # Seeds from 0 to 9
rmse_scores = []

# Loop through different seeds
for seed in seeds:
    # Shuffle the dataset with the current seed
    df_shuffled = df_selected.sample(frac=1, random_state=seed).reset_index(drop=True)
    
    # Split the data into 60% train, 20% val, 20% test
    train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

    # Fill missing 'screen' values with 0
    train_df['screen'] = train_df['screen'].fillna(0)
    val_df['screen'] = val_df['screen'].fillna(0)

    # Features and target
    features = ['ram', 'storage', 'screen']  # Exclude 'final_price' from features
    target = 'final_price'

    # Print the shape of each set
    print(f"Seed: {seed}")
    print(f"Training set size: {train_df.shape}, Validation set size: {val_df.shape}")

    # Train linear regression model
    lr = LinearRegression()
    lr.fit(train_df[features], train_df[target])

    # Predict on validation set
    val_predictions = lr.predict(val_df[features])

    # Calculate RMSE and store the score
    score = np.sqrt(mean_squared_error(val_df[target], val_predictions))
    rmse_scores.append(score)

    # Print RMSE for the current seed
    print(f"RMSE for seed {seed}: {score}")

# Calculate the standard deviation of the RMSE scores
std_rmse = np.std(rmse_scores)

# Round the standard deviation to 3 decimal places
std_rmse_rounded = round(std_rmse, 3)

# Print RMSE scores and standard deviation
print(f"RMSE scores: {rmse_scores}")
print(f"Standard Deviation of RMSE: {std_rmse_rounded}")


Seed: 0
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 0: 614.8454332579494
Seed: 1
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 1: 618.6095758134323
Seed: 2
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 2: 597.3567548889578
Seed: 3
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 3: 627.3039515368521
Seed: 4
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 4: 575.7847095764524
Seed: 5
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 5: 558.3407402558894
Seed: 6
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 6: 595.6817784944298
Seed: 7
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 7: 562.8379817455949
Seed: 8
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE for seed 8: 575.4870096345064
Seed: 9
Training set size: (1296, 4), Validation set size: (432, 4)
RMSE 

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


# Load the dataset to reinitialize it
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)

# Normalize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Select relevant columns
df_selected = df[['ram', 'storage', 'screen', 'final_price']]

# Use seed 9 for splitting the dataset
seed = 9

# Shuffle the dataset with the current seed
df_shuffled = df_selected.sample(frac=1, random_state=seed).reset_index(drop=True)

# Split the data into 60% train, 20% val, 20% test
train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

# Combine train and validation datasets
combined_df = pd.concat([train_df, val_df])

# Fill missing 'screen' values with 0
combined_df['screen'] = combined_df['screen'].fillna(0)
test_df['screen'] = test_df['screen'].fillna(0)

# Features and target
features = ['ram', 'storage', 'screen']
target = 'final_price'

# Train Ridge regression model with regularization parameter r=0.001
ridge_model = Ridge(alpha=0.001)
ridge_model.fit(combined_df[features], combined_df[target])

# Predict on test set
test_predictions = ridge_model.predict(test_df[features])

# Calculate RMSE on the test dataset
test_rmse = np.sqrt(mean_squared_error(test_df[target], test_predictions))

# Round the RMSE to 3 decimal places
test_rmse_rounded = round(test_rmse, 3)

print(f"RMSE on the test dataset: {test_rmse_rounded}")


RMSE on the test dataset: 552.856
