# Importing Basic Dependencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read 'laptop_data.csv' (path relative to the working directory) into a
# DataFrame and preview its first rows.
df = pd.read_csv('laptop_data.csv')
df.head()

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Keep only the columns used in the analysis; this removes the leftover
# index-like column (presumably 'Unnamed: 0' from the CSV export).
# .copy() makes the selection an independent frame, so the column assignments
# made in later cells do not trigger pandas' SettingWithCopyWarning.

df = df[['Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']].copy()
df.head()

In [None]:
# Count missing (null) values in each column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.

df.duplicated().sum()

In [None]:
# Print the column dtypes and non-null counts of the frame.
df.info()

In [None]:
# Bind a second name to the same DataFrame object (no copy is made), so column
# assignments made through laptop_data are also visible through df.
laptop_data = df

In [None]:
# Convert 'Ram' and 'Weight' to numerical values by stripping the unit suffix.
# Going through astype(str) keeps this cell re-runnable: on a second run the
# columns are already numeric and would have no .str accessor.
# regex=False treats the suffix as literal text on every pandas version.
laptop_data['Ram'] = (
    laptop_data['Ram'].astype(str).str.replace('GB', '', regex=False).astype(int)
)
laptop_data['Weight'] = (
    laptop_data['Weight'].astype(str).str.replace('kg', '', regex=False).astype(float)
)

# Check for missing values
missing_values = laptop_data.isnull().sum()

# Bundle a preview of the converted frame with the per-column null counts
updated_dataset_info = {
    "First Five Rows": laptop_data.head(),
    "Missing Values": missing_values
}

updated_dataset_info

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for the plots
sns.set(style="whitegrid")

# 2x2 grid: one histogram (with KDE overlay) per numeric column
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# (column, title) pairs in row-major panel order
distribution_panels = (
    ('Ram', 'Distribution of RAM'),
    ('Inches', 'Distribution of Screen Size (Inches)'),
    ('Weight', 'Distribution of Weight'),
    ('Price', 'Distribution of Price'),
)
for panel_ax, (column, panel_title) in zip(axes.flat, distribution_panels):
    sns.histplot(laptop_data[column], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# One shared 3x2 grid; the bottom-right panel is not drawn into
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

# Categorical columns plotted in full, bars ordered by value_counts() order
full_count_panels = (
    ('Company', axes[0, 0], 'Count of Laptops by Company'),
    ('TypeName', axes[0, 1], 'Count of Laptops by TypeName'),
    ('OpSys', axes[1, 0], 'Count of Laptops by Operating System'),
)
for column, panel_ax, panel_title in full_count_panels:
    frequency_order = laptop_data[column].value_counts().index
    sns.countplot(y=column, data=laptop_data, order=frequency_order, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Cpu and Gpu have many distinct values, so only rows belonging to the ten
# most frequent values of each are plotted
top_cpus = laptop_data['Cpu'].value_counts().index[:10]
cpu_rows = laptop_data[laptop_data['Cpu'].isin(top_cpus)]
sns.countplot(y='Cpu', data=cpu_rows, ax=axes[1, 1])
axes[1, 1].set_title('Top 10 CPUs in Laptops')

top_gpus = laptop_data['Gpu'].value_counts().index[:10]
gpu_rows = laptop_data[laptop_data['Gpu'].isin(top_gpus)]
sns.countplot(y='Gpu', data=gpu_rows, ax=axes[2, 0])
axes[2, 0].set_title('Top 10 GPUs in Laptops')

# Adjust layout
plt.tight_layout()
plt.show()

# ScreenResolution is free text with many variants, so it is summarised as a
# table of its ten most common values instead of a plot.
top_screen_resolutions = laptop_data['ScreenResolution'].value_counts().head(10)
top_screen_resolutions

In [None]:
# Feature Engineering: derive flags and pixel dimensions from ScreenResolution

# 1. Touchscreen flag: 1 when the text contains 'Touchscreen', otherwise 0
laptop_data['Touchscreen'] = laptop_data['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)

# 2. IPS flag: 1 when the text contains 'IPS', otherwise 0
laptop_data['IPS'] = laptop_data['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

# 3. Extracting X and Y Resolution
# Capture the two digit runs around the 'x' (e.g. '1920x1080' -> '1920', '1080').
# The pattern is a raw string so that '\d' reaches the regex engine unchanged
# instead of being an invalid string escape.
resolution = laptop_data['ScreenResolution'].str.extract(r'(\d+)x(\d+)')
# Column 0 is the first captured number, column 1 the second
laptop_data['X_Res'] = resolution[0].astype(int)
laptop_data['Y_Res'] = resolution[1].astype(int)

# Displaying the first few rows of the updated dataset
laptop_data.head()


In [None]:
# Rebind df to the same object as laptop_data (an alias, not a copy).
df = laptop_data

In [None]:
# IPS flag: 1 when the ScreenResolution text contains "IPS", otherwise 0
ips_flags = df['ScreenResolution'].apply(lambda element: int("IPS" in element))
df['IPS'] = ips_flags
# Show five randomly chosen rows
df.sample(5)

In [None]:
# Rebind laptop_data to the same object as df (an alias, not a copy).
laptop_data = df

In [None]:
# Pixels-per-inch helper
def calculate_ppi(row):
    """Return the diagonal pixel count of a row divided by its screen size.

    ``row`` must support lookup by the keys 'X_Res', 'Y_Res' and 'Inches'.
    """
    width_px, height_px = row['X_Res'], row['Y_Res']
    diagonal_px = np.sqrt(width_px ** 2 + height_px ** 2)
    return diagonal_px / row['Inches']

# Compute PPI row by row and store it as a new column
ppi_values = laptop_data.apply(calculate_ppi, axis=1)
laptop_data['PPI'] = ppi_values

# Remove the X_Res and Y_Res columns from the frame in place
laptop_data.drop(['X_Res', 'Y_Res'], axis=1, inplace=True)

# Preview the updated frame
laptop_data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Restrict to numeric columns and compute their pairwise correlations
numeric_data = laptop_data.select_dtypes(include=[np.number])
corr_matrix = numeric_data.corr()

# Annotated heatmap of the correlation matrix, values shown to two decimals
heatmap_options = dict(annot=True, cmap='coolwarm', fmt=".2f")
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Heatmap of Numeric Features')
plt.show()

In [None]:
# Take the 'Price' column of the correlation matrix, i.e. each numeric
# feature's correlation with Price. This rebinds corr_matrix to that column.
corr_matrix = numeric_data.corr()['Price']
corr_matrix

In [None]:
# Capture the current column labels of the frame and display them.
column_names = laptop_data.columns
column_names

In [None]:
# Capture the text that starts at the word 'Intel' or 'AMD' and stops before
# the first whitespace that is followed by a digit; rows where the pattern
# does not match get a missing value.
cpu_name_pattern = r'(\bIntel\b.*?|\bAMD\b.*?)\s+\d'
laptop_data['CPU_Name'] = laptop_data['Cpu'].str.extract(cpu_name_pattern)

# Print the first entries of the new column
print(laptop_data['CPU_Name'].head())

In [None]:
# Remove the 'Cpu' column from the frame in place
laptop_data.drop(columns=['Cpu'], inplace=True)

In [None]:
# Preview the first ten rows of the frame.
laptop_data.head(10)

In [None]:
import re

# Storage-size parser
def size_to_gb(size_str):
    """Return the first number found in ``size_str`` as a whole number of GB.

    The number is multiplied by 1024 when the text contains 'TB'. The result
    is truncated to an int. Returns 0 when the text contains no digits.
    """
    number_match = re.search(r'(\d+\.?\d*)', size_str)
    if number_match is None:
        return 0
    amount = float(number_match.group(1))
    multiplier = 1024 if 'TB' in size_str else 1
    return int(amount * multiplier)

# Per-row storage accumulator
def update_storage(row):
    """Add each '+'-separated part of ``row['Memory']`` to its storage column.

    Each part is matched against 'HDD', 'SSD', 'Flash Storage' and 'Hybrid'
    in that order. The first label found in the part decides which entry of
    ``row`` is incremented by ``size_to_gb(part)``. Parts matching no label
    are ignored. ``row`` is modified and returned.
    """
    storage_kinds = ('HDD', 'SSD', 'Flash Storage', 'Hybrid')
    for entry in row['Memory'].split("+"):
        for kind in storage_kinds:
            if kind in entry:
                row[kind] += size_to_gb(entry)
                break  # first matching label wins
    return row

# Start every storage-type column at zero
for storage_column in ('HDD', 'SSD', 'Flash Storage', 'Hybrid'):
    laptop_data[storage_column] = 0

# Fill the storage columns row by row; apply returns a new frame
laptop_data = laptop_data.apply(update_storage, axis=1)

# Show five randomly chosen rows to check the result
laptop_data.sample(5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Numeric columns only
numeric_data = laptop_data.select_dtypes(include=[np.number])

# Open the figure, then compute and draw the pairwise correlations
plt.figure(figsize=(12, 8))
corr_matrix = numeric_data.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
plt.title('Correlation Heatmap of Numeric Features')
plt.show()

In [None]:
# Take the 'Price' column of the correlation matrix, i.e. each numeric
# feature's correlation with Price. This rebinds corr_matrix to that column.
corr_matrix = numeric_data.corr()['Price']
corr_matrix

In [None]:
# Operating-system grouping helper
def categorize_os(os_str):
    """Map an operating-system description to one of four labels.

    Returns 'Windows', 'Linux' or 'Apple' when ``os_str`` contains one of that
    group's keywords (checked in that order), and 'Other OS' otherwise.
    """
    os_families = (
        ('Windows', ('Windows',)),
        ('Linux', ('Linux', 'Ubuntu')),
        ('Apple', ('macOS', 'Mac OS')),
    )
    for family, keywords in os_families:
        if any(keyword in os_str for keyword in keywords):
            return family
    return 'Other OS'

# Label every OpSys value with its group
os_family = laptop_data['OpSys'].map(categorize_os)
laptop_data['OpSys_Simple'] = os_family

# Print the original and grouped columns side by side for the first rows
print(laptop_data[['OpSys', 'OpSys_Simple']].head())

In [None]:
# Preview the first rows of the frame.
laptop_data.head()

In [None]:
# Columns to remove in place; errors='ignore' skips any listed name that is
# not present in the frame
columns_to_drop = ['ScreenResolution', 'Memory', 'Gpu', 'Flash Storage', 'Hybrid', 'Memory Size']
laptop_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)

# Preview the remaining columns
laptop_data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Pairwise correlations between the numeric columns
numeric_dtypes = [np.number]
numeric_data = laptop_data.select_dtypes(include=numeric_dtypes)
corr_matrix = numeric_data.corr()

# Draw the matrix as an annotated heatmap
figure_size = (12, 8)
plt.figure(figsize=figure_size)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of Numeric Features')
plt.show()

In [None]:
# Take the 'Price' column of the correlation matrix, i.e. each numeric
# feature's correlation with Price. This rebinds corr_matrix to that column.
corr_matrix = numeric_data.corr()['Price']
corr_matrix

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Define target and features.
# 'Predicted Price' is added to laptop_data by a later cell; excluding it here
# keeps this cell re-runnable without feeding the model's own predictions back
# in as a feature. errors='ignore' makes the drop a no-op on the first run.
X = laptop_data.drop(columns=['Price', 'Predicted Price'], errors='ignore')
y = np.log(laptop_data['Price']) # Log normalization of the Price

# One-hot encode the object-dtype columns and pass the other columns through
categorical_features = X.select_dtypes(include=['object']).columns
column_transformer = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough')

# Split the raw rows first so the encoder learns its categories from the
# training rows only. Categories seen only in the test rows are encoded as
# all zeros by handle_unknown='ignore'.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = column_transformer.fit_transform(X_train_raw)
X_test = column_transformer.transform(X_test_raw)

# Encoded version of every row, kept for cells that predict on the full dataset
X_transformed = column_transformer.transform(X)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Fit ordinary least squares on the training split and predict the test split
lr_model = LinearRegression().fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# Test-split metrics: mean absolute error and R²
lr_mae, lr_r2 = (
    mean_absolute_error(y_test, lr_predictions),
    r2_score(y_test, lr_predictions),
)
print("Linear Regression MAE:", lr_mae)
print("Linear Regression R²:", lr_r2)

In [None]:
from sklearn.linear_model import Ridge

# Fit Ridge (alpha=1.0) on the training split and predict the test split
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_predictions = ridge_model.predict(X_test)

# Test-split metrics: mean absolute error and R²
ridge_mae, ridge_r2 = (
    mean_absolute_error(y_test, ridge_predictions),
    r2_score(y_test, ridge_predictions),
)
print("Ridge Regression MAE:", ridge_mae)
print("Ridge Regression R²:", ridge_r2)

In [None]:
from sklearn.linear_model import Lasso

# Fit Lasso (alpha=0.1) on the training split and predict the test split
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)
lasso_predictions = lasso_model.predict(X_test)

# Test-split metrics: mean absolute error and R²
lasso_mae, lasso_r2 = (
    mean_absolute_error(y_test, lasso_predictions),
    r2_score(y_test, lasso_predictions),
)
print("Lasso MAE:", lasso_mae)
print("Lasso R²:", lasso_r2)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree
# random_state is fixed so the fitted tree, and therefore the metrics printed
# below, are the same on every run; 42 matches the seed used for the
# train/test split and for the other seeded estimators in this notebook.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

# Test-split metrics: mean absolute error and R²
dt_mae = mean_absolute_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)
print("Decision Tree MAE:", dt_mae)
print("Decision Tree R²:", dt_r2)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest
# random_state is fixed so the bootstrap samples and feature draws, and
# therefore the metrics printed below, are the same on every run; 42 matches
# the seed used for the train/test split and the other seeded estimators.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Test-split metrics: mean absolute error and R²
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
print("Random Forest MAE:", rf_mae)
print("Random Forest R²:", rf_r2)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Unfitted estimators keyed by display name
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Random Forest search space ('sqrt' is used in place of 'auto' for max_features)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_features=['sqrt', 'log2'],
    max_depth=[10, 20, 30],
)

# 5-fold cross-validated search over the grid; keep the best refitted estimator
rf_search_base = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf_search_base, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Fit every model in `models` on the training split and print its test RMSE
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

# Cost-complexity pruning path for the decision tree
decision_tree = models['Decision Tree']
path = decision_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Refit one tree per alpha and record its score on both splits.
# random_state=42 matches the tree the path was computed from, so the alphas
# are applied to the same tree-growing procedure.
train_scores = []
test_scores = []
for ccp_alpha in ccp_alphas:
    dt = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)
    dt.fit(X_train, y_train)
    train_scores.append(dt.score(X_train, y_train))
    test_scores.append(dt.score(X_test, y_test))

# DecisionTreeRegressor.score returns R², so the axis and title say R² rather
# than "accuracy"
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle="steps-post")
plt.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle="steps-post")
plt.xlabel('alpha')
plt.ylabel('R² score')
plt.title('R² vs alpha for training and testing sets')
plt.legend()
plt.show()

# Draw the top three levels of the fitted decision tree
plt.figure(figsize=(20,10))
plot_tree(decision_tree, filled=True, max_depth=3)
plt.show()

In [None]:
# Predict log-price for every row with the tuned forest, then undo the log
predicted_log_price = best_rf_model.predict(X_transformed)
laptop_data['Predicted Price'] = np.exp(predicted_log_price)

# Scatter of actual against predicted price
plt.figure(figsize=(10, 6))
plt.scatter(laptop_data['Price'], laptop_data['Predicted Price'], alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual Price vs Predicted Price')

# Dashed y = x reference line spanning the range of actual prices
price_range = [laptop_data['Price'].min(), laptop_data['Price'].max()]
plt.plot(price_range, price_range, 'k--')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Filled kernel-density curves of actual and predicted prices on shared axes
plt.figure(figsize=(10, 6))
density_series = (
    ('Price', 'Actual Price'),
    ('Predicted Price', 'Predicted Price'),
)
for column, legend_label in density_series:
    sns.kdeplot(laptop_data[column], fill=True, label=legend_label)

plt.title('Density Plot of Actual Price vs Predicted Price')
plt.xlabel('Price')
plt.ylabel('Density')
plt.legend()

plt.show()

In [None]:
import pickle

# Save the tuned Random Forest (the estimator chosen by the grid search and
# used for the final predictions) rather than the default-parameter rf_model.
# NOTE(review): the fitted column_transformer is not saved here, so new raw
# rows cannot be encoded for this model from the pickle alone.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

In [None]:
import pandas as pd
from scipy.sparse import csr_matrix

# Densify X_train when it is a csr_matrix; otherwise use it as it is
train_matrix = X_train.toarray() if isinstance(X_train, csr_matrix) else X_train
X_train_df = pd.DataFrame(train_matrix)

# Write the training features to CSV without the index column
X_train_df.to_csv('train_features.csv', index=False)

In [None]:
# Write the training targets to 'train_labels.csv' without the index column.
y_train.to_csv('train_labels.csv', index=False)

In [None]:
# Wrap y_train in a DataFrame and write it to 'train_labels.csv' without the
# index column. This is the same path the previous cell writes to, so that
# file is overwritten.
pd.DataFrame(y_train).to_csv('train_labels.csv', index=False)

### $End$