In [None]:
# Setup and target-feature correlation analysis for Laptop Price
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling: seaborn theme first, then the matplotlib DPI override.
sns.set(style="whitegrid", context="notebook")
plt.rcParams.update({"figure.dpi": 120})

# Project layout is derived from the working directory: data/ and results/
# are taken to be siblings of the directory the notebook is run from.
NOTEBOOK_DIR = Path.cwd()
PROJECT_DIR = NOTEBOOK_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath('data')
RESULTS_DIR = PROJECT_DIR.joinpath('results')
# Create the output directory up front so later cells can write to it.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Name of the column every correlation below is computed against.
TARGET_COL = 'Price'

# Read the raw dataset; fail with the resolved path if the file is missing.
csv_path = DATA_DIR.joinpath('Laptop Price.csv')
if csv_path.exists():
    df = pd.read_csv(csv_path)
else:
    raise FileNotFoundError(f"Expected data at {csv_path}")

# Resolve the target column. When the exact name is absent, fall back to a
# case-/whitespace-insensitive match on common spellings. Labels are passed
# through str() so non-string column labels cannot break the lookup.
if TARGET_COL not in df.columns:
    alt = [c for c in df.columns if str(c).lower().strip() in ['price', 'price_usd']]
    if not alt:
        raise KeyError(f"Target column 'Price' not found. Columns: {list(df.columns)}")
    TARGET_COL = alt[0]

# Work on the numeric columns only. If the target was read as a non-numeric
# dtype, coerce it; values that cannot be parsed become NaN.
numeric_df = df.select_dtypes(include=[np.number]).copy()
if TARGET_COL not in numeric_df.columns:
    numeric_df[TARGET_COL] = pd.to_numeric(df[TARGET_COL], errors='coerce')

# Rows without a usable target carry no information for this analysis.
numeric_df = numeric_df.dropna(subset=[TARGET_COL]).reset_index(drop=True)
if numeric_df.empty:
    # Without this guard every correlation below would be NaN and the
    # notebook would write meaningless result files without complaint.
    raise ValueError(
        f"No rows with a numeric {TARGET_COL!r} value remain; cannot compute correlations."
    )

# Impute remaining gaps with each column's median. Columns that are entirely
# NaN have a NaN median and are therefore left unchanged.
numeric_df = numeric_df.fillna(numeric_df.median())

# Pearson correlation of every numeric column with the target, strongest
# positive first. The target itself is included (correlation 1.0); later
# cells drop it explicitly.
corr_with_target = numeric_df.corr(method='pearson')[TARGET_COL].sort_values(ascending=False)

# Persist as a two-column table.
corr_df = corr_with_target.reset_index()
corr_df.columns = ['feature', 'pearson_corr_with_target']
full_out = RESULTS_DIR / 'feature_price_correlations.csv'
corr_df.to_csv(full_out, index=False)
print(f"Saved target correlations to {full_out}")

In [None]:
# Inter-feature correlations and potential interaction candidates
# Full Pearson matrix over every numeric column (target included).
corr_matrix = numeric_df.corr(method='pearson')

corr_mat_path = RESULTS_DIR / 'full_correlation_matrix.csv'
corr_matrix.to_csv(corr_mat_path)
print(f"Saved full correlation matrix to {corr_mat_path}")

# Highly correlated predictor pairs. Each unordered pair (a, b) is visited
# exactly once by pairing every feature only with the features after it.
PRED_ONLY = [c for c in corr_matrix.columns if c != TARGET_COL]
threshold = 0.8  # minimum |r| for a pair to be reported
high_pairs = []
for i, a in enumerate(PRED_ONLY):
    for b in PRED_ONLY[i + 1:]:
        r = corr_matrix.loc[a, b]
        # A NaN correlation (e.g. from a constant column) compares False
        # here and is skipped.
        if abs(r) >= threshold:
            high_pairs.append((a, b, r))

# Strongest pairs first, ranked by magnitude regardless of sign.
high_corr_df = pd.DataFrame(
    high_pairs, columns=['feature_a', 'feature_b', 'pearson_r']
).sort_values(by='pearson_r', key=lambda s: s.abs(), ascending=False)
high_pairs_path = RESULTS_DIR / 'high_corr_feature_pairs.csv'
high_corr_df.to_csv(high_pairs_path, index=False)
print(f"Saved high-correlation pairs (|r|>={threshold}) to {high_pairs_path}")

# Candidate pairwise interactions among the features most strongly
# correlated with the target.
corr_no_target = corr_with_target.drop(labels=[TARGET_COL])
# Rank by magnitude. Features whose correlation is undefined (NaN, e.g. a
# constant column) are excluded so they can never be picked as a "top" feature.
ranked_by_strength = corr_no_target.abs().dropna().sort_values(ascending=False)
# abs_sorted holds the SIGNED correlations, ordered by absolute value.
abs_sorted = corr_no_target.reindex(ranked_by_strength.index)
TOP_FOR_INTERACTIONS = list(abs_sorted.index[:8])

# Every unordered pair of top features, each emitted once. The abs_corr_*
# fields store the magnitude of each feature's correlation with the target,
# hence the explicit abs() on the signed value.
interaction_candidates = [
    {
        'interaction': f'{a}*{b}',
        'feat_a': a,
        'feat_b': b,
        'abs_corr_a': abs(abs_sorted.loc[a]),
        'abs_corr_b': abs(abs_sorted.loc[b]),
    }
    for i, a in enumerate(TOP_FOR_INTERACTIONS)
    for b in TOP_FOR_INTERACTIONS[i + 1:]
]
# Explicit columns keep the CSV header stable even when there are no pairs.
interaction_df = pd.DataFrame(
    interaction_candidates,
    columns=['interaction', 'feat_a', 'feat_b', 'abs_corr_a', 'abs_corr_b'],
)
interactions_path = RESULTS_DIR / 'interaction_candidates_from_top_features.csv'
interaction_df.to_csv(interactions_path, index=False)
print(f"Saved interaction candidates to {interactions_path}")

In [None]:
# Visualisations
# Horizontal bar chart of the features most correlated with the target.
TOP_N = 15

# Signed correlations with the target, reordered so the largest magnitudes
# come first; the target's own entry is removed.
corr_no_target = corr_with_target.drop(labels=[TARGET_COL])
strength_order = corr_no_target.abs().sort_values(ascending=False).index
abs_sorted = corr_no_target.reindex(strength_order)
plot_series = abs_sorted.iloc[:TOP_N]

# Figure height scales with the requested number of bars.
bar_fig, bar_ax = plt.subplots(figsize=(8, 0.4 * TOP_N + 2))
# NOTE(review): newer seaborn releases (0.13+) reportedly warn when `palette`
# is given without `hue` — confirm against the pinned seaborn version.
sns.barplot(x=plot_series.values, y=plot_series.index, palette='viridis', ax=bar_ax)
bar_ax.set_title('Top correlations with Price (absolute)')
bar_ax.set_xlabel('Pearson correlation')
bar_ax.set_ylabel('Feature')
bar_fig.tight_layout()

bar_path = RESULTS_DIR / 'top_target_correlations_bar.png'
bar_fig.savefig(bar_path, bbox_inches='tight')
plt.show()
print(f"Saved bar chart to {bar_path}")

# Scatter + fitted trend line for the strongest features, laid out on a grid.
TOP_K_SCATTER = 6
top_features = plot_series.index[:TOP_K_SCATTER]
num_cols = 3
num_rows = int(np.ceil(TOP_K_SCATTER / num_cols))
# squeeze=False guarantees a 2-D array of axes for any grid shape, so the
# flattening below never receives a bare Axes object.
fig, axes = plt.subplots(
    num_rows, num_cols, figsize=(5 * num_cols, 4 * num_rows), squeeze=False
)
axes = axes.ravel()

for ax, feat in zip(axes, top_features):
    sns.regplot(
        x=numeric_df[feat],
        y=numeric_df[TARGET_COL],
        scatter_kws={'alpha': 0.3, 's': 15},
        line_kws={'color': 'red'},
        ax=ax,
    )
    ax.set_title(f"{feat} vs {TARGET_COL}")

# Hide panels that received no feature. Slicing by the feature count (rather
# than reusing a loop index) also works when there are no features at all.
for unused_ax in axes[len(top_features):]:
    unused_ax.axis('off')

fig.tight_layout()
scatter_path = RESULTS_DIR / 'top_target_scatter_regplots.png'
fig.savefig(scatter_path, bbox_inches='tight')
plt.show()
print(f"Saved scatter/regplot grid to {scatter_path}")

# Heatmap of pairwise correlations among the strongest features plus the target.
TOP_M_HEATMAP = 12
# Always take the TOP_M_HEATMAP strongest features from the magnitude-ordered
# series; this selection is independent of the shorter scatter-plot list.
heatmap_feats = list(abs_sorted.index[:TOP_M_HEATMAP]) + [TARGET_COL]

corr_mat_subset = numeric_df[heatmap_feats].corr()
# Figure size grows with the number of rows/columns shown.
heat_fig, heat_ax = plt.subplots(
    figsize=(1.1 * len(heatmap_feats), 0.9 * len(heatmap_feats))
)
# center=0 anchors the diverging colormap's midpoint at zero correlation.
sns.heatmap(
    corr_mat_subset,
    annot=False,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    ax=heat_ax,
)
heat_ax.set_title('Correlation heatmap: top features and target')
heat_fig.tight_layout()
heatmap_path = RESULTS_DIR / 'corr_heatmap_top_features.png'
heat_fig.savefig(heatmap_path, bbox_inches='tight')
plt.show()
print(f"Saved heatmap to {heatmap_path}")