In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Load data
df = pd.read_csv('car_price_prediction_.csv')

# Preprocessing
# Drop Car ID and encode categorical variables
df = pd.get_dummies(df.drop('Car ID', axis=1), drop_first=True)

# Split data
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. No elimination (all variables)
model_all = LinearRegression()
model_all.fit(X_train, y_train)
score_all = model_all.score(X_test, y_test)

# 2. Backward Elimination
def backward_elimination(X, y, threshold=0.05):
    cols = X.columns.tolist()
    for _ in range(len(cols)):
        X_1 = sm.add_constant(pd.DataFrame(X[cols]))
        model = sm.OLS(y, X_1).fit()
        pvals = model.pvalues[1:]  # exclude constant
        max_p = pvals.max()
        if max_p > threshold:
            remove_feature = pvals.idxmax()
            cols.remove(remove_feature)
        else:
            break
    return cols

backward_features = backward_elimination(X_train, y_train)
model_backward = LinearRegression().fit(X_train[backward_features], y_train)
score_backward = model_backward.score(X_test[backward_features], y_test)

# 3. Forward Selection
def forward_selection(X, y, threshold=0.05):
    initial_features = []
    remaining_features = X.columns.tolist()
    while remaining_features:
        pvals = []
        for feature in remaining_features:
            features = initial_features + [feature]
            X_ = sm.add_constant(pd.DataFrame(X[features]))
            pvals.append(sm.OLS(y, X_).fit().pvalues[-1])
        
        min_p = min(pvals)
        if min_p < threshold:
            best_feature = remaining_features[np.argmin(pvals)]
            initial_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            break
    return initial_features

forward_features = forward_selection(X_train, y_train)
model_forward = LinearRegression().fit(X_train[forward_features], y_train)
score_forward = model_forward.score(X_test[forward_features], y_test)

# 4. Combined (Stepwise)
def stepwise_selection(X, y, threshold_in=0.05, threshold_out=0.1):
    included = []
    while True:
        changed = False
        
        # Forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_pval = new_pval.min()
        if min_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            
        # Backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        pvals = model.pvalues[1:]  # exclude constant
        max_pval = pvals.max()
        if max_pval > threshold_out:
            worst_feature = pvals.idxmax()
            included.remove(worst_feature)
            changed = True
            
        if not changed:
            break
    return included

stepwise_features = stepwise_selection(X_train, y_train)
model_stepwise = LinearRegression().fit(X_train[stepwise_features], y_train)
score_stepwise = model_stepwise.score(X_test[stepwise_features], y_test)

# Print results
print(f"All features R²: {score_all:.3f}")
print(f"Backward elimination R²: {score_backward:.3f}")
print(f"Forward selection R²: {score_forward:.3f}")
print(f"Stepwise selection R²: {score_stepwise:.3f}")

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
import pandas as pd 
x=pd.read_csv("car_price_prediction.csv")

: 