In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


def extract_numeric(value):
    if isinstance(value, str):
        return pd.to_numeric("".join(filter(str.isdigit, value)), errors='coerce')
    return value

file_path = r"C:\Users\lenovo\Downloads\archive\Mobiles Dataset (2025).csv"
df = pd.read_csv(file_path, encoding='latin1')


columns_to_clean = ["Mobile Weight", "RAM", "Front Camera", "Back Camera",
                    "Battery Capacity", "Screen Size", "Launched Price (Pakistan)",
                    "Launched Price (India)", "Launched Price (China)",
                    "Launched Price (USA)", "Launched Price (Dubai)"]

df_cleaned = df.copy()
for col in columns_to_clean:
    df_cleaned[col] = df_cleaned[col].apply(extract_numeric)

df_cleaned.drop(columns=["Company Name", "Model Name", "Processor"], inplace=True)


y = df_cleaned["Launched Price (Pakistan)"].dropna()
X = df_cleaned.drop(columns=["Launched Price (Pakistan)"]).loc[y.index]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_const = sm.add_constant(X_train)


model_all = sm.OLS(y_train, X_train_const).fit()
print(model_all.summary())


def backward_elimination(X, y, significance_level=0.05):
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    while True:
        p_values = model.pvalues
        max_p_value = p_values.max()
        if max_p_value > significance_level:
            feature_to_remove = p_values.idxmax()
            X = X.drop(columns=[feature_to_remove])
            model = sm.OLS(y, X).fit()
        else:
            break
    return model, X.columns

model_backward, selected_features_backward = backward_elimination(X_train, y_train)
print(model_backward.summary())


def forward_selection(X, y, significance_level=0.05):
    initial_features = []
    remaining_features = list(X.columns)
    best_model = None
    best_features = None

    while remaining_features:
        p_values = {}
        for feature in remaining_features:
            X_temp = sm.add_constant(X[initial_features + [feature]])
            model = sm.OLS(y, X_temp).fit()
            p_values[feature] = model.pvalues[feature]

        best_feature = min(p_values, key=p_values.get)
        if p_values[best_feature] < significance_level:
            initial_features.append(best_feature)
            remaining_features.remove(best_feature)
            best_model = sm.OLS(y, sm.add_constant(X[initial_features])).fit()
            best_features = initial_features[:]
        else:
            break

    return best_model, best_features

model_forward, selected_features_forward = forward_selection(X_train, y_train)
print(model_forward.summary())


                                OLS Regression Results                               
Dep. Variable:     Launched Price (Pakistan)   R-squared:                       0.837
Model:                                   OLS   Adj. R-squared:                  0.835
Method:                        Least Squares   F-statistic:                     342.4
Date:                       Mon, 10 Mar 2025   Prob (F-statistic):          1.23e-279
Time:                               23:31:05   Log-Likelihood:                -8938.5
No. Observations:                        743   AIC:                         1.790e+04
Df Residuals:                            731   BIC:                         1.796e+04
Df Model:                                 11                                         
Covariance Type:                   nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------