In [1]:
import pandas as pd

# Load the raw mobile price dataset from the working directory
df = pd.read_csv(filepath_or_buffer="Global_Mobile_Prices_2025_Extended.csv")

# Preview the first five rows
df.head(5)


Unnamed: 0,brand,model,price_usd,ram_gb,storage_gb,camera_mp,battery_mah,display_size_inch,charging_watt,5g_support,os,processor,rating,release_month,year
0,Oppo,A98 111,855,16,128,108,6000,6.6,33,Yes,Android,Helio G99,3.8,February,2025
1,Realme,11 Pro+ 843,618,6,128,64,4500,6.9,100,Yes,Android,Tensor G4,4.4,August,2025
2,Xiaomi,Redmi Note 14 Pro 461,258,16,64,64,4000,6.8,44,Yes,Android,A18 Pro,4.1,March,2025
3,Vivo,V29e 744,837,6,512,48,4500,6.0,65,Yes,Android,Exynos 2400,4.1,August,2025
4,Apple,iPhone 16 Pro Max 927,335,12,128,200,5000,6.9,100,Yes,iOS,Dimensity 9300,3.5,February,2025


In [2]:
# Function to categorize price into segments
def price_category(x):
    """Map a price in USD to a market-segment label.

    Parameters
    ----------
    x : int or float
        Price in USD. May be missing (NaN / None / pd.NA).

    Returns
    -------
    str or None
        "Budget" for x <= 500, "Mid-Range" for 500 < x <= 900,
        "Premium" for 900 < x <= 1200, "Ultra-Premium" above 1200,
        and None when the price is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # price would fall through all branches and be labelled "Ultra-Premium".
    if pd.isna(x):
        return None
    if x <= 500:
        return "Budget"
    if x <= 900:
        return "Mid-Range"
    if x <= 1200:
        return "Premium"
    return "Ultra-Premium"

# Add price segment column
df["price_segment"] = df["price_usd"].apply(price_category)

# Remove duplicated rows BEFORE aggregating, so the exported summaries are
# computed from exactly the rows written to cleaned_main_data.csv
# (duplicates would otherwise inflate model_count and skew the averages).
df_cleaned = df.drop_duplicates()

# Create summary: number of models per brand
brand_counts = df_cleaned.groupby("brand").size().reset_index(name="model_count")

# Create summary: average price by brand
avg_price = df_cleaned.groupby("brand")["price_usd"].mean().reset_index(name="avg_price_usd")

# Export the CSV files
brand_counts.to_csv("brand_counts.csv", index=False)
avg_price.to_csv("avg_price_brand.csv", index=False)
df_cleaned.to_csv("cleaned_main_data.csv", index=False)

"Exported Successfully."


'Exported Successfully.'

In [3]:
# Column the models will try to predict
target = 'price_usd'

# Predictor columns: a mix of categorical and numeric phone attributes
features = [
    'brand',
    'ram_gb',
    'storage_gb',
    'camera_mp',
    'battery_mah',
    'display_size_inch',
    'charging_watt',
    '5g_support',
    'os',
    'processor',
    'rating',
]

y = df[target]     # target variable
X = df[features]   # independent variables

X.head()


Unnamed: 0,brand,ram_gb,storage_gb,camera_mp,battery_mah,display_size_inch,charging_watt,5g_support,os,processor,rating
0,Oppo,16,128,108,6000,6.6,33,Yes,Android,Helio G99,3.8
1,Realme,6,128,64,4500,6.9,100,Yes,Android,Tensor G4,4.4
2,Xiaomi,16,64,64,4000,6.8,44,Yes,Android,A18 Pro,4.1
3,Vivo,6,512,48,4500,6.0,65,Yes,Android,Exynos 2400,4.1
4,Apple,12,128,200,5000,6.9,100,Yes,iOS,Dimensity 9300,3.5


In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Text-valued columns that must be expanded into indicator columns
categorical_cols = ['brand', '5g_support', 'os', 'processor']

# Encoder for the categorical columns; handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# One-hot encode the categorical columns and pass every other column through unchanged
ct = ColumnTransformer(
    transformers=[('onehot', onehot_encoder, categorical_cols)],
    remainder='passthrough',
)

X_encoded = ct.fit_transform(X)
X_encoded


array([[  0. ,   0. ,   0. , ...,   6.6,  33. ,   3.8],
       [  0. ,   0. ,   0. , ...,   6.9, 100. ,   4.4],
       [  0. ,   0. ,   0. , ...,   6.8,  44. ,   4.1],
       ...,
       [  0. ,   0. ,   0. , ...,   5.7, 120. ,   4.8],
       [  0. ,   0. ,   0. , ...,   7. ,  65. ,   4.6],
       [  0. ,   0. ,   0. , ...,   5.8,  18. ,   3.9]], shape=(1000, 28))

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

(X_train.shape, X_test.shape)


((800, 28), (200, 28))

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Fit the Linear Regression model on the training split
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows
lr_predictions = lr_model.predict(X_test)

# Test-set metrics: R2, mean absolute error, and root mean squared error
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_rmse = np.sqrt(lr_mse)

(lr_r2, lr_mae, lr_rmse)


(-0.07289863061193524, 372.4792333888319, np.float64(431.5284077866328))

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 200-tree Random Forest on the training split (seeded for reproducibility)
rf_model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predictions for the held-out test rows
rf_predictions = rf_model.predict(X_test)

# Test-set metrics: R2, mean absolute error, and root mean squared error
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)

(rf_r2, rf_mae, rf_rmse)


(-0.08732675614282948, 373.9156, np.float64(434.4202716788202))

In [8]:
# Print the test-set metrics for each model, one metric per line
for heading, r2, mae, rmse in (
    ("Linear Regression:", lr_r2, lr_mae, lr_rmse),
    ("\nRandom Forest:", rf_r2, rf_mae, rf_rmse),
):
    print(heading)
    print("R2 Score:", r2)
    print("MAE:", mae)
    print("RMSE:", rmse)


Linear Regression:
R2 Score: -0.07289863061193524
MAE: 372.4792333888319
RMSE: 431.5284077866328

Random Forest:
R2 Score: -0.08732675614282948
MAE: 373.9156
RMSE: 434.4202716788202


In [9]:
import pandas as pd
import numpy as np

# Names of the encoded columns, as reported by the fitted transformer
encoded_columns = ct.get_feature_names_out()

# Per-column importances from the fitted Random Forest
importances = rf_model.feature_importances_

# Pair each encoded column with its importance, most important first
feature_imp = pd.DataFrame(
    {'feature': encoded_columns, 'importance': importances}
).sort_values(by='importance', ascending=False)

feature_imp.head(15)


Unnamed: 0,feature,importance
25,remainder__display_size_inch,0.144882
27,remainder__rating,0.132189
23,remainder__camera_mp,0.083069
26,remainder__charging_watt,0.080907
21,remainder__ram_gb,0.076627
24,remainder__battery_mah,0.075076
22,remainder__storage_gb,0.075057
20,onehot__processor_Tensor G4,0.020494
7,onehot__brand_Vivo,0.019851
3,onehot__brand_OnePlus,0.019163


In [None]:
# Write the sorted feature-importance table to CSV (without the index) for use in Tableau
feature_imp.to_csv(path_or_buf="feature_importance.csv", index=False)

"Feature importance exported."


'Feature importance exported.'