In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

# 1. Create the dataset from the provided table
data = {
    'Year': [2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014],
    'No_of_iPhone_Users': [1360000000, 1230000000, 1000000000, 948000000, 888000000, 814000000, 710000000, 569000000, 442000000],
    'No_of_iPhone_Users_USA': [149000000, 141000000, 138000000, 131000000, 127000000, 125000000, 114000000, 101000000, 88000000],
    'Percentage_of_iPhone_Users': [48.7, 46.9, 45.3, 45.2, 45.1, 44.2, 43.5, 43.3, 42.3],
    'No_of_iPhone_Sold': [226400000, 235700000, 206100000, 191000000, 208800000, 215800000, 215400000, 231500000, 192700000],
    'No_of_iPhone_Sold_USA': [124700000.0, 116300000.0, 113500000.0, 105200000.0, 101900000.0, 97200000.0, 90100000.0, 82500000.0, 72300000.0],
    'iOS_Market_Share': [56.74, 58.58, 59.54, 55.23, 54.82, 53.89, 53.19, 50.85, 52.30],
    'Android_Market_Share': [42.94, 41.11, 40.20, 44.51, 44.73, 45.23, 45.20, 46.42, 42.58]
}
df = pd.DataFrame(data)

# 2. Preprocessing
df['YoY_User_Growth_Global'] = df['No_of_iPhone_Users'].pct_change() * 100
df['YoY_User_Growth_USA'] = df['No_of_iPhone_Users_USA'].pct_change() * 100
df['USA_User_Share'] = (df['No_of_iPhone_Users_USA'] / df['No_of_iPhone_Users']) * 100
df['Market_Share_Difference'] = df['iOS_Market_Share'] - df['Android_Market_Share']
df.dropna(inplace=True)

# Features and target
X = df.drop(columns=['Year', 'No_of_iPhone_Sold'])
y = df['No_of_iPhone_Sold']

# Scaling
scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X[X.columns])

# Train-test split (chronological)
split_year_train_end = 2020
X_train = X[df['Year'] < split_year_train_end]
y_train = y[df['Year'] < split_year_train_end]
X_test = X[df['Year'] >= split_year_train_end]
y_test = y[df['Year'] >= split_year_train_end]

# --- Metrics Function ---
def evaluate_model(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return {"Model": name, "MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2, "MAPE": mape}

# --- Model Training and Evaluation ---
results = []

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
results.append(evaluate_model("Linear Regression", y_test, lr_preds))

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
results.append(evaluate_model("Random Forest", y_test, rf_preds))

# XGBoost (Initial)
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
results.append(evaluate_model("XGBoost (Initial)", y_test, xgb_preds))

# --- Comparison Table (Initial Models) ---
results_df_initial = pd.DataFrame(results)
print("\nInitial Model Comparison Table:")
print(results_df_initial)

# --- Identify Best Model ---
best_model_name = results_df_initial.sort_values(by="RMSE").iloc[0]['Model']
print(f"\nBest Performing Model (before tuning): {best_model_name}")

# --- 3. Hyperparameter Tuning for XGBoost (Best Model) ---
# Key hyperparameters to tune for XGBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# GridSearchCV
xgb_grid = GridSearchCV(estimator=XGBRegressor(random_state=42), param_grid=param_grid, 
                        scoring='neg_mean_squared_error', cv=KFold(n_splits=5), n_jobs=-1, verbose=1)
xgb_grid.fit(X, y)

# Get the best parameters and best estimator
best_params = xgb_grid.best_params_
best_xgb_model = xgb_grid.best_estimator_

print(f"\nBest Hyperparameters found by GridSearchCV: {best_params}")

# --- 4. Before vs After Comparison ---
# Initial model metrics
initial_params = {'n_estimators': 100, 'random_state': 42}
initial_metrics = results_df_initial[results_df_initial['Model'] == 'XGBoost (Initial)'].drop(columns='Model').iloc[0]

# Tuned model metrics
tuned_preds = best_xgb_model.predict(X_test)
tuned_metrics = evaluate_model("XGBoost (Tuned)", y_test, tuned_preds)
tuned_metrics_series = pd.Series(tuned_metrics).drop('Model')

comparison_df = pd.DataFrame({
    'Initial XGBoost': initial_metrics,
    'Tuned XGBoost': tuned_metrics_series
}).T
comparison_df.index.name = 'Model Version'
print("\nBefore vs After Tuning Comparison:")
print(comparison_df)

# Calculate improvement
improvement_df = pd.DataFrame({
    'Metric': comparison_df.columns,
    'Improvement (%)': ((comparison_df.loc['Initial XGBoost'] - comparison_df.loc['Tuned XGBoost']) / comparison_df.loc['Initial XGBoost']).round(2) * 100
})
print("\nPercentage Improvement (lower is better for error metrics, higher for R2):")
print(improvement_df)

# Visualization of the improvement
metrics_to_plot = ["MAE", "RMSE", "MAPE", "R2"]
improvement_plot_data = pd.DataFrame({
    'Metric': metrics_to_plot,
    'Initial': [comparison_df.loc['Initial XGBoost'][metric] for metric in metrics_to_plot],
    'Tuned': [comparison_df.loc['Tuned XGBoost'][metric] for metric in metrics_to_plot]
})
improvement_plot_data.set_index('Metric', inplace=True)

improvement_plot_data.plot(kind='bar', figsize=(12, 8))
plt.title('XGBoost Performance: Before vs. After Tuning')
plt.ylabel('Metric Value')
plt.xticks(rotation=0)
plt.legend(title='Model Version')
plt.grid(axis='y', linestyle='--')
plt.show()

# --- 5. Cross-Validation of Tuned Model ---
# Perform k-fold cross-validation on the tuned model
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Scoring metrics
scoring = {'MAE': 'neg_mean_absolute_error',
           'RMSE': 'neg_mean_squared_error',
           'R2': 'r2'}

cv_scores = cross_val_score(best_xgb_model, X, y, cv=kf, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)

cv_mae_scores = cross_val_score(best_xgb_model, X, y, cv=kf, scoring='neg_mean_absolute_error')
cv_mae = -cv_mae_scores

cv_r2_scores = cross_val_score(best_xgb_model, X, y, cv=kf, scoring='r2')
cv_r2 = cv_r2_scores

print(f"\nCross-Validation Results (k={k}):")
print(f"Mean MAE: {np.mean(cv_mae):.2f} (Std: {np.std(cv_mae):.2f})")
print(f"Mean RMSE: {np.mean(cv_rmse):.2f} (Std: {np.std(cv_rmse):.2f})")
print(f"Mean R2: {np.mean(cv_r2):.2f} (Std: {np.std(cv_r2):.2f})")



Initial Model Comparison Table:
               Model           MAE           MSE          RMSE         R2  \
0  Linear Regression  1.210115e+08  1.465765e+16  1.210688e+08 -65.917692   
1      Random Forest  2.344650e+07  7.735549e+14  2.781286e+07  -2.531569   
2  XGBoost (Initial)  2.989995e+07  1.113047e+15  3.336236e+07  -4.081479   

        MAPE  
0  54.914665  
1  10.206143  
2  13.145641  

Best Performing Model (before tuning): Random Forest


ModuleNotFoundError: No module named '_posixsubprocess'

In [2]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# --- Hyperparameter Tuning (Random Forest example) ---
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Best tuned model
best_rf = grid_search.best_estimator_

# Predictions with tuned model
tuned_preds = best_rf.predict(X_test)

# Evaluate tuned model
tuned_results = evaluate_model("Random Forest (Tuned)", y_test, tuned_preds)

# --- Comparison: Before vs After ---
comparison_df = results_df.append(tuned_results, ignore_index=True)
print("\nComparison Before vs After Tuning:")
print(comparison_df)

# --- Visualization of Improvement ---
metrics_to_plot = ["MAE", "RMSE", "MAPE", "R2"]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()

for i, metric in enumerate(metrics_to_plot):
    axes[i].bar(comparison_df["Model"], comparison_df[metric], color=['skyblue','lightgreen','salmon','orange'])
    axes[i].set_title(metric)
    axes[i].set_ylabel(metric)
    axes[i].set_xticklabels(comparison_df["Model"], rotation=30)

plt.tight_layout()
plt.show()

# --- Cross Validation on Tuned Model ---
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_rmse = -cv_scores
print(f"\nCross-Validation RMSE (5-fold): Mean = {cv_rmse.mean():.2f}, Std = {cv_rmse.std():.2f}")

print(f"\nBest Parameters: {grid_search.best_params_}")


ModuleNotFoundError: No module named '_posixsubprocess'