In [None]:
#Re-import necessary libraries for the regression analysis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

# Read the 50_Startups CSV directly from its raw GitHub URL.
DATA_URL = 'https://raw.githubusercontent.com/MJ41739/data/main/50_Startups.csv'
data = pd.read_csv(DATA_URL)

# Predictors are the three spend columns; the regression target is Profit.
FEATURE_COLUMNS = ['R&D Spend', 'Administration', 'Marketing Spend']
X = data[FEATURE_COLUMNS]
y = data['Profit']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

estimators = [2, 3, 4, 5, 10, 20, 50, 100, 150, 200]
depth = list(range(2, 11))

# Exhaustive sweep over forest size and maximum tree depth, printing the
# evaluation metrics of every combination.
# NOTE(review): each candidate is scored on X_test/y_test, so choosing
# hyperparameters from this output leaks the test set into model selection.
# Consider cross-validating on the training split instead.
for n_trees in estimators:
    # The loop variables are intentionally not called `x`/`y`: reusing `y`
    # here would overwrite the target Series defined earlier in this cell.
    for tree_depth in depth:
        random_forest_model = RandomForestRegressor(
            n_estimators=n_trees, max_depth=tree_depth, random_state=42
        )
        random_forest_model.fit(X_train, y_train)
        y_pred_rf = random_forest_model.predict(X_test)

        # Compute MSE once and derive RMSE from it. The `squared=False`
        # keyword of mean_squared_error is deprecated in recent scikit-learn
        # releases, so it is avoided here.
        mse = mean_squared_error(y_test, y_pred_rf)
        metrics = {'Random Forest': {
            'n_estimators': n_trees, 'max_depth': tree_depth,
            'MSE': mse,
            'RMSE': mse ** 0.5,
            'MAE': mean_absolute_error(y_test, y_pred_rf),
            'R2': r2_score(y_test, y_pred_rf),
        }}
        print(metrics)


def regression_metrics(y_true, y_pred):
    """Return the standard regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``'MSE'``, ``'RMSE'``, ``'MAE'`` and ``'R2'``, in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MSE': mse,
        # RMSE is derived as sqrt(MSE); the `squared=False` keyword of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        'RMSE': mse ** 0.5,
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }


#Initialize and train the models
linear_model = LinearRegression()
# n_estimators=5 / max_depth=5 are presumably taken from the grid-search
# printout earlier in this cell -- TODO confirm and record the rationale.
random_forest_model = RandomForestRegressor(n_estimators=5, max_depth=5, random_state=42)
# NOTE(review): the features are passed to Ridge without scaling; consider
# standardizing them so that alpha has a comparable effect on each coefficient.
ridge_model = Ridge(alpha=1.0)

linear_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)

#Make predictions on the test set
y_pred_linear = linear_model.predict(X_test)
y_pred_rf = random_forest_model.predict(X_test)
y_pred_ridge = ridge_model.predict(X_test)

#Calculate metrics for each model
metrics = {
    'Linear Regression': regression_metrics(y_test, y_pred_linear),
    'Random Forest': regression_metrics(y_test, y_pred_rf),
    'Ridge Regression': regression_metrics(y_test, y_pred_ridge),
}

# Bare expression so the notebook renders the comparison dict as the cell output.
metrics




{'Random Forest': {'n_estimators': 2, 'max_depth': 2, 'MSE': 121704312.94393854, 'RMSE': 11031.967772974074, 'MAE': 8904.077318181813, 'R2': 0.8497091822739473}}
{'Random Forest': {'n_estimators': 2, 'max_depth': 3, 'MSE': 119292120.5731701, 'RMSE': 10922.093232213781, 'MAE': 8661.838854166666, 'R2': 0.8526879621967454}}
{'Random Forest': {'n_estimators': 2, 'max_depth': 4, 'MSE': 181577040.59641245, 'RMSE': 13475.052526666175, 'MAE': 10520.705958333327, 'R2': 0.775773255265966}}
{'Random Forest': {'n_estimators': 2, 'max_depth': 5, 'MSE': 104850905.37177745, 'RMSE': 10239.673108638646, 'MAE': 8500.894499999999, 'R2': 0.8705212007161967}}
{'Random Forest': {'n_estimators': 2, 'max_depth': 6, 'MSE': 84611404.7549124, 'RMSE': 9198.44577931035, 'MAE': 7328.344499999997, 'R2': 0.8955146543128393}}
{'Random Forest': {'n_estimators': 2, 'max_depth': 7, 'MSE': 84611404.7549124, 'RMSE': 9198.44577931035, 'MAE': 7328.344499999997, 'R2': 0.8955146543128393}}
{'Random Forest': {'n_estimators': 2,

{'Linear Regression': {'MSE': 80926321.22295158,
  'RMSE': 8995.905803361415,
  'MAE': 6979.152252370402,
  'R2': 0.9000653083037321},
 'Random Forest': {'MSE': 42711577.577849194,
  'RMSE': 6535.409518756203,
  'MAE': 5349.293933333334,
  'R2': 0.9472561180021485},
 'Ridge Regression': {'MSE': 80926321.22368833,
  'RMSE': 8995.905803402364,
  'MAE': 6979.152252428612,
  'R2': 0.9000653083028223}}

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

# One marker per test row: actual profit on x, random-forest prediction on y.
fig.add_trace(go.Scatter(x=y_test, y=y_pred_rf, mode='markers', name='Predicted',
                         marker=dict(color='red', opacity=0.5)))

# y = x reference line (where a perfect prediction would fall). It is drawn
# from two endpoints only: y_test is in shuffled order, so tracing a dashed
# line through every test value would retrace the diagonal back and forth.
profit_range = [y_test.min(), y_test.max()]
fig.add_trace(go.Scatter(x=profit_range, y=profit_range, mode='lines', name='Actual',
                         line=dict(color='black', dash='dash')))

fig.update_layout(
    title='Actual vs. Predicted Profit (Random Forest Regressor)',
    xaxis_title='Actual Profit',
    yaxis_title='Predicted Profit',
    legend_title='Legend',
    width=1000,  # Figure size in pixels
    height=600
)

fig.show()


In [None]:
# One hypothetical startup to score; column names and order mirror the
# feature columns the models were trained on.
X_input = pd.DataFrame({
    'R&D Spend': [25000],
    'Administration': [21000],
    'Marketing Spend': [14000],
})

In [None]:
# Display the one-row input frame to verify its columns and values.
X_input

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,25000,21000,14000


In [None]:
# Score the sample row with each fitted model, in the order
# random forest -> linear regression -> ridge regression.
y_rfor, y_lin, y_rid = (
    random_forest_model.predict(X_input),
    linear_model.predict(X_input),
    ridge_model.predict(X_input),
)

In [None]:
# Random-forest prediction for the sample row in X_input.
y_rfor

array([73271.089])

In [None]:
# Linear-regression prediction for the sample row in X_input.
y_lin

array([73177.22662527])

In [None]:
# Ridge-regression prediction for the sample row in X_input.
y_rid

array([73177.22662363])

In [None]:
# Snapshot the installed package versions. `%pip` runs pip with the kernel's
# own interpreter, whereas `!pip` uses whichever pip is first on the shell PATH.
%pip freeze > requirements.txt
