In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('data_creation_cleaned_no_duplicates.csv')

# Select relevant features for regression
features = ['TotalEarnings', 'TotalPlayers', 'TotalTournaments', 'Earnings_YoY_Growth', 'Players_YoY_Growth', 'Tournaments_YoY_Growth']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df['Earnings'], test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient Boosting Regression
gradient_boost_model = GradientBoostingRegressor()
gradient_boost_model.fit(X_train_scaled, y_train)
gradient_boost_predictions = gradient_boost_model.predict(X_test_scaled)

# Evaluate the model
gradient_boost_rmse = mean_squared_error(y_test, gradient_boost_predictions, squared=False)

# Predict future earnings for top 10 games
future_data = df.tail(10)  # Assuming the last 10 rows are for the top 10 games
future_data_scaled = scaler.transform(future_data[features])
future_earnings_predictions = gradient_boost_model.predict(future_data_scaled)

# Display the results
print(f'Gradient Boosting RMSE: {gradient_boost_rmse}')
print('\nPredicted Future Earnings for Top 10 Games:')
future_data.loc[:, 'Predicted_Earnings'] = future_earnings_predictions
print(future_data[['Game', 'Predicted_Earnings']].sort_values(by='Predicted_Earnings', ascending=False))


Gradient Boosting RMSE: 673839.2156837306

Predicted Future Earnings for Top 10 Games:
                                      Game  Predicted_Earnings
9237                              Fortnite       576652.121012
9235                     Age of Empires IV       305575.554906
9243  PLAYERUNKNOWN'S BATTLEGROUNDS Mobile       276240.287664
9234                             CrossFire       119999.833739
9236                          StarCraft II       101573.065322
9238                       Team Fortress 2         6311.638589
9241                        Age of Empires       -14306.974836
9240                     Trackmania (2020)       -18592.785894
9239                                  osu!       -20894.574812
9242                     Age of Empires II       -21261.544974


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data.loc[:, 'Predicted_Earnings'] = future_earnings_predictions
