In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Train a gradient-boosting regressor that predicts 'Earnings' from the
# remaining columns of the data set, report the test-set mean squared error,
# and write the prepared data plus a 'Predicted_Earnings' column to
# 'predicted_results.csv'.

# Load the dataset
file_path = 'data_creation_cleaned_no_duplicates.csv'
df = pd.read_csv(file_path)

# One-hot encode the categorical 'Genre' column
df = pd.get_dummies(df, columns=['Genre'])

# Parse 'ReleaseDate' and derive the release year as a numeric feature
df['ReleaseDate'] = pd.to_datetime(df['ReleaseDate'])
df['ReleaseYear'] = df['ReleaseDate'].dt.year

# Separate the target from the features. 'Date', 'Game' and the raw
# 'ReleaseDate' are left out of the feature matrix.
X = df.drop(columns=['Earnings', 'Date', 'Game', 'ReleaseDate'])
y = df['Earnings']

# 'ReleaseYear' is used as-is: it is already numeric (taken from `.dt.year`).
# It is deliberately NOT passed through LabelEncoder. That encoder is meant
# for target labels, and rank-encoding the years on the full data set before
# splitting would throw away the real distance between years and let test
# rows influence how the feature is encoded.

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seed the regressor as well as the split so that repeated runs produce the
# same model, the same reported error and the same saved predictions.
regressor = GradientBoostingRegressor(random_state=42)

# Fit the model
regressor.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = regressor.predict(X_test)

# Evaluate the model on the held-out test set
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Save the predictions along with other columns to a new CSV file.
# NOTE: predictions are made for every row of X, so the saved column mixes
# rows the model was trained on with held-out rows; only the printed MSE
# above is a held-out estimate.
df['Predicted_Earnings'] = regressor.predict(X)
df.to_csv('predicted_results.csv', index=False)


Mean Squared Error: 293081075219.4216
