In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the model and preprocessing tools
model = tf.keras.models.load_model('video_game_sales_model.h5')
scaler = joblib.load('scaler.pkl')
label_encoders = {col: joblib.load(f'label_encoder_{col}.pkl') for col in ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'Developer', 'Rating']}
tokenizer = joblib.load('tokenizer.pkl')

# Load test data
X_test = pd.read_csv('X_test.csv')
game_names = X_test['Name'].copy()  # Assuming the column name is 'Name'
X_test.drop(columns=['Name'], inplace=True)  # Remove the name column for prediction

# Define max_length (ensure this is the same as used during training)
max_length = 100  
# Ensure text data is read as string
X_text_test = X_text_test.astype(str)

# Preprocess text data
text_sequences = tokenizer.texts_to_sequences(X_text_test['0'])
text_data_test = pad_sequences(text_sequences, maxlen=max_length)

# Preprocess numerical data
numerical_columns = ['Critic_Score', 'Critic_Count', 'User_Score', 'User_Count']
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Placeholder value for unseen categories
placeholder_value = "Unknown"

# Preprocess categorical data
for col in ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'Developer', 'Rating']:
    # Replace unseen categories in test set with placeholder value
    unseen_categories = set(X_test[col]) - set(label_encoders[col].classes_)
    if unseen_categories:
        X_test[col].replace(list(unseen_categories), placeholder_value, inplace=True)
    
    # Update the encoder classes to include placeholder
    label_encoders[col].classes_ = np.append(label_encoders[col].classes_, placeholder_value)
    
    # Transform using label encoder
    X_test[col] = label_encoders[col].transform(X_test[col])


# Preprocess text data
text_sequences = tokenizer.texts_to_sequences(X_text_test['0'])  # Assuming '0' is the column name in X_text_test.csv
text_data_test = pad_sequences(text_sequences, maxlen=tokenizer.num_words)

# Prepare inputs for prediction
test_inputs = [X_test[numerical_columns].values] + [X_test[col].values for col in ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'Developer', 'Rating']] + [text_data_test]

# Make predictions
predictions = model.predict(test_inputs)

# Create a DataFrame with predictions and game names
predicted_sales = pd.DataFrame(predictions, columns=['NA_Sales_Predicted', 'JP_Sales_Predicted', 'EU_Sales_Predicted', 'Other_Sales_Predicted'])
predicted_sales['Game_Name'] = game_names

# Reorder columns to place 'Game_Name' first
predicted_sales = predicted_sales[['Game_Name', 'NA_Sales_Predicted', 'JP_Sales_Predicted', 'EU_Sales_Predicted', 'Other_Sales_Predicted']]

# Save to CSV
predicted_sales.to_csv('game_sales_predictions_with_names.csv', index=False)
print("Predictions with game names saved to 'game_sales_predictions_with_names.csv'.")



KeyError: 'Name'