In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression

# Load the dataset from the CSV file
data = pd.read_csv('data_creation_cleaned_no_duplicates.csv')

# Parse the 'Date' column into datetime values
data['Date'] = pd.to_datetime(data['Date'])

# If 'Earnings' was not read as a numeric column, strip every character that
# is not a digit or a '.' and convert what remains to numbers.
if not pd.api.types.is_numeric_dtype(data['Earnings']):
    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the values untouched and
    # turn all of them into NaN below. The raw string keeps '\d' from being
    # read as an (invalid) string escape.
    cleaned_earnings = data['Earnings'].str.replace(r'[^\d.]', '', regex=True)
    # Anything that still cannot be parsed becomes NaN instead of raising
    data['Earnings'] = pd.to_numeric(cleaned_earnings, errors='coerce')

# Fill any missing 'Earnings' values with the column mean
if data['Earnings'].isnull().sum() > 0:
    print('Earnings column contains missing values.')
    data['Earnings'] = data['Earnings'].fillna(data['Earnings'].mean())

# Look for infinite values, restricted to the numeric columns
numeric_cols = data.select_dtypes(include=[np.number]).columns
if np.isinf(data[numeric_cols]).any().any():
    print('Data contains infinite values.')
    # Turn +/-inf into NaN first so they do not distort the column means
    data[numeric_cols] = data[numeric_cols].replace([np.inf, -np.inf], np.nan)
    # Then fill the NaN values in the numeric columns with each column's mean
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Min-Max scale every column except the ones listed here
excluded_cols = ['Date', 'Game', 'ReleaseDate', 'Genre', 'Release_Decade']
feature_frame = data.drop(columns=excluded_cols)

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(feature_frame)

# Wrap the scaled array in a DataFrame that carries the remaining column names
scaled_df = pd.DataFrame(scaled_data, columns=feature_frame.columns)

# Perform feature selection against the 'TotalEarnings' target.
# The target column is still part of scaled_df, so it is removed from the
# candidates first; otherwise it would be scored against itself.
target_col = 'TotalEarnings'
candidate_df = scaled_df.drop(columns=[target_col])

# Select the top 5 features, or all of them when fewer than 5 candidates remain
top_k = min(5, candidate_df.shape[1])
selector = SelectKBest(score_func=f_regression, k=top_k)
selected_data = selector.fit_transform(candidate_df, data[target_col])

# Map the selector's chosen column positions back to feature names
selected_features = candidate_df.columns[selector.get_support(indices=True)]
print(f'Selected Features: {selected_features}')

# Convert the selected columns back to a labelled DataFrame
selected_df = pd.DataFrame(selected_data, columns=selected_features)

# Put the 'Game' and 'Genre' labels in front of the selected feature columns.
# Inserting them at positions 0 and 1 yields the final column order directly.
selected_df.insert(0, 'Game', data['Game'])
selected_df.insert(1, 'Genre', data['Genre'])

# Show the resulting table
print(selected_df)

# Write the table to disk without the row index
output_path = 'engineer_selected_data_with_game_and_genre.csv'
selected_df.to_csv(output_path, index=False)


Selected Features: Index(['Earnings', 'Players', 'Tournaments', 'TotalPlayers',
       'TotalTournaments'],
      dtype='object')
                                      Game                 Genre  Earnings  \
0           Command and Conquer: Red Alert              Strategy  0.000374   
1                               QuakeWorld  First-Person Shooter  0.000374   
2                                 Quake II  First-Person Shooter  0.000374   
3                       Total Annihilation              Strategy  0.000374   
4                               QuakeWorld  First-Person Shooter  0.000007   
...                                    ...                   ...       ...   
9239                                  osu!   Music / Rhythm Game  0.000031   
9240                     Trackmania (2020)                Racing  0.000022   
9241                        Age of Empires              Strategy  0.000015   
9242                     Age of Empires II              Strategy  0.000012   
9243  PLAYER