In [1]:
import pandas as pd
import numpy as np
import ast
import os
from datetime import datetime
import re
import string
import nltk
from nltk.corpus import stopwords, opinion_lexicon
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK data packages needed by the nltk imports in this cell
# (tokenizer, stopwords, WordNet lemmatizer, opinion lexicon).
for resource in ('punkt', 'stopwords', 'wordnet', 'opinion_lexicon'):
    nltk.download(resource)

# Read the raw (partial) Steam games dataset into `df`.
df = pd.read_csv('./data/raw/steam_games.csv')

ModuleNotFoundError: No module named 'nltk'

In [21]:
# Get a concise summary of the DataFrame: index range, column names,
# per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20497 entries, 0 to 20496
Data columns (total 43 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   type                     20497 non-null  object
 1   name                     20496 non-null  object
 2   steam_appid              20497 non-null  int64 
 3   required_age             20497 non-null  object
 4   is_free                  20497 non-null  bool  
 5   detailed_description     18023 non-null  object
 6   about_the_game           18018 non-null  object
 7   short_description        18057 non-null  object
 8   fullgame                 7594 non-null   object
 9   supported_languages      18215 non-null  object
 10  header_image             20497 non-null  object
 11  capsule_image            20497 non-null  object
 12  capsule_imagev5          20497 non-null  object
 13  website                  6364 non-null   object
 14  pc_requirements          20497 non-nul

In [22]:
# Regex fragments identifying rows to exclude: DLC, playtests, early-access
# entries, and names containing 'Sex'.
# NOTE(review): matching is case-insensitive and unanchored, so 'Sex' also hits
# names such as "Essex" or "Sussex" -- confirm whether word boundaries are wanted.
patterns = ['DLC', 'Playtest', 'Early Access', 'Sex']
exclude_regex = '|'.join(patterns)

# Remove every row whose 'name' or 'type' matches any pattern. na=False makes a
# missing name/type count as "no match", so such rows are kept.
name_matches = df['name'].str.contains(exclude_regex, case=False, na=False)
type_matches = df['type'].str.contains(exclude_regex, case=False, na=False)
df_cleaned = df[~(name_matches | type_matches)]

# Per-column count of missing values after the row filter (for inspection only;
# the drop list below is hand-picked and is not derived from these counts).
missing_values = df_cleaned.isnull().sum()
print(df_cleaned.columns)

# Columns that are not used by the rest of the analysis.
columns_to_drop = [
    'type', 'steam_appid', 'required_age', 'is_free', 'about_the_game',
    'supported_languages', 'header_image', 'capsule_image', 'capsule_imagev5',
    'website', 'pc_requirements', 'mac_requirements', 'linux_requirements',
    'developers', 'package_groups', 'platforms', 'categories', 'screenshots',
    'movies', 'support_info', 'background', 'background_raw', 'legal_notice',
    'fullgame', 'demos', 'controller_support', 'reviews', 'dlc', 'achievements',
    'price_overview', 'packages', 'ext_user_account_notice', 'metacritic',
    'recommendations', 'drm_notice',
]
df_cleaned = df_cleaned.drop(columns=columns_to_drop)

# Display the columns that were dropped
print("Dropped columns:")
print(columns_to_drop)

# Replace missing descriptions with empty strings. assign() returns a new frame,
# so nothing is written into a slice of `df`.
df_cleaned = df_cleaned.assign(
    detailed_description=df_cleaned['detailed_description'].fillna(''),
    short_description=df_cleaned['short_description'].fillna(''),
)

# Select the features used downstream, in a fixed column order.
# ('metacritic' is not among them: it was dropped above.)
columns_to_keep = ['name', 'detailed_description', 'short_description', 'content_descriptors',
                   'genres', 'release_date', 'ratings', 'publishers']
# .copy() makes df_selected an independent frame so later writes to it cannot
# trigger SettingWithCopyWarning or touch df_cleaned.
df_selected = df_cleaned[columns_to_keep].copy()


Index(['type', 'name', 'steam_appid', 'required_age', 'is_free',
       'detailed_description', 'about_the_game', 'short_description',
       'fullgame', 'supported_languages', 'header_image', 'capsule_image',
       'capsule_imagev5', 'website', 'pc_requirements', 'mac_requirements',
       'linux_requirements', 'developers', 'publishers', 'package_groups',
       'platforms', 'categories', 'genres', 'screenshots', 'release_date',
       'support_info', 'background', 'background_raw', 'content_descriptors',
       'ratings', 'movies', 'controller_support', 'legal_notice', 'reviews',
       'dlc', 'price_overview', 'packages', 'demos', 'achievements',
       'recommendations', 'drm_notice', 'ext_user_account_notice',
       'metacritic'],
      dtype='object')
Dropped columns:
['type', 'steam_appid', 'required_age', 'is_free', 'about_the_game', 'supported_languages', 'header_image', 'capsule_image', 'capsule_imagev5', 'website', 'pc_requirements', 'mac_requirements', 'linux_requirement

In [11]:
# Peek at the raw 'genres' value of the row whose index label is 1
# (label-based lookup, not positional).
df_selected.loc[1, "genres"]

"[{'id': '1', 'description': 'Action'}, {'id': '28', 'description': 'Simulation'}, {'id': '2', 'description': 'Strategy'}]"