# Exploratory Data Analysis for Steamit

### Importing Necessary Libraries

In [3]:
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from sklearn.decomposition import PCA




# Connect to the database
conn = sqlite3.connect('steam_games.db')

# Create a cursor object
cursor = conn.cursor()

# List all tables in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Close the connection
conn.close()

# Print the list of tables
print("Tables in the database:", tables)


Tables in the database: [('game_details',), ('game_reviews',), ('sqlite_sequence',)]


In [4]:
# Correct table name based on your database structure
game_table = 'game_details'  # Replace with the actual table name if different
review_table = 'game_reviews'  # Replace with the actual table name if different

# Reconnect to the database
conn = sqlite3.connect('steam_games.db')

# Load the data into pandas DataFrames
games_df = pd.read_sql_query(f"SELECT * FROM {game_table}", conn)
reviews_df = pd.read_sql_query(f"SELECT * FROM {review_table}", conn)

# Close the connection
conn.close()

# Display the first few rows of the dataframes
games_df.head()



Unnamed: 0,appid,name,description,price,release_date,developer,publisher,tags
0,1462940,Ecto Portal,"The next generation of spooky displays, create...",₹ 529,"16 Jan, 2021",SpookyAction,SpookyAction,"Action, Adventure"
1,1462960,Evade Zero,Evade is a fast and intense local multiplayer ...,₹ 299,"18 Dec, 2020",Jonathan Francis,Self Published,"Action, Indie, Sports"
2,1462970,Jumpman (C64/MSDOS),"Run, climb and jump from platform to platform ...",₹ 155,"19 Nov, 2020","Epyx, Inc.",Pixel Games UK,"Action, Adventure, Simulation"
3,1462980,Dinosaurs Live Wallpaper: with Secret Sandboxes,,,Coming soon,William at Oxford,William at Oxford,Utilities
4,1462990,Rushaug: Feline Warfare,A tactical platformer combining stealth and fa...,,To be announced,Tobias Edvardsen,Tobias Edvardsen,"Action, Adventure"


In [5]:
# Remove DLCs, Playtests, and Demos from games_df
filtered_games_df = games_df[~games_df['name'].str.contains('soundtrack|OST|demo|DLC|playtest', case=False, na=False)]
filtered_games_df.to_csv('filtered_games_df.csv', index=False)
print(filtered_games_df.head(), filtered_games_df.shape)

     appid                                             name  \
0  1462940                                      Ecto Portal   
1  1462960                                       Evade Zero   
2  1462970                              Jumpman (C64/MSDOS)   
3  1462980  Dinosaurs Live Wallpaper: with Secret Sandboxes   
4  1462990                          Rushaug: Feline Warfare   

                                         description  price     release_date  \
0  The next generation of spooky displays, create...  ₹ 529     16 Jan, 2021   
1  Evade is a fast and intense local multiplayer ...  ₹ 299     18 Dec, 2020   
2  Run, climb and jump from platform to platform ...  ₹ 155     19 Nov, 2020   
3                                                       N/A      Coming soon   
4  A tactical platformer combining stealth and fa...    N/A  To be announced   

           developer          publisher                           tags  
0       SpookyAction       SpookyAction              Action, Advent

In [6]:
# Filter reviews based on the filtered games_df appid
filtered_reviews_df = reviews_df[reviews_df['appid'].isin(filtered_games_df['appid'])]
print("Filtered reviews_df shape:", filtered_reviews_df.shape)

Filtered reviews_df shape: (7710, 8)


In [7]:
# Load pre-trained BERT model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get text embedding
def get_embedding(text):
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().detach().numpy()

# Generate embeddings for all game descriptions
embeddings = []
for description in filtered_games_df['description']:
    embeddings.append(get_embedding(description).flatten())

# Convert to numpy array and save
bert_item_feature_matrix = np.array(embeddings)
np.save('bert_item_feature_matrix.npy', bert_item_feature_matrix)
print("Item feature matrix has been successfully saved as 'bert_item_feature_matrix.npy")



Item feature matrix has been successfully saved as 'bert_item_feature_matrix.npy


# Perform PCA on BERT Embeddings

In [8]:
bert_item_feature_matrix.shape

(1531, 768)

In [9]:

# Assuming item_feature_matrix is already loaded
pca_3d = PCA(n_components=768)
reduced_item_feature_matrix = pca_3d.fit_transform(bert_item_feature_matrix)


In [10]:
reduced_item_feature_matrix.shape

(1531, 768)

In [11]:
np.save('reduced_item_feature_matrix.npy', reduced_item_feature_matrix)