In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import numpy as np

In [2]:
df = pd.read_csv('../data/steam.csv')

# Preprocessing & Feature Engineering

In [3]:
def owners_to_num(x):
    low, high = x.split("-")
    return (int(low) + int(high)) / 2

df["owners_num"] = df["owners"].apply(owners_to_num)

In [4]:
df_processed = df.drop(columns=['appid', 'name', 'release_date', 'developer', 'publisher', 'platforms', 'owners'], errors='ignore')

In [None]:
text_cols = ['categories', 'genres', 'steamspy_tags']
df_final = df_processed.copy()

for col in text_cols:
    if col in df_final.columns:

        df_final[col] = df_final[col].fillna('').astype(str).str.split(';')
        mlb = MultiLabelBinarizer()

        encoded_cols = pd.DataFrame(
            mlb.fit_transform(df_final[col]),
            columns=[f"{col}_{c}" for c in mlb.classes_],
            index=df_final.index
        )
        df_final = pd.concat([df_final, encoded_cols], axis=1)
        df_final.drop(columns=[col], inplace=True)

In [7]:
df_final['english'] = df_final['english'].astype(int)

In [None]:
numerical_features = [
    'required_age', 'achievements', 'positive_ratings', 'negative_ratings',
    'average_playtime', 'median_playtime', 'owners_num', 'price'
]

In [9]:
for col in ['positive_ratings', 'negative_ratings', 'owners_num', 'average_playtime', 'median_playtime']:
    if col in df_final.columns:
        df_final[col + '_log'] = np.log1p(df_final[col])
        numerical_features.remove(col)
        numerical_features.append(col + '_log')

In [10]:
df_final.drop(columns=['positive_ratings', 'negative_ratings', 'owners_num', 'average_playtime', 'median_playtime'], errors='ignore', inplace=True)

# Scaling Numerical Features

In [11]:
scaler = StandardScaler()

final_numerical_cols = [col for col in numerical_features if col in df_final.columns]

df_final[final_numerical_cols] = scaler.fit_transform(df_final[final_numerical_cols])

# Linear Regression 

In [12]:
X_reg = df_final[final_numerical_cols].drop(columns=['average_playtime_log'], errors='ignore')
y_reg = df_final['average_playtime_log']

In [13]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

In [14]:
linear_model = LinearRegression()
linear_model.fit(X_train_reg, y_train_reg)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [16]:
y_pred_reg = linear_model.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2_reg = linear_model.score(X_test_reg, y_test_reg)

print(f"MSE (Log Playtime): {mse:.4f}")
print(f"R-squared: {r2_reg:.4f}")

MSE (Log Playtime): 0.0086
R-squared: 0.9918


# Decision Tree Classifier 

In [17]:
playtime_original = np.expm1(df_final['average_playtime_log'])
addictive_threshold = playtime_original.quantile(0.75)

In [18]:
df_final['is_addictive'] = (playtime_original > addictive_threshold).astype(int)

In [19]:
X_cls = df_final.drop(columns=['average_playtime_log', 'is_addictive', 'median_playtime_log', 'english', 'price', 'required_age'], errors='ignore')
y_cls = df_final['is_addictive']

In [20]:
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls, test_size=0.3, random_state=42, stratify=y_cls)

In [21]:
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
dt_model.fit(X_train_cls, y_train_cls)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [22]:
y_pred_cls = dt_model.predict(X_test_cls)
accuracy = accuracy_score(y_test_cls, y_pred_cls)

print(f"(Accuracy): {accuracy:.4f}")

(Accuracy): 0.8909
