In [65]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler

In [66]:
df = pd.read_csv('../data/filtered_reviews.csv')

display(df.head())

display(df.dtypes)

Unnamed: 0,game,author_num_games_owned,author_num_reviews,author_playtime_at_review,language,review,voted_up,votes_funny,weighted_vote_score,steam_purchase,received_for_free,written_during_early_access
0,Counter-Strike,41.0,10.0,221.0,english,cs 1.6>cs2,1,13.0,0.863652,1,0,0
1,Counter-Strike,0.0,1.0,396410.0,english,#1 PLAYED PC GAME ONLINE SINCE 1999,1,1.0,0.865922,1,0,0
2,Counter-Strike,50.0,50.0,81.0,english,"best played with a membrane keyboard, a roller...",1,31.0,0.935513,1,0,0
3,Counter-Strike,11.0,5.0,78.0,english,My friend who I play counter-strike with said ...,1,19.0,0.87565,1,0,0
4,Counter-Strike,0.0,27.0,26568.0,english,Counter-Strike won't ever be canceled. Counter...,1,2.0,0.877432,1,0,0


game                            object
author_num_games_owned         float64
author_num_reviews             float64
author_playtime_at_review      float64
language                        object
review                          object
voted_up                         int64
votes_funny                    float64
weighted_vote_score            float64
steam_purchase                   int64
received_for_free                int64
written_during_early_access      int64
dtype: object

In [67]:
# Removing duplicates and NaNs
duplicate_count = df.duplicated().sum()
print(f"Found {duplicate_count} duplicate rows.")

if duplicate_count > 0:
    df.drop_duplicates(inplace=True)
    print("Duplicates removed.")

duplicate_count = df.duplicated().sum()
print(f"\nFound {duplicate_count} duplicate rows.")


print("\nMissing values per column:")
print(df.isnull().sum())


df.dropna(inplace=True)

print("Rows with NaN values removed.")

print("\nMissing values per column:")
print(df.isnull().sum())

Found 119 duplicate rows.
Duplicates removed.

Found 0 duplicate rows.

Missing values per column:
game                           12
author_num_games_owned          0
author_num_reviews              0
author_playtime_at_review       0
language                        0
review                          8
voted_up                        0
votes_funny                     0
weighted_vote_score             0
steam_purchase                  0
received_for_free               0
written_during_early_access     0
dtype: int64
Rows with NaN values removed.

Missing values per column:
game                           0
author_num_games_owned         0
author_num_reviews             0
author_playtime_at_review      0
language                       0
review                         0
voted_up                       0
votes_funny                    0
weighted_vote_score            0
steam_purchase                 0
received_for_free              0
written_during_early_access    0
dtype: int64


In [68]:
# Dropping columbns because they dont really matter
df.drop(columns=['game', 'language'], inplace=True)

print(f"\nDropped unnecessary columns") 
display(df.dtypes)


Dropped unnecessary columns


author_num_games_owned         float64
author_num_reviews             float64
author_playtime_at_review      float64
review                          object
voted_up                         int64
votes_funny                    float64
weighted_vote_score            float64
steam_purchase                   int64
received_for_free                int64
written_during_early_access      int64
dtype: object

In [69]:
# Normalizing numeric columns 

# Identify numeric columns to scale
numeric_cols = [
    'author_num_games_owned', 
    'author_num_reviews', 
    'author_playtime_at_review', 
    'votes_funny', 
    'weighted_vote_score'
]


for col in numeric_cols:   
    if col in ['author_playtime_at_review', 'votes_funny', 'author_num_games_owned', 'author_num_reviews']:
        df[col] = np.log1p(df[col])


# Now, initialize and apply the StandardScaler as before
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("Numeric features log-transformed and scaled.")

print(df[numeric_cols].describe())

display(df.head())

Numeric features log-transformed and scaled.
       author_num_games_owned  author_num_reviews  author_playtime_at_review  \
count            1.930350e+05        1.930350e+05               1.930350e+05   
mean            -8.009641e-17        3.262751e-16               1.319235e-16   
std              1.000003e+00        1.000003e+00               1.000003e+00   
min             -1.020995e+00       -1.590167e+00              -2.217488e+00   
25%             -1.020995e+00       -7.764781e-01              -5.136467e-01   
50%              2.705615e-01       -9.782557e-02               8.780575e-02   
75%              9.045212e-01        6.469161e-01               6.259403e-01   
max              2.404858e+00        4.748153e+00               3.265479e+00   

        votes_funny  weighted_vote_score  
count  1.930350e+05         1.930350e+05  
mean  -2.709143e-17         1.225004e-15  
std    1.000003e+00         1.000003e+00  
min   -1.543637e+00        -1.388832e+00  
25%   -7.410606e-01

Unnamed: 0,author_num_games_owned,author_num_reviews,author_playtime_at_review,review,voted_up,votes_funny,weighted_vote_score,steam_purchase,received_for_free,written_during_early_access
0,0.219404,-0.327542,-0.291339,cs 1.6>cs2,1,-0.015791,0.12285,1,0,0
1,-1.020995,-1.590167,2.378096,#1 PLAYED PC GAME ONLINE SINCE 1999,1,-1.142349,0.176756,1,0,0
2,0.283838,0.808566,-0.646415,"best played with a membrane keyboard, a roller...",1,0.462803,1.829478,1,0,0
3,-0.196343,-0.776478,-0.659703,My friend who I play counter-strike with said ...,1,0.190701,0.407797,1,0,0
4,-1.020995,0.364456,1.414534,Counter-Strike won't ever be canceled. Counter...,1,-0.90761,0.4501,1,0,0


In [70]:
df.to_csv('../data/cleaned_reviews.csv', index=False)

print("Cleaned data saved to '../data/cleaned_reviews.csv'")
display(df.info)
display(df.dtypes)

Cleaned data saved to '../data/cleaned_reviews.csv'


<bound method DataFrame.info of         author_num_games_owned  author_num_reviews  author_playtime_at_review  \
0                     0.219404           -0.327542                  -0.291339   
1                    -1.020995           -1.590167                   2.378096   
2                     0.283838            0.808566                  -0.646415   
3                    -0.196343           -0.776478                  -0.659703   
4                    -1.020995            0.364456                   1.414534   
...                        ...                 ...                        ...   
193169                0.972299            1.212600                  -0.278718   
193170                0.978775            0.988220                  -1.115477   
193171               -1.020995           -0.050025                  -0.561681   
193172                0.493749           -0.476170                  -1.476130   
193173                1.650217            2.045672                  -0.993211

author_num_games_owned         float64
author_num_reviews             float64
author_playtime_at_review      float64
review                          object
voted_up                         int64
votes_funny                    float64
weighted_vote_score            float64
steam_purchase                   int64
received_for_free                int64
written_during_early_access      int64
dtype: object