In [2]:
import pandas as pd

# Peek at the schema only: nrows=0 makes pandas parse just the header row,
# so the column names are available without loading any review data.
df = pd.read_csv(
    filepath_or_buffer=r'data\unprocessed\steam_reviews.csv',
    nrows=0,
)
print(df.columns)

Index(['Unnamed: 0', 'app_id', 'app_name', 'review_id', 'language', 'review',
       'timestamp_created', 'timestamp_updated', 'recommended',
       'votes_helpful', 'votes_funny', 'weighted_vote_score', 'comment_count',
       'steam_purchase', 'received_for_free', 'written_during_early_access',
       'author.steamid', 'author.num_games_owned', 'author.num_reviews',
       'author.playtime_forever', 'author.playtime_last_two_weeks',
       'author.playtime_at_review', 'author.last_played'],
      dtype='object')


In [10]:
# Columns the later processing step needs; every other column is discarded.
keep = [
    'review',
    'recommended',
    'author.playtime_at_review',
    'author.num_games_owned',
    'author.num_reviews',
]

# Column filtering example: select just the columns listed in `keep`.
df_filtered = df.loc[:, keep]
print(df_filtered.columns)

Index(['review', 'recommended', 'author.playtime_at_review',
       'author.num_games_owned', 'author.num_reviews'],
      dtype='object')


`author.playtime_at_review` is given in minutes. To verify, compare the raw API response (https://store.steampowered.com/appreviews/784080?json=1&language=all&userid=76561198020436202) with the same review on the Steam community page (https://steamcommunity.com/profiles/76561198020436202/recommended/784080/).

In [11]:
# Sample the first 10,000 rows and list the distinct review languages found in that sample.
df = pd.read_csv(
    r'data\unprocessed\steam_reviews.csv',
    nrows=10000,
)
print(df['language'].unique())

['schinese' 'english' 'turkish' 'spanish' 'russian' 'koreana' 'latam'
 'brazilian' 'portuguese' 'vietnamese' 'polish' 'french' 'german'
 'hungarian' 'ukrainian' 'tchinese' 'bulgarian' 'czech' 'italian' 'thai'
 'greek' 'dutch' 'finnish' 'romanian' 'japanese' 'swedish' 'danish'
 'norwegian']


In [12]:
# Keep only the rows whose language label contains "english".
# .copy() detaches the result from `df` so later edits do not touch the sample frame.
english_mask = df['language'].str.contains("english")
df_english = df.loc[english_mask].copy()
print(df_english['language'].unique())

['english']


In [14]:
# Column list already declared in an earlier cell, but repeated so this cell runs on its own.
# `new_names` gives the shorter names used from here on (same order as `keep`).
keep = ['review','recommended','author.playtime_at_review','author.num_games_owned','author.num_reviews']
new_names = ['review','sentiment','playtime','num_games','num_reviews']

# Define new column order with target (sentiment) last
columns_ordered = ['review','playtime','num_games','num_reviews','sentiment']


def _clean_chunk(chunk):
    """Return the cleaned, English-only rows of one raw chunk.

    Parameters
    ----------
    chunk : pandas.DataFrame
        A slice of the raw CSV containing at least the `keep` columns and `language`.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns in `columns_ordered` order: lower-cased,
        letters-and-whitespace-only review text, playtime converted to hours,
        and no rows with missing values or letter-less reviews.
        The input chunk is not modified.
    """
    # Filter non-english rows. na=False treats a missing language as "not English"
    # instead of raising on a mask that contains NaN.
    english = chunk[chunk['language'].str.contains("english", na=False)]

    # Drop unnecessary columns, rename the remaining ones, and reorder them.
    # rename() returns a new frame, so there is no reliance on the `inplace`
    # argument of set_axis (removed in pandas 2.0) and no write to a slice.
    cleaned = english[keep].rename(columns=dict(zip(keep, new_names)))[columns_ordered]

    # Drop rows with empty cells. This must act on the chunk being processed,
    # not on the unrelated `df` left over from an earlier cell.
    cleaned = cleaned.dropna()

    # Convert reviews to lower case, keep only letters and whitespace, and turn
    # new-lines into spaces. The pattern is a raw string so `\s` reaches the
    # regex engine rather than being read as a (invalid) string escape.
    review = (
        cleaned['review']
        .str.lower()
        .str.replace(r'[^a-z\s]', '', regex=True)
        .str.replace('\n', ' ', regex=False)
    )
    cleaned = cleaned.assign(review=review)

    # Keep only rows whose review still contains a letter (drops whitespace-only reviews)
    cleaned = cleaned[cleaned['review'].str.contains('[a-z]', na=False)]

    # Convert playtime from minutes to hours for intuitive understanding.
    # assign() builds a new frame, avoiding SettingWithCopyWarning on the filtered slice.
    return cleaned.assign(playtime=(cleaned['playtime'] / 60).round(2))


# Read the 21-million-row (~8 GB) CSV lazily in chunks. `usecols` restricts parsing
# to the columns actually used, which cuts memory and parse time per chunk.
chunks = pd.read_csv(
    r'data\unprocessed\steam_reviews.csv',
    chunksize=400000,
    usecols=keep + ['language'],
)
print("Loaded chunks")

# Process every chunk and collect the cleaned pieces
chunklist = [_clean_chunk(chunk) for chunk in chunks]

# Merge chunks into one dataframe
df = pd.concat(chunklist, ignore_index=True)

# Write dataframe to new CSV
df.to_csv(r'data\processed\reviews_processed.csv', index=False)

Loaded chunks


  chunk.review = chunk.review.str.replace('[^a-z\s]','')


In [None]:
# Sanity check: reload the first 1,000 processed rows and preview the top of the frame.
test = pd.read_csv(
    r'data\processed\reviews_processed.csv',
    nrows=1000,
)
print(test.head())

                                              review  playtime  num_games  \
0  one of the best rpgs of all time worthy of any...     92.07          5   
1                good story good graphics lots to do     13.72         11   
2                                            dis gud     69.87         27   
3  favorite game of all time cant wait for the ne...    388.82         33   
4                           why wouldnt you get this    142.62        131   

   num_reviews  sentiment  
0            3       True  
1            1       True  
2            2       True  
3            1       True  
4            2       True  
