# Importing Libraries, defining paths & creating functions

In [1]:
# Importing libraries and paths

import pandas as pd
import gzip
import ast

# Creating a function to read the paths

def read_path(file):
    with gzip.open(file, 'rt', encoding='utf-8') as myfile:
        return [ast.literal_eval(line.strip()) for line in myfile]
    
# Creating a function to change datetime in columns
    
def datetime_change(var):

    if pd.isna(var):
        return None

    try:
        return pd.to_datetime(var)
    except ValueError:
        return None
    

# Creating a function to unnest data in columns
    
def unnesting(dataframe,column):
    
    df_aux = dataframe.explode(column)
    df_normal = pd.json_normalize(df_aux[column].dropna())

    df_aux.reset_index(inplace=True)
    df_normal.reset_index(inplace=True)
    dataframe = pd.concat([df_aux,df_normal],axis=1)
    dataframe.dropna(inplace=True)
    
    return dataframe

# This function changes dtype to float and deletes string values

def to_float(data):
    if pd.isna(data):
        return 0.0
        
    try:
        return float(data)
    except(ValueError, TypeError):
        return 0.0
    
# This function tell us the percentage of nulls from columns

def null_ratio(Dataset):
    null_ratio = ((Dataset.isnull().sum() / len(Dataset))*100).sort_values(ascending=False)
    return null_ratio

## First, let's begin cleaning 'steam_games.json.gz'

In [120]:
# Defining path where is data
path1 = 'Datasets/steam_games.json.gz'

In [121]:
# Reading the dataset from 'steam_games.json.gz' ↓↓↓
with gzip.open(path1, 'rt', encoding='utf-8') as file:
    df_games = pd.read_json(file, lines=True)

In [122]:
# Number of rows and columns in our dataset:
print(df_games.shape)

# Name of columns:
print(df_games.columns)

(120445, 13)
Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')


In [123]:
# We see the big picture of our dataset
df_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns"


In [124]:
# We calculate the null ratio from our dataset
nulls = null_ratio(df_games)
nulls

publisher       80.004982
developer       76.058782
genres          76.045498
release_date    75.035909
title           75.021794
price           74.463033
specs           73.876043
tags            73.455104
app_name        73.321433
reviews_url     73.321433
id              73.321433
url             73.319773
early_access    73.319773
dtype: float64

In [125]:
# We fill null values with 'No data'
df_games.fillna("No data", inplace=True)

In [126]:
# We calculate the null ratio again
nulls = null_ratio(df_games)
nulls

publisher       0.0
genres          0.0
app_name        0.0
title           0.0
url             0.0
release_date    0.0
tags            0.0
reviews_url     0.0
specs           0.0
price           0.0
early_access    0.0
id              0.0
developer       0.0
dtype: float64

In [127]:
# We delete the duplicates and see the quantity of rows and columns

df_games.drop_duplicates(subset='id', inplace=True)
df_games.shape

(32133, 13)

In [128]:
# We see our dataset again
df_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,0.0,761140.0,Kotoshiro
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,0.0,643980.0,Secret Level SRL
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,0.0,670290.0,Poolians.com
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,0.0,767400.0,彼岸领域
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns"


In [129]:
# We unnest the list from our column 'genres' in our dataset
genres = df_games.explode('genres')

In [130]:
# We see how many values there are in the unnested column
genres['genres'].value_counts()

genres
Indie                        15858
Action                       11319
Casual                        8282
Adventure                     8242
Strategy                      6957
Simulation                    6699
RPG                           5479
No data                       3283
Free to Play                  2031
Early Access                  1462
Sports                        1257
Massively Multiplayer         1108
Racing                        1083
Design &amp; Illustration      460
Utilities                      340
Web Publishing                 268
Animation &amp; Modeling       183
Education                      125
Video Production               116
Software Training              105
Audio Production                93
Photo Editing                   77
Accounting                       7
Name: count, dtype: int64

In [131]:
# We see the type of 'genres' and its shape
print(type(genres))
print(genres.shape)
genres.columns

<class 'pandas.core.frame.DataFrame'>
(74834, 13)


Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [132]:
# We do the same with our original dataframe
print(type(df_games))
print(df_games.shape)
df_games.columns

<class 'pandas.core.frame.DataFrame'>
(32133, 13)


Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [133]:
# We see the null ratio of our new dataframe 'genres' and we delete the duplicates
print(null_ratio(genres))
genres.drop_duplicates(subset='id', inplace=True)
print(genres.shape)
# Now our dataframe 'genres' has the same shape as our original dataframe 'df_games'

publisher       0.0
genres          0.0
app_name        0.0
title           0.0
url             0.0
release_date    0.0
tags            0.0
reviews_url     0.0
specs           0.0
price           0.0
early_access    0.0
id              0.0
developer       0.0
dtype: float64
(32133, 13)


In [134]:
# Let's the dtype of our columns
genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32133 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     32133 non-null  object
 1   genres        32133 non-null  object
 2   app_name      32133 non-null  object
 3   title         32133 non-null  object
 4   url           32133 non-null  object
 5   release_date  32133 non-null  object
 6   tags          32133 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         32133 non-null  object
 9   price         32133 non-null  object
 10  early_access  32133 non-null  object
 11  id            32133 non-null  object
 12  developer     32133 non-null  object
dtypes: object(13)
memory usage: 3.4+ MB


In [135]:
# Deleting useless columns for our future EDA:

genres.drop(['url','reviews_url','specs', 'id'],axis=1,inplace=True)

# I use inplace=True because I want to change the original dataframe too

# We reset the index:
genres.reset_index(inplace=True)
 
# And then, we delete it
genres.drop(columns='index', inplace=True)

In [136]:
# We see the resulting rows and columns in our dataframe 'genres'
print(genres.columns)
print(genres.shape)

Index(['publisher', 'genres', 'app_name', 'title', 'release_date', 'tags',
       'price', 'early_access', 'developer'],
      dtype='object')
(32133, 9)


In [137]:
# We see some wrong data in price and release_year
print(genres['price'], '\n', '\n')
print(genres['release_date'])

0             No data
1                4.99
2        Free To Play
3        Free to Play
4                0.99
             ...     
32128            1.99
32129            4.99
32130            1.99
32131            4.99
32132            4.99
Name: price, Length: 32133, dtype: object 
 

0           No data
1        2018-01-04
2        2018-01-04
3        2017-07-24
4        2017-12-07
            ...    
32128    2018-01-04
32129    2018-01-04
32130    2018-01-04
32131    2017-09-02
32132       No data
Name: release_date, Length: 32133, dtype: object


In [138]:
# Let's change dtype object to float and datetime in 'price' and 'release_date'
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32133 entries, 0 to 32132
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     32133 non-null  object
 1   genres        32133 non-null  object
 2   app_name      32133 non-null  object
 3   title         32133 non-null  object
 4   release_date  32133 non-null  object
 5   tags          32133 non-null  object
 6   price         32133 non-null  object
 7   early_access  32133 non-null  object
 8   developer     32133 non-null  object
dtypes: object(9)
memory usage: 2.2+ MB


In [139]:
# Let's use our function 'to_float' to transform the column 'price'
genres['price'] = genres['price'].apply(to_float)

# Let's use our function 'datetime_change' to transform the column 'release_date'
genres['release_date'] = genres['release_date'].apply(datetime_change)

  return pd.to_datetime(var)


In [140]:
# Here we see the change in Dtype of 'price' and 'release_date'
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32133 entries, 0 to 32132
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   publisher     32133 non-null  object        
 1   genres        32133 non-null  object        
 2   app_name      32133 non-null  object        
 3   title         32133 non-null  object        
 4   release_date  29892 non-null  datetime64[ns]
 5   tags          32133 non-null  object        
 6   price         32133 non-null  float64       
 7   early_access  32133 non-null  object        
 8   developer     32133 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 2.2+ MB


In [141]:
# Now we create a new column named 'release_year' to extract only the year
genres['release_year'] = genres['release_date'].dt.year

In [142]:
# Now let's see the result
genres['release_year']

0           NaN
1        2018.0
2        2018.0
3        2017.0
4        2017.0
          ...  
32128    2018.0
32129    2018.0
32130    2018.0
32131    2017.0
32132       NaN
Name: release_year, Length: 32133, dtype: float64

In [143]:
# Now we check that everything is ok
print(genres.info(), '\n', '\n')
print(genres.shape, '\n')
print(genres.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32133 entries, 0 to 32132
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   publisher     32133 non-null  object        
 1   genres        32133 non-null  object        
 2   app_name      32133 non-null  object        
 3   title         32133 non-null  object        
 4   release_date  29892 non-null  datetime64[ns]
 5   tags          32133 non-null  object        
 6   price         32133 non-null  float64       
 7   early_access  32133 non-null  object        
 8   developer     32133 non-null  object        
 9   release_year  29892 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(7)
memory usage: 2.5+ MB
None 
 

(32133, 10) 

Index(['publisher', 'genres', 'app_name', 'title', 'release_date', 'tags',
       'price', 'early_access', 'developer', 'release_year'],
      dtype='object')


In [144]:
genres['release_year']

0           NaN
1        2018.0
2        2018.0
3        2017.0
4        2017.0
          ...  
32128    2018.0
32129    2018.0
32130    2018.0
32131    2017.0
32132       NaN
Name: release_year, Length: 32133, dtype: float64

In [145]:
# We see the null ratio again
nulls = null_ratio(genres)
nulls

release_date    6.974139
release_year    6.974139
publisher       0.000000
genres          0.000000
app_name        0.000000
title           0.000000
tags            0.000000
price           0.000000
early_access    0.000000
developer       0.000000
dtype: float64

In [146]:
# We fill null data with 'No data'
genres.fillna("No data", inplace=True)
nulls = null_ratio(genres)
nulls

publisher       0.0
genres          0.0
app_name        0.0
title           0.0
release_date    0.0
tags            0.0
price           0.0
early_access    0.0
developer       0.0
release_year    0.0
dtype: float64

In [147]:
# Last but not least let's save our dataframe to a csv file

Genres = genres.to_csv('Games.csv', index=False)

## Second Step: Let's clean 'user_reviews.json.gz'

In [148]:
# Defining the path
path2 = 'Datasets/user_reviews.json.gz'

# Reading dataset
reviews = read_path(path2)


# Transforming dataset to dataframe ↓:
df_reviews = pd.DataFrame(reviews)

In [149]:
# For first, we see its shape and columns
print(df_reviews.shape)
print(df_reviews.columns)

(25799, 3)
Index(['user_id', 'user_url', 'reviews'], dtype='object')


In [150]:
# Now, we see the dataframe
df_reviews

# We can detect in our dataframe, in 'reviews' column, it has nested values

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [151]:
# We're going to unnest nested values, in this case 'reviews'
df_reviews = unnesting(df_reviews, "reviews")

In [152]:
df_reviews

Unnamed: 0,index,user_id,user_url,reviews,index.1,funny,posted,last_edited,item_id,helpful,recommend,review
0,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20...",0.0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011....",1.0,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011...",2.0,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014....",3.0,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2...",4.0,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...,...,...,...
59300,25776,Fuckfhaisjnsnsjakaka,http://steamcommunity.com/id/Fuckfhaisjnsnsjakaka,"{'funny': '', 'posted': 'Posted January 17.', ...",59300.0,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59301,25777,3214213216,http://steamcommunity.com/id/3214213216,"{'funny': '', 'posted': 'Posted February 7.', ...",59301.0,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59302,25778,ChrisCoroner,http://steamcommunity.com/id/ChrisCoroner,"{'funny': '', 'posted': 'Posted May 5.', 'last...",59302.0,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59303,25779,CaptainAmericaCw,http://steamcommunity.com/id/CaptainAmericaCw,"{'funny': '1 person found this review funny', ...",59303.0,,Posted July 20.,,730,No ratings yet,True,:D


In [154]:
# Let's see its shape and columns 
print(df_reviews.shape)
print(df_reviews.columns)

# Now we realize that we have more rows and columns

(59277, 12)
Index(['index', 'user_id', 'user_url', 'reviews', 'index', 'funny', 'posted',
       'last_edited', 'item_id', 'helpful', 'recommend', 'review'],
      dtype='object')


In [155]:
# We see if we have nulls
nulls = null_ratio(df_reviews)
nulls

index          0.0
user_id        0.0
user_url       0.0
reviews        0.0
index          0.0
funny          0.0
posted         0.0
last_edited    0.0
item_id        0.0
helpful        0.0
recommend      0.0
review         0.0
dtype: float64

In [156]:
# Let's see the date
df_reviews['posted']

0         Posted November 5, 2011.
1            Posted July 15, 2011.
2           Posted April 21, 2011.
3            Posted June 24, 2014.
4        Posted September 8, 2013.
                   ...            
59300              Posted July 10.
59301               Posted July 8.
59302               Posted July 3.
59303              Posted July 20.
59304               Posted July 2.
Name: posted, Length: 59277, dtype: object

### We are going to transform the date

In [157]:
# This piece of code deletes the word 'Posted' and extracts the month, day and year
df_reviews["posted"] = df_reviews["posted"].str.extract(r"Posted ([\w\s\d,]+)") 

In [158]:
df_reviews['posted']

0         November 5, 2011
1            July 15, 2011
2           April 21, 2011
3            June 24, 2014
4        September 8, 2013
               ...        
59300              July 10
59301               July 8
59302               July 3
59303              July 20
59304               July 2
Name: posted, Length: 59277, dtype: object

In [159]:
# This line of code rearrange the date and converts name's month to number, all of this using our function 'datetime_change'
df_reviews["posted_date"] = df_reviews["posted"].apply(datetime_change) 

In [160]:
# We see the result
df_reviews['posted_date']

0       2011-11-05
1       2011-07-15
2       2011-04-21
3       2014-06-24
4       2013-09-08
           ...    
59300          NaT
59301          NaT
59302          NaT
59303          NaT
59304          NaT
Name: posted_date, Length: 59277, dtype: datetime64[ns]

In [161]:
# Now we create a new column and we extract the year
df_reviews['posted_year'] = df_reviews["posted_date"].dt.year

In [162]:
# Here is the result
df_reviews['posted_year']

0        2011.0
1        2011.0
2        2011.0
3        2014.0
4        2013.0
          ...  
59300       NaN
59301       NaN
59302       NaN
59303       NaN
59304       NaN
Name: posted_year, Length: 59277, dtype: float64

In [163]:
# Let's see the null ratio again
nulls = null_ratio(df_reviews)
nulls

posted_date    17.063954
posted_year    17.063954
index           0.000000
user_id         0.000000
user_url        0.000000
reviews         0.000000
index           0.000000
funny           0.000000
posted          0.000000
last_edited     0.000000
item_id         0.000000
helpful         0.000000
recommend       0.000000
review          0.000000
dtype: float64

In [164]:
# Let's fill null values with 'No data'
df_reviews.fillna("No data", inplace=True)

In [170]:
# Let's see our name columns and choose what to delete
print(df_reviews.columns, '\n', '\n')
df_reviews

Index(['index', 'user_id', 'user_url', 'reviews', 'index', 'funny', 'posted',
       'last_edited', 'item_id', 'helpful', 'recommend', 'review',
       'posted_date', 'posted_year'],
      dtype='object') 
 



Unnamed: 0,index,user_id,user_url,reviews,index.1,funny,posted,last_edited,item_id,helpful,recommend,review,posted_date,posted_year
0,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20...",0.0,,"November 5, 2011",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05 00:00:00,2011.0
1,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011....",1.0,,"July 15, 2011",,22200,No ratings yet,True,It's unique and worth a playthrough.,2011-07-15 00:00:00,2011.0
2,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011...",2.0,,"April 21, 2011",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,2011-04-21 00:00:00,2011.0
3,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014....",3.0,,"June 24, 2014",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24 00:00:00,2014.0
4,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2...",4.0,,"September 8, 2013",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,2013-09-08 00:00:00,2013.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59300,25776,Fuckfhaisjnsnsjakaka,http://steamcommunity.com/id/Fuckfhaisjnsnsjakaka,"{'funny': '', 'posted': 'Posted January 17.', ...",59300.0,,July 10,,70,No ratings yet,True,a must have classic from steam definitely wort...,No data,No data
59301,25777,3214213216,http://steamcommunity.com/id/3214213216,"{'funny': '', 'posted': 'Posted February 7.', ...",59301.0,,July 8,,362890,No ratings yet,True,this game is a perfect remake of the original ...,No data,No data
59302,25778,ChrisCoroner,http://steamcommunity.com/id/ChrisCoroner,"{'funny': '', 'posted': 'Posted May 5.', 'last...",59302.0,1 person found this review funny,July 3,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,No data,No data
59303,25779,CaptainAmericaCw,http://steamcommunity.com/id/CaptainAmericaCw,"{'funny': '1 person found this review funny', ...",59303.0,,July 20,,730,No ratings yet,True,:D,No data,No data


In [171]:
# Now we delete useless columns
df_reviews.drop(["reviews", "last_edited","index","posted"], axis=1, inplace=True)

In [177]:
# Let's take one last look
print(df_reviews.shape, '\n','\n')
df_reviews.info()

(59277, 9) 
 

<class 'pandas.core.frame.DataFrame'>
Index: 59277 entries, 0 to 59304
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      59277 non-null  object
 1   user_url     59277 non-null  object
 2   funny        59277 non-null  object
 3   item_id      59277 non-null  object
 4   helpful      59277 non-null  object
 5   recommend    59277 non-null  object
 6   review       59277 non-null  object
 7   posted_date  59277 non-null  object
 8   posted_year  59277 non-null  object
dtypes: object(9)
memory usage: 4.5+ MB


In [178]:
print(null_ratio(df_reviews))

user_id        0.0
user_url       0.0
funny          0.0
item_id        0.0
helpful        0.0
recommend      0.0
review         0.0
posted_date    0.0
posted_year    0.0
dtype: float64


In [179]:
# Finall saving the progress in a new .csv file
Reviews = df_reviews.to_csv("Reviews.csv", index=False)

## Third step: Cleaning 'users_items.json.gz'

In [None]:
# Defining the path and reading the data
path3 = 'Datasets/users_items.json.gz'
items = read_path(path3)

In [None]:
# Transforming the data to a dataframe
df_items = pd.DataFrame(items)

In [None]:
df_items.unstack()

In [None]:
df_items = unnesting(df_items, "items")

In [None]:
df_items.to_csv("DF Items.csv", index=False)

In [None]:
DF_Items = pd.read_csv("DF Items.csv")

In [None]:
DF_Items.drop(['items', 'index'], axis=1, inplace=True)

In [None]:
df_items.drop(['items', 'index'], axis=1, inplace=True)

In [None]:
df_items = df_items.to_parquet("Items.parquet", index=False)

In [None]:
DF_Items.to_csv("DF Items.csv", index=False)

Why did I choose Parquet? This is related to the structure of the dataset itself.


# General Information

In [None]:
DF_Items = pd.read_csv("DF Items.csv")
DF_Items.info()

In [None]:
Items = pd.read_parquet("Items.parquet")
Items.info()

In [None]:
Games = pd.read_csv('Games.csv')
Games.info()

In [None]:
Reviews = pd.read_csv("Reviews.csv")
Reviews.info()