# Importing The Libraries

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import json
import pandas as pd
import re

# Getting the game names [2000-2010]

In [None]:
# Scrape Wikipedia's "<year> in video games" pages for 2000-2010 and collect
# the game titles listed in each page's release table.
years = list(range(2000, 2011))  # 2000-2010 inclusive
game_list = []  # every scraped game title ends up here

for year in years:
    r = requests.get(
        'https://en.wikipedia.org/wiki/' + str(year) + '_in_video_games',
        timeout=30,  # never hang forever on a stalled connection
    )
    r.raise_for_status()  # stop on an HTTP error instead of parsing an error page

    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning.
    soup = bs(r.content, "html.parser")

    # Game titles are the italic (<i>) entries of the release table.
    tables = soup.find_all("table", {"class": "wikitable sortable"})
    candidates = [
        [tag.get_text() for tag in table.find_all("i")]
        for table in tables[:2]
    ]
    if not candidates:
        print("No 'wikitable sortable' table found for {}; skipping".format(year))
        continue

    # Of the first two matching tables, keep the one with more italic entries.
    # Iterating in reverse makes max() prefer the second table on a tie; a page
    # with a single matching table simply uses that table.
    game_names = max(reversed(candidates), key=len)
    game_list.extend(game_names)

# Getting the game names [2011-2021]

In [None]:
# Scrape 2011-2021. These titles are appended to the same game_list that
# already holds the 2000-2010 names.
years = list(range(2011, 2022))  # 2011-2021 inclusive

# Ids of the four quarterly section headings. ".E2.80.93" spells out the UTF-8
# bytes of the en dash in e.g. "January–March"; each dot is backslash-escaped
# because an unescaped "." would start a class selector in CSS.
quarter_ids = (
    'January\\.E2\\.80\\.93March',
    'April\\.E2\\.80\\.93June',
    'July\\.E2\\.80\\.93September',
    'October\\.E2\\.80\\.93December',
)

for year in years:
    r = requests.get(
        'https://en.wikipedia.org/wiki/' + str(year) + '_in_video_games',
        timeout=30,  # never hang forever on a stalled connection
    )
    r.raise_for_status()  # stop on an HTTP error instead of parsing an error page

    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning.
    soup = bs(r.content, "html.parser")

    for tid in quarter_ids:
        # The wikitable that immediately follows the <h3> containing the
        # heading span with this id.
        # NOTE(review): this selector depends on the page markup - verify it
        # still matches the live pages.
        table = soup.select_one("h3:has(span#{}) + table.wikitable".format(tid))
        if table is None:
            # select_one returns None when nothing matches; skip the quarter
            # rather than crash on `None.select`.
            print("No table found for section {} of {}; skipping".format(tid, year))
            continue
        # Game titles are the italic (<i>) entries of the table. extend() adds
        # them one by one; append() would add the whole list as one element.
        game_list.extend(tag.get_text() for tag in table.select("i"))

# Calling the API and getting the info

In [None]:
# Look every scraped title up on the RAWG API: search for it, take the first
# hit's slug, then fetch that game's detail record and keep selected fields.
api_url = "https://api.rawg.io/api/games"
# NOTE(review): the API key is committed in plain text. Consider loading it
# from an environment variable or an untracked config file instead.
key = "?key=e431b52cb8204c66b501a35ba93bd3ca"
api_game = "https://api.rawg.io/api/games/"  # base URL for one specific game

# Fields copied from each detail response; everything else is discarded.
list_of_keys = [
    'name_original',
    'description',
    'metacritic',
    'rating',
    'rating_top',
    'ratings',
    'parent_platforms',
    'platforms',
    'developers',
    'genres',
    'released',
    'tags',
    'publishers',
    'esrb_rating',
    'description_raw',
]
js_list = []  # one filtered dict per successfully fetched game
error_name_list = []  # titles whose lookup failed at any step

for i in game_list:
    try:
        # Pass the title through `params` so requests URL-encodes it. Pasting
        # it into the URL breaks on titles containing "&", "#" or "+".
        response = requests.get(api_url + key, params={"search": i}, timeout=30)
        response.raise_for_status()
        results = response.json().get('results')
        # The first search hit's slug identifies the game for the detail call.
        slug = results[0].get("slug")

        response_2 = requests.get(api_game + slug + key, timeout=30)
        response_2.raise_for_status()
        js_format2 = response_2.json()

        # Keep only the fields of interest (a missing field becomes None).
        js_lim = {field: js_format2.get(field) for field in list_of_keys}
        js_list.append(js_lim)
    except Exception:
        # Best effort: record the failure and carry on with the next title.
        # Unlike a bare `except:`, this still lets Ctrl-C stop the run.
        error_name_list.append(i)

# Creating DataFrame From The Information

In [None]:
## Build the DataFrame from js_list, whose items are dicts of game information.

# DataFrame column name -> key in each js_list dict that supplies its values.
column_sources = {
    "Name": 'name_original',
    "Rating": 'rating',
    "Metacritic": 'metacritic',
    "Rating_Top": 'rating_top',
    "Ratings": 'ratings',
    "Parent_Platforms": 'parent_platforms',
    "Platforms": 'platforms',
    "Developers": 'developers',
    "Publishers": 'publishers',
    "Genres": 'genres',
    "Released": 'released',
    "Tags": 'tags',
    "ESRB_Rating": 'esrb_rating',
    "Description": "description",
}

# One list of values per column, in the order declared above; a dict that
# lacks a key contributes None for that column.
df = pd.DataFrame({
    column: [record.get(source) for record in js_list]
    for column, source in column_sources.items()
})

# Cleaning The Data

### Cleaning the description

In [None]:
def clean(string):
    """Return *string* as plain ASCII text with markup and web noise removed.

    The value is first converted with ``str()``, so any object is accepted
    (``None`` becomes the text ``"None"``). The steps then run in this order:

    1. drop every non-ASCII character;
    2. replace ``@mentions`` and URLs with a space;
    3. delete numeric HTML entities such as ``&#39;``;
    4. replace ``#hashtags`` with a space;
    5. delete an apostrophe together with the word characters after it;
    6. collapse runs of two or more whitespace characters into one space;
    7. strip a fixed set of HTML tags, newlines and non-breaking spaces.

    Args:
        string: The raw description; may be any object.

    Returns:
        The cleaned text as a ``str``.
    """
    string = str(string)

    # Characters outside ASCII are dropped, e.g. "café" becomes "caf".
    string = string.encode('ascii', 'ignore').decode()

    # Mentions (@abcd).
    string = re.sub(r"@\S+", ' ', string)

    # URLs.
    string = re.sub(r"https*\S+", " ", string)

    # Numeric HTML entities. This has to run before the hashtag step, whose
    # "#\S+" pattern would otherwise consume the "#39;" part and leave a
    # stray "&" behind.
    string = re.sub(r"&#[0-9]+;?", "", string)

    # Hashtags.
    string = re.sub(r"#\S+", " ", string)

    # Apostrophe plus the word characters that follow it ("don't" -> "don").
    string = re.sub(r"'\w+", '', string)

    # Runs of whitespace.
    string = re.sub(r"\s{2,}", " ", string)

    # Literal tag/character replacements, applied in this exact order.
    replacements = (
        ('<p>', ' '),
        ("</p>", ''),
        ("<ul>", ""),
        ("</ul>", ""),
        ("<li>", ""),
        ("</li>", ""),
        ("<h3>", ""),
        ('</h3>', ''),
        ("\n", ""),
        ("<br/>", " "),
        ("<hr>", ""),
        ("\xa0", " "),
        ("<br />", ""),
        ("<pre>", ""),
        ("<code>", ""),
        ('</pre>', ""),
        ('</code>', ""),
    )
    for old, new in replacements:
        string = string.replace(old, new)

    return string

In [None]:
# Swap the raw "Description" column for its cleaned counterpart (added as the
# last column), then print a summary of the resulting frame.
cleaned_text = df["Description"].apply(clean)
df = df.drop(columns=["Description"]).assign(Clean_description=cleaned_text)
df.info()

### Dropping duplicate names

In [None]:
# Keep only the first row for each distinct "Name", then print a summary.
is_repeat = df.duplicated(subset=["Name"], keep="first")
df = df.loc[~is_repeat]
df.info()

# Saving The DataFrame

In [None]:
# Write the final dataset to disk without the index column.
df.to_csv(
    path_or_buf="Final_game_dataset.csv",
    index=False,
)

# Saving All the Names as a JSON File

In [None]:
# Dump the "Name" column to Name.json in the form {"Name": [...]}.
names_payload = {"Name": df["Name"].tolist()}
with open("Name.json", "w") as out_file:
    out_file.write(json.dumps(names_payload))