# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Load the raw games dataset from the current working directory.
data = pd.read_csv('games.csv')
# Preview the first five rows; the result is neither assigned nor printed here.
data.head(n=5)

#Normalize the column names
def to_snake(s):
    """Return *s* as a lower-case, underscore-separated name.

    Surrounding whitespace is stripped first; then ``/``, ``-``, ``.`` and
    spaces each become ``_``, parentheses are removed, and the text is
    lower-cased. Consecutive separators are not collapsed, so ``"a  b"``
    becomes ``"a__b"``.
    """
    separators = "/-. "
    # One translation pass: separators map to "_" and parentheses are deleted.
    table = str.maketrans(separators, "_" * len(separators), "()")
    return s.strip().translate(table).lower()

# Rename every column to its snake_case form.
data.columns = data.columns.map(to_snake)

#Converting K to Numbers
def convert_k_to_number(x):
    """Convert a count such as ``"3.9K"`` or ``"800"`` to a number.

    Args:
        x: A string, optionally ending in ``K``/``k`` to denote thousands,
            or an already-numeric value. ``None`` and NaN mark a missing cell.

    Returns:
        A float for K-suffixed strings, NaN for missing values, and an int
        for everything else.

    Raises:
        ValueError: If a string cannot be parsed as a number.
    """
    # int(NaN) raises, so pass missing cells through as NaN and let the
    # later null-handling step fill them.
    if x is None or (isinstance(x, float) and x != x):
        return float('nan')
    if isinstance(x, str):
        text = x.strip()
        if text[-1:] in ('K', 'k'):
            return float(text[:-1]) * 1000
        try:
            return int(text)
        except ValueError:
            # int("12.0") is rejected; go through float for decimal strings.
            return int(float(text))
    return int(x)
# Count-like columns may carry a "K" suffix; convert whichever of them exist.
for col in ('times_listed', 'number_of_reviews', 'plays', 'playing', 'backlogs', 'wishlist'):
    if col not in data.columns:
        continue
    data[col] = data[col].apply(convert_k_to_number)

# Parse release dates (unparsable values become NaT via errors='coerce') and
# derive the number of days between the earliest release in the dataset and
# each row's release.
# NOTE(review): the column name suggests "days since release until today",
# but the offset is measured from the earliest release date - confirm intent.
data['release_date'] = pd.to_datetime(data['release_date'], errors='coerce')

earliest_release = data['release_date'].min()
release_offset = data['release_date'] - earliest_release
data['time_since_release'] = release_offset.dt.days

# Remove rows that are exact duplicates across every column and report how
# many were dropped. The count comes from the row-count difference, so no
# separate duplicated() scan is needed.
before = len(data)
data.drop_duplicates(inplace=True)
print(f"Removed {before - len(data)} exact duplicate rows")


#split to numerical and categorical
# .copy() gives each subset its own data, so the column assignments below
# never write into (or warn about) a slice of `data`.
# NOTE(review): only float columns count as numerical here; integer and
# datetime columns land in neither subset - confirm that is intended.
data_num = data.select_dtypes('float').copy()
data_cat = data.select_dtypes('object').copy()

#Handle the null values
# Numerical gaps take the column median.
for i in data_num.columns:
    data_num[i] = data_num[i].fillna(data_num[i].median())

# Categorical gaps take the most frequent value. A column with no non-null
# values has no mode and is left untouched instead of raising on [0].
for i in data_cat.columns:
    modes = data_cat[i].mode()
    if not modes.empty:
        data_cat[i] = data_cat[i].fillna(modes.iloc[0])

#Genre column handling
def parse_genres(genre_string):
    """Parse a stringified list such as ``"['RPG', 'Indie']"`` into a list.

    Args:
        genre_string: Raw cell value from the ``genres`` column.

    Returns:
        The parsed items as a list. A bare string literal becomes a
        one-element list. Input that is not a valid Python literal, or that
        parses to any other type, yields ``[]``.
    """
    try:
        parsed = ast.literal_eval(genre_string)
    except (ValueError, SyntaxError, TypeError):
        return []
    # Always hand back a list so callers can safely use [0] and [1:].
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, str):
        return [parsed]
    return []

# Parse each stringified genre list into a real Python list.
data_cat['genres_list'] = data_cat['genres'].map(parse_genres)

# Split into main genre and sub-genres
def split_genre_subgenre(genre_list):
    """Split a genre list into ``(first genre, remaining genres)``.

    An empty (or otherwise falsy) input yields ``(None, [])``.
    """
    if not genre_list:
        return None, []
    return genre_list[0], genre_list[1:]

# Each entry of genre_data is a (main genre, sub-genres) pair.
genre_data = data_cat['genres_list'].apply(split_genre_subgenre)

data_cat['genre'] = genre_data.apply(lambda pair: pair[0])
data_cat['sub_genres'] = genre_data.apply(lambda pair: pair[1])

# The raw and intermediate genre columns are no longer needed.
data_cat = data_cat.drop(columns=['genres', 'genres_list'])

#Handle the teams column
def parse_teams(teams_string):
    """Parse a stringified list such as ``"['Nintendo', 'HAL']"`` into a list.

    Args:
        teams_string: Raw cell value from the ``team`` column.

    Returns:
        The parsed items as a list. A bare string literal becomes a
        one-element list. Input that is not a valid Python literal, or that
        parses to any other type, yields ``[]``.
    """
    try:
        # The cell holds a Python list literal; evaluate it safely.
        parsed = ast.literal_eval(teams_string)
    except (ValueError, SyntaxError, TypeError):
        return []
    # Always hand back a list so callers can safely use [0] and [1:].
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, str):
        return [parsed]
    return []

# Parse each stringified team list into a real Python list.
data_cat['team_list'] = data_cat['team'].map(parse_teams)

# Split a parsed team list into the first team and the remaining teams
def split_team_subteam(team_list):
    """Split a team list into ``(first team, remaining teams)``.

    Args:
        team_list: Sequence of team names; may be empty or ``None``.

    Returns:
        A ``(team, sub_teams)`` tuple. ``team`` is the first entry and
        ``sub_teams`` holds the rest. Empty input yields ``(None, [])``,
        the same shape as the non-empty case.
    """
    # An empty list has no .strip(); return the empty result instead of
    # raising AttributeError.
    if not team_list:
        return None, []
    return team_list[0], team_list[1:]

# Each entry of team_data is a (team, sub_teams) pair.
team_data = data_cat['team_list'].apply(split_team_subteam)

# Overwrite the raw 'team' text with the first team; keep the rest as 'sub_team'.
data_cat['team'] = team_data.apply(lambda pair: pair[0])
data_cat['sub_team'] = team_data.apply(lambda pair: pair[1])

# The intermediate parsed list is no longer needed.
data_cat = data_cat.drop(columns=['team_list'])

#Merge the numerical and categorical data
# Both frames keep the index of `data`, so the default index join lines the
# rows up one-to-one.
merged_data = data_num.join(data_cat)

#Change the order of the columns as required
# The main-genre column is named "genre" in data_cat; the previous
# "main_genre" entry never matched, so it was not placed in this order.
preferred_columns = [
    "title", "team", "sub_team", "summary", "genre", "sub_genres",
    "rating", "times_listed", "number_of_reviews", "plays", "playing",
    "backlogs", "wishlist", "time_since_release", "reviews",
]
preferred_order = [c for c in preferred_columns if c in merged_data.columns]
# Columns not named above keep their relative order after the preferred ones.
remaining_columns = [c for c in merged_data.columns if c not in preferred_order]
data = merged_data[preferred_order + remaining_columns]


data.to_csv('cleaned_games.csv', index=False)


print("Merged DataFrame created and saved to 'cleaned_games.csv'")