# Importing all the necessary libraries

In [None]:
import sklearn
import pandas as pd
import nltk
from ast import literal_eval
import re
import numpy as np
import pickle,gzip,pickletools

# Reading the CSV File and checking the info

In [None]:
df = pd.read_csv("Final_game_dataset.csv")
df.head()

In [None]:
df.columns

In [None]:
df.info()

# Dropping the unnecessary columns

In [None]:
df = df.drop(columns = ["Metacritic","Rating_Top","Ratings","Parent_Platforms","ESRB_Rating"])

In [None]:
df.head()

# Converting the stringified objects back into Python objects

In [None]:
features = ["Platforms","Developers","Publishers","Genres","Tags"]

for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [None]:
df["Platforms"][1]

# Filling the missing values

In [None]:
columns = ['Released',"Clean_description"]

for column in columns:
    df[column] = df[column].fillna("")

In [None]:
df.info()

# Converting every column into a list of strings

In [None]:
columns = ["Developers","Publishers","Genres","Tags"]

def get_list(obj):
    """Extract the ``"name"`` value from every entry of ``obj``.

    ``obj`` is expected to be a list of mappings that each carry a
    ``"name"`` key. Anything that is not a list is treated as malformed
    data and yields an empty list.
    """
    if not isinstance(obj, list):
        return []  # return empty list for malformed data
    return [entry["name"] for entry in obj]


# now apply the function

for column in columns:
    df[column] = df[column].apply(get_list)

In [None]:
# platforms are structured differently from the other columns

def get_plat_list(obj):
    """Extract platform names from a list of platform entries.

    Each entry is expected to hold a ``"platform"`` mapping that itself
    carries a ``"name"`` key. Anything that is not a list yields an
    empty list.
    """
    if not isinstance(obj, list):
        return []
    # The name sits one level deeper than in the other list columns.
    return [entry["platform"]["name"] for entry in obj]

df['Platforms'] = df['Platforms'].apply(get_plat_list)

In [None]:
df.head()

# Creating Index with names

In [None]:
# Lookup table: game name -> row label in df.
indices = pd.Series(df.index,index =  df["Name"])
# Series.drop_duplicates() compares the *values* (the row labels), so it
# never removes a repeated name. Deduplicate on the index instead, keeping
# the first occurrence, so that indices[name] always yields a scalar.
indices = indices[~indices.index.duplicated(keep="first")]
indices

# Creating a function to call the model and get result

In [None]:
def game_name(name,cos_sim):
    """Return the names of the games most similar to ``name``.

    Parameters
    ----------
    name : str
        Game title, looked up in the module-level ``indices`` Series.
    cos_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of the
        game at position ``i`` against every game.
        NOTE(review): the value from ``indices`` is used both as a row
        of ``cos_sim`` and as a position for ``df["Name"].iloc`` -- this
        assumes ``df`` has a default 0..n-1 index; confirm.

    Returns
    -------
    pandas.Series
        ``df["Name"]`` for up to ten other games, most similar first.

    Raises
    ------
    KeyError
        If ``name`` is not present in ``indices``.
    """
    idx = indices[name]
    # A title that appears more than once makes the lookup return a Series
    # rather than a scalar; fall back to the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the query game by position instead of assuming it sorts first:
    # with tied scores (or a zero self-similarity) another game could take
    # slot 0 and the query itself would leak into the recommendations.
    scored = [(i, score) for i, score in enumerate(cos_sim[idx]) if i != idx]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    game_indices = [i for i, _ in scored[:10]]
    return df["Name"].iloc[game_indices]

# Creating The Model

### cleaning the description

In [None]:
def clean(string):
    """Return ``string`` as text with all punctuation removed.

    The argument is converted with ``str()`` first; every character that
    is neither a word character nor whitespace is then dropped.
    """
    return re.sub(r'[^\w\s]', '', str(string))  # get rid of punctuation


df["Clean_description"] = df["Clean_description"].apply(clean)

## Creating the soup (joining the string values of all the columns together)

In [None]:
df_copy = df.copy()  # deep copy

In [None]:
def convert_to_string(obj):
    """Return ``str(obj)`` with every hyphen replaced by a space."""
    return str(obj).replace("-"," ")


df_copy["Released"] = df_copy["Released"].apply(convert_to_string)

In [None]:
df_copy["Released"]

### Preparing the data for soup

In [None]:
# clean the data (remove space and lowercase)

def clean_data(obj):
    """Lowercase ``obj`` and strip the spaces out of it.

    * a non-empty list gives a list with each item space-stripped and
      lowercased;
    * an empty list gives ``""``;
    * a string gives the space-stripped, lowercased string;
    * any other type gives ``""``.
    """
    if isinstance(obj, str):
        # An empty string passes through unchanged as "".
        return obj.replace(" ", '').lower()
    if isinstance(obj, list):
        if not obj:  # for empty list
            return ""
        return [item.replace(" ", '').lower() for item in obj]
    return ""

In [None]:
# Collapse each name/label into a single lowercase token so that multi-word
# values (e.g. "Naughty Dog") are matched as one unit by the vectorizer.
columns = ["Name","Platforms","Developers","Publishers","Genres","Tags"]

for column in columns:
    df_copy[column] = df_copy[column].apply(clean_data)

# The description is free text: stripping its spaces would fuse it into one
# giant token unique to each game, so it would add nothing to the similarity.
# Only lowercase it and keep the word boundaries intact.
df_copy["Clean_description"] = df_copy["Clean_description"].apply(str.lower)

In [None]:
df_copy.head()

### The Soup

In [None]:
def soup_with_dis(df_obj):
    """Build the space-separated "soup" string for one row.

    The list-valued fields are each joined with single spaces, then all
    fields are joined in the order: name, platforms, developers,
    publishers, genres, tags, release date, description.
    """
    parts = [df_obj["Name"]]
    for field in ("Platforms", "Developers", "Publishers", "Genres", "Tags"):
        parts.append(" ".join(df_obj[field]))
    parts.append(df_obj["Released"])
    parts.append(df_obj["Clean_description"])
    return " ".join(parts)

df_copy["Soup_with_dis"] = df_copy.apply(soup_with_dis,axis=1)

In [None]:
df_copy["Soup_with_dis"][6127]

### Passing the Soup in the model

In [None]:
# Create the model 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count_vec = CountVectorizer(stop_words="english")

count_matrix = count_vec.fit_transform(df_copy["Soup_with_dis"])
cos_sim_dis_soup = cosine_similarity(count_matrix)
cos_sim_dis_soup

## Calling the function to check the model

In [None]:
game_name("The Last of Us Part II",cos_sim_dis_soup)

# Saving the model using pickle

In [None]:
# Persist the similarity matrix as a gzip-compressed pickle.
# NOTE: the file is gzip data despite the ".pkl" suffix, so it has to be
# reopened with gzip.open(...) before unpickling.
file_path = "cos_sim_discri_soup.pkl"
with gzip.open(file_path,"wb") as f:
    # Serialize the matrix to an in-memory bytes object first ...
    pickled = pickle.dumps(cos_sim_dis_soup)
    # ... then let pickletools strip unused PUT opcodes before writing.
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)