In [83]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import json
import re
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [72]:
project_path = '/content/drive/MyDrive/Colab Notebooks/PatternMining'

# Load dataset

In [73]:
# file_path = f'{project_path}/datasets/all_video_games(cleaned).csv'
file_path = f'{project_path}/datasets/merged.csv'
data = pd.read_csv(file_path)

# PreProcess

In [74]:
# Set the allowable limit for missing values
allowable_limit = len(data) * 0.8  # You can adjust this threshold as per your preference

# Iterate over each column
for column in data.columns:
    # Count the missing values in each column
    missing_count = data[column].isnull().sum()

    # If missing values exceed the allowable limit, drop the column
    if missing_count > allowable_limit:
        data = data.drop(column, axis=1)
        print(f"Dropped column '{column}' due to exceeding allowable limit of missing values.")
    else:
        # For numerical columns, fill missing values with mean
        if data[column].dtype in ['int64', 'float64']:
            data.loc[:, column] = data[column].fillna(data[column].mean())
        else:
            data.loc[:, column] = data[column].fillna(data[column].mode()[0])


print("Missing values filled successfully.")
# print(data)


Dropped column 'jp_sales' due to exceeding allowable limit of missing values.
Missing values filled successfully.


# Manual preProcess

## Sales process

In [75]:
max_value = data['total_sales'].max()
min_value = data['total_sales'].min()
for column in ['total_sales', 'pal_sales', 'other_sales']:
    data[column] = (data[column] - min_value) / (max_value - min_value)
    # print(data[column])

In [76]:
data['other_sales'] = data['total_sales'] - data['pal_sales'] - data['na_sales']

## Platforms info process

In [77]:
def compute_max_metascore_info(informations):
    informations = informations.replace("'", '"')
    informations = json.loads(informations)
    max_metascore = 0
    max_metascore_count = 0
    for info in informations:
        pattern = r'\d+'

        metascore_count = 0
        metascore = 0

        if 'Platform Metascore Count' in info:
            match = re.search(pattern, info['Platform Metascore Count'])
            if match:
                metascore_count = int(match.group())

        if 'Platform Metascore' in info:
            match = re.search(pattern, info['Platform Metascore'])
            if match:
                metascore = int(match.group())

        if metascore_count > max_metascore_count:
            max_metascore_count = metascore_count
            max_metascore = metascore

    return max_metascore_count, max_metascore

In [78]:
# Apply the function to each row in the 'platforms info' column
max_metascore_info = data['platforms info'].apply(compute_max_metascore_info)

# Split the resulting tuples into separate columns
data[['max_metascore_count', 'max_metascore']] = pd.DataFrame(max_metascore_info.tolist(), index=data.index)

data.drop(columns=['platforms info'], inplace=True)
# Display the updated DataFrame
# print(data)

# Outlires

In [80]:
def remove_outliers(data, column):
  # Calculate the quantiles for Winsorization
  q1 = data[column].quantile(0.25)
  q3 = data[column].quantile(0.75)

  # Define the threshold for extreme values
  threshold = 1.5 * (q3 - q1)

  # Apply Winsorization
  data[column] = data[column].clip(lower=data[column].quantile(0.05), upper=data[column].quantile(0.95))
  return data

In [81]:
data = remove_outliers(data, 'na_sales')
data = remove_outliers(data, 'pal_sales')
data = remove_outliers(data, 'other_sales')
data = remove_outliers(data, 'max_metascore')
data = remove_outliers(data, 'max_metascore_count')

# Save dataset

In [82]:
data.to_csv(f'{project_path}/datasets/merged_cleaned.csv', index=False)

# Load Cleaned

In [51]:
data = pd.read_csv(f'{project_path}/datasets/merged_cleaned.csv')

In [84]:
data_one_hot = pd.get_dummies(data)

In [85]:
print(data_one_hot.head())

   critic_score  total_sales  na_sales  pal_sales  other_sales  user score  \
0      9.400000     1.000000  0.520000    0.01624    -0.485217         8.4   
1      9.700000     0.954232  0.520000    0.01624    -0.485217         8.4   
2      7.368743     0.780512  0.520000    0.01624    -0.485217         8.4   
3      9.000000     0.429134  0.520000    0.01624    -0.485217         8.4   
4      7.368743     0.025265  0.309307    0.00923    -0.293273         8.4   

   user ratings count  max_metascore_count  max_metascore  \
0              9576.0                   66             89   
1              9576.0                   66             89   
2              9576.0                   66             89   
3              9576.0                   66             89   
4              9576.0                   66             89   

   img_/games/boxart/1000581ccc.jpg  ...  genres splitted_['Visual', 'Novel']  \
0                             False  ...                                False   
1 

In [None]:
# Ensure all values are binary (0 or 1)
data_one_hot = data_one_hot.applymap(lambda x: 1 if x >= 1 else 0)

In [None]:



# Perform frequent pattern extraction using apriori algorithm
min_support_threshold = 0.1
frequent_itemsets = apriori(data_one_hot, min_support=min_support_threshold, use_colnames=True)

# Display the frequent itemsets
print(frequent_itemsets)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display the rules
print(rules)


  and should_run_async(code)


     support                                           itemsets
0   0.832586                                       (User Score)
1   0.803913                               (User Ratings Count)
2   0.115831      (Product Rating_Rated E +10 For Everyone +10)
3   0.244397              (Product Rating_Rated E For Everyone)
4   0.148630                (Product Rating_Rated M For Mature)
5   0.272857                  (Product Rating_Rated T For Teen)
6   0.803059                   (User Ratings Count, User Score)
7   0.100747  (Product Rating_Rated E +10 For Everyone +10, ...
8   0.182782  (Product Rating_Rated E For Everyone, User Score)
9   0.142938    (Product Rating_Rated M For Mature, User Score)
10  0.244895      (Product Rating_Rated T For Teen, User Score)
11  0.170046  (User Ratings Count, Product Rating_Rated E Fo...
12  0.141089  (User Ratings Count, Product Rating_Rated M Fo...
13  0.237211  (User Ratings Count, Product Rating_Rated T Fo...
14  0.169833  (Product Rating_Rated E Fo