# Import the required libraries

In [64]:
import pandas as pd
import re
import json 

# **Reading Data from file**

In [65]:
# Load the raw match records from the CSV file into a DataFrame
match_df = pd.read_csv(filepath_or_buffer='./data/data02.csv')

In [66]:
# Remove the identifier column
match_df = match_df.drop('puuid', axis=1)

In [67]:
# Function to reformat augment names
def format_augment_name(augment):
    """
    Turn a raw augment identifier into a space-separated name.

    Only the text after the last underscore is kept, and that text is split
    in front of every capital letter, e.g. 'TFT9_Augment_HealingOrbsII'
    becomes 'Healing Orbs I I'. Characters that precede the first capital
    letter of that last segment are dropped, because the pattern only matches
    runs that start with a capital.

    Parameters:
    - augment: The raw identifier. Non-string values (such as the NaN that
      pandas stores for a missing cell) are returned unchanged.

    Returns:
    - The formatted name, or the input itself when it is not a string.
    """
    # A missing cell is a float NaN and has no .split(); pass it through
    # instead of raising AttributeError in the middle of .apply().
    if not isinstance(augment, str):
        return augment
    # Text after the last '_' (the whole string when there is no '_')
    last_segment = augment.rsplit('_', 1)[-1]
    # Each match is one capital letter plus the non-capitals that follow it
    return ' '.join(re.findall(r'[A-Z][^A-Z]*', last_segment))

# Run the augment-name formatter over each of the three augment columns
augment_columns = [f'augments_{slot}' for slot in range(1, 4)]
for augment_column in augment_columns:
    match_df[augment_column] = match_df[augment_column].apply(format_augment_name)

In [68]:
# Process 16 columns of trait names
def format_trait_name(trait):
    """
    Reduce a raw trait identifier to the text after its last underscore.

    Parameters:
    - trait: The raw identifier, e.g. 'Set9_Arcanist'. Non-string values
      (such as the NaN that pandas stores for a missing cell) are returned
      unchanged.

    Returns:
    - The last '_'-separated segment, or the input itself when it is not a
      string.
    """
    # Unused trait slots can be NaN (a float), which has no .split();
    # pass such values through instead of raising AttributeError.
    if not isinstance(trait, str):
        return trait
    return trait.rsplit('_', 1)[-1]

# Run the trait-name formatter over all 16 trait-name columns
for trait_column in [f'traits_{slot}_name' for slot in range(1, 17)]:
    match_df[trait_column] = match_df[trait_column].apply(format_trait_name)

In [69]:
# Process column of character names
def format_character_name(units):
    """
    Reduce a raw character identifier to the text after its last underscore.

    Parameters:
    - units: The raw identifier, e.g. 'TFT9_Ahri'. Non-string values (such
      as the NaN that pandas stores for a missing cell) are returned
      unchanged.

    Returns:
    - The last '_'-separated segment, or the input itself when it is not a
      string.
    """
    # Empty unit slots can be NaN (a float), which has no .split();
    # pass such values through instead of raising AttributeError.
    if not isinstance(units, str):
        return units
    return units.rsplit('_', 1)[-1]

# Run the character-name formatter over the 10 units_{n}_character_id columns
unit_id_columns = [f'units_{slot}_character_id' for slot in range(1, 11)]
for unit_id_column in unit_id_columns:
    match_df[unit_id_column] = match_df[unit_id_column].apply(format_character_name)

# Rename every units_{n}_character_id column to units_{n}_name in a single call
match_df.rename(
    columns={
        f'units_{slot}_character_id': f'units_{slot}_name'
        for slot in range(1, 11)
    },
    inplace=True,
)

### Define function

In [70]:
# Function to reformat item names
def format_item_name(x: str):
    """
    Turn a raw item identifier into a space-separated name.

    Only the text after the last underscore is kept, and that text is split
    in front of every capital letter, e.g. 'TFT_Item_BrambleVest' becomes
    'Bramble Vest'. Consecutive capitals are separated too ('AP' -> 'A P'),
    and characters before the first capital of the last segment are dropped.

    Parameters:
    - x (str): The raw identifier. Non-string values (such as the NaN that
      pandas stores for a missing cell) are returned unchanged.

    Returns:
    - The formatted name, or the input itself when it is not a string.
    """
    # Empty item slots can be NaN (a float), which has no .split();
    # pass such values through instead of raising AttributeError.
    if not isinstance(x, str):
        return x
    # Text after the last '_' (the whole string when there is no '_')
    last_segment = x.rsplit('_', 1)[-1]
    # Each match is one capital letter plus the non-capitals that follow it
    return ' '.join(re.findall(r'[A-Z][^A-Z]*', last_segment))

# Run the item-name formatter over every units_{1-10}_item_{1-3} column
item_columns = [
    f'units_{unit}_item_{item}'
    for unit in range(1, 11)
    for item in range(1, 4)
]
for col_name in item_columns:
    match_df[col_name] = match_df[col_name].apply(format_item_name)


# Split the dataset for easier handling

In [71]:
# Augments: the three augment columns plus the placement column
augments = match_df[[f'augments_{slot}' for slot in range(1, 4)] + ['placement']]

# Traits: columns grouped by attribute, i.e. all 16 names first, then all 16
# num_units, style, tier_current and tier_total columns
trait_attributes = ('name', 'num_units', 'style', 'tier_current', 'tier_total')
trait_columns = [
    f'traits_{i}_{attribute}'
    for attribute in trait_attributes
    for i in range(1, 17)
]
traits_df = match_df[trait_columns]

# Units: the 10 name columns followed by the 30 item columns
unit_name_part = [f'units_{i}_name' for i in range(1, 11)]
unit_item_part = [f'units_{i}_item_{j}' for i in range(1, 11) for j in range(1, 4)]
units_df = match_df[unit_name_part + unit_item_part]


# **Convert data from string to numeric format**

## Process augmentation data

In [72]:
# # Read data in standard core name format
# id_name = pd.read_csv('data/Id_augment.csv')
# name_type = pd.read_csv('data/Type.csv')

In [73]:
# Load the table that carries the 'id', 'name' and 'Type' columns used below
id_name_type = pd.read_csv(filepath_or_buffer='data/Id_name_type.csv')

### Convert to dict

In [74]:
# Two-column views of the lookup table: id/name pairs and name/Type pairs
id_name = id_name_type[['id', 'name']]
name_type = id_name_type[['name', 'Type']]

In [75]:
# Build plain lookup dicts from the two-column frames.
# NOTE: DataFrame.to_dict's `index` keyword is a boolean flag that only matters
# for the 'split' and 'tight' orients, so the former index=['id'] /
# index=['name'] arguments had no effect on the 'records' output and are omitted.

# value of the 'id' column -> value of the 'name' column
mapping_name = {
    row['id']: row['name']
    for row in id_name.to_dict(orient='records')
}

# value of the 'name' column (surrounding whitespace removed) -> 'Type' value
mapping_type = {
    row['name'].strip(): row['Type']
    for row in name_type.to_dict(orient='records')
}

In [76]:
# Read available JSON data to retrieve information for converting from core ID to name.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding (JSON files are normally UTF-8).
with open('data/Name.json', 'r', encoding='utf-8') as f:
    temp = json.load(f)

# key of temp['data'] -> the 'name' field of that entry
names = {key: entry['name'] for key, entry in temp['data'].items()}

### Define function

In [77]:
def replace_suffix(x:str):
    """
    Strip tier markers ('I', 'II', 'III', '+', '++') from a name.

    Every occurrence of 'III', 'II', '++' and '+' is removed (wherever it
    appears in the string, not only at the end), surrounding whitespace is
    trimmed, and finally a single trailing 'I' is dropped if one remains.

    Parameters:
    - x (str): The input string.

    Returns:
    - str: The string with the markers removed.
    """
    cleaned = x
    # Order matters: longer markers are removed before their shorter prefixes
    for marker in ('III', 'II', '++', '+'):
        cleaned = cleaned.replace(marker, '')
    cleaned = cleaned.strip()
    # A lone trailing 'I' is what is left of a tier-one suffix
    return cleaned[:-1].strip() if cleaned.endswith('I') else cleaned

def update_value(val):
    """
    Translate a value through the module-level lookup tables.

    `names` is consulted first and `mapping_name` second; the first table
    that contains `val` as a key supplies the result.

    Parameters:
    - val: The input value to be translated.

    Returns:
    - The mapped value, or `val` itself when neither table contains it.
    """
    for lookup in (names, mapping_name):
        if val in lookup:
            return lookup[val]
    return val


In [78]:
# Translate every cell through update_value, then convert it to a string and
# remove '+' markers and surrounding whitespace
augments = (
    augments
    .map(update_value)
    .map(lambda cell: str(cell).replace('++','').replace('+','').strip())
)

In [79]:
# Sorted, de-duplicated list of augment names with their tier markers removed
all_augments = sorted({replace_suffix(name) for name in mapping_type})

# Zero-filled frame: one row per match record, one column per augment name
augments_df = pd.DataFrame(0, index=augments.index, columns=all_augments)

# Special augment: this name shows up with its numeral split apart, so put
# the 'II' back together before the suffix handling below
augments.replace('Healing Orbs I I', 'Healing Orbs II', inplace=True)

In [80]:
# Assign values to the new dataframe
for slot in range(1, 4):
    # Series.items() yields (index label, value) pairs, so the fill works for
    # any index rather than only a default 0..n-1 one.
    for idx, ag in augments[f'augments_{slot}'].items():
        if ag.endswith('I'):
            # The value is the length of the trailing run of 'I' characters
            # (I -> 1, II -> 2, III -> 3). Counting every 'I' in the string
            # would inflate it for names with a capital I elsewhere, e.g. a
            # name such as 'Item Grab Bag I' would yield 2 instead of 1.
            count = len(ag) - len(ag.rstrip('I'))
            # Column key is the name with its tier marker removed
            augments_df.at[idx, replace_suffix(ag)] += count
        elif ag in mapping_type:
            # No trailing numeral: add the value from the 'Type' lookup
            augments_df.at[idx, ag] += mapping_type[ag]

## Process traits data

In [81]:
# Process for traits_{i}_name for i from 1 to 16
trait_name_columns = [f'traits_{i}_name' for i in range(1, 17)]
traits_df = traits_df[trait_name_columns]

# Get all unique trait names. Missing values are skipped so that no
# NaN-labelled column is created (the item names are filtered the same way).
unique_traits_names = [
    name
    for name in pd.unique(traits_df.values.ravel('K'))
    if pd.notna(name)
]

# Create a new dataframe with columns for each unique trait name
traits_encoding_df = pd.DataFrame(0, index=traits_df.index, columns=unique_traits_names)

# Populate the new dataframe: for every trait named in a row, store the value of
# the matching traits_{i}_tier_current column of match_df (cells stay 0 otherwise).
# Columns are visited in ascending order, so if a trait name appears in more
# than one slot of a row, the highest-numbered slot wins.
for name_col in trait_name_columns:
    # 'traits_3_name' -> 'traits_3_tier_current'
    tier_col = name_col.replace('name', '') + 'tier_current'
    for idx, trait in traits_df[name_col].items():
        if pd.notna(trait):
            traits_encoding_df.at[idx, trait] = match_df.at[idx, tier_col]

## Process units data

In [82]:
# Process for units_k_name where k ranges from 1 to 10
unit_name_columns = [f'units_{i}_name' for i in range(1, 11)]

# Unique unit names. Missing values are skipped so that no NaN-labelled
# column is created (the item names are filtered the same way).
unique_unit_names = [
    name
    for name in pd.unique(units_df[unit_name_columns].values.ravel('K'))
    if pd.notna(name)
]

# Create a new dataframe with columns for each unique unit name
units_encoding_df = pd.DataFrame(0, index=units_df.index, columns=unique_unit_names)

# Populate the new dataframe: for every unit named in a row, store the value of
# the matching units_{k}_tier column of match_df (cells stay 0 otherwise).
# Columns are visited in ascending order, so if a unit name appears in more
# than one slot of a row, the highest-numbered slot wins.
for name_col in unit_name_columns:
    # 'units_3_name' -> 'units_3_tier'
    tier_col = name_col.replace('name', '') + 'tier'
    for idx, unit in units_df[name_col].items():
        if pd.notna(unit):
            units_encoding_df.at[idx, unit] = match_df.at[idx, tier_col]

## Process Items data

In [83]:
# Item columns follow the pattern units_{1-10}_item_{1-3}; keep only the ones
# that are actually present in units_df
unit_item_columns = [
    f'units_{i}_item_{j}'
    for i in range(1, 11)
    for j in range(1, 4)
    if f'units_{i}_item_{j}' in units_df.columns
]

# Collect every distinct, non-null item found in those columns
unique_unit_items = [
    item
    for item in pd.unique(units_df[unit_item_columns].values.ravel('K'))
    if pd.notna(item)
]

# Zero-filled frame: one row per match record, one column per distinct item
units_items_df = pd.DataFrame(0, index=units_df.index, columns=unique_unit_items)

# Flag with 1 every item that occurs anywhere in a row's item columns
for item_column in unit_item_columns:
    for idx, item in units_df[item_column].items():
        if pd.notna(item):
            units_items_df.at[idx, item] = 1

# **Merge data**

In [84]:
# Remaining data: the match_df columns that are not part of the augment, trait
# or unit frames (these are expected to be numeric already)
remaining_columns = match_df.columns
for handled in (augments, traits_df, units_df):
    remaining_columns = remaining_columns.difference(handled.columns)
remaining = match_df[remaining_columns]

# Work on a copy so that `remaining` itself stays untouched
df = remaining.copy()

# Fields that are no longer needed because their values were already used to
# fill the encoded frames: any column whose name mentions tier, style or _num_units
drop_columns = [
    column
    for column in remaining.columns
    if any(marker in column for marker in ('tier', 'style', '_num_units'))
]

# Drop those fields from the copy
df.drop(columns=drop_columns, inplace=True)

In [85]:
# Join all parts side by side (column-wise) into a single dataframe;
# the placement column from match_df goes last
final_parts = [
    df,
    augments_df,
    traits_encoding_df,
    units_encoding_df,
    units_items_df,
    match_df['placement'],
]
final_data = pd.concat(final_parts, axis='columns')

In [86]:
# Final data: a bare expression, so the notebook renders the merged dataframe
# as the cell output
final_data

Unnamed: 0,level,A Cut Above,Accomplice,Altruist Crest,Altruist Crown,Arcanist,Arcanist Crest,Arcanist Crown,Ascension,AtWhatCost,...,Bramble Vest Radiant,Steraks Gage Radiant,Spectral Gauntlet Radiant,Jeweled Gauntlet Radiant,Quicksilver Radiant,Guardian Angel Radiant,A P,Support,A S,placement
0,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,7
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,7,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
4,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9587,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
9588,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9589,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
9590,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


# **Data to csv**

In [87]:
# Write the merged dataframe to disk with a header row and without the index
final_data.to_csv(path_or_buf='data/final.csv', header=True, index=False)