# Silver Layer 🥈

## Imports ⬇️

In [68]:
import pandas as pd
import numpy as np
import string
import datetime

## Load the Bronze Layer data 🔄

In [69]:
# parse_dates automatically creates datetime formats for columns specified
df = pd.read_parquet('../Medallion Architecture/bronze/bronze_transactions.parquet')

## Generalized Data Validation and Cleaning Functions 🛠️ 
**Avoid using inplace=True; assign the result back**  
[Dtype Documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics-dtypes)

In [70]:
def drop_null_columns(df, threshold=0.2):
    """Drop columns with more than threshold of null values."""
    null_percentage = df.isnull().mean()
    columns_to_keep = null_percentage[null_percentage < threshold].index

    if len(columns_to_keep) == len(df.columns):
        print('No columns removed')
    else:
        print(f'removing: {[c for c in df.columns if c not in columns_to_keep]}')
    return df[columns_to_keep]

In [71]:
def fill_missing_values(df, columns_defaults: dict):
    """Fill missing values with assigned defaults in columns_defaults dictionary."""
    for column, default_value in columns_defaults.items():
        df[column] = df[column].fillna(default_value)
    return df

In [72]:
def convert_column_types(df, columns_types: dict):
    """Convert columns to desired data types."""
    try:
        for column, dtype in columns_types.items():
            df[column] = df[column].astype(dtype)
        return df
    except Exception as e:
        print(f'{column} caused an issue')
        raise e

In [73]:
def remove_punctuation(df, columns: list):
    '''Remove punctuation from string type columns'''
    for c in columns:
        df.loc[:,c] = df[c].str.replace(r'[^\w\s]|_', '', regex=True)
    return df

In [74]:
def check_formats(df, expected_formats: dict):
    ''' check the schema of the table to validate formats'''
    incorrect_formats = []
    for column, datatype in df.dtypes.to_dict().items():
        exepected_type = expected_formats.get(column)
        if exepected_type != datatype:
            incorrect_formats.append((column, datatype, exepected_type))
    
    incorrect_columns = [c[0] for c in incorrect_formats]
    correct_format_count = len([c for c in df.columns if c not in incorrect_columns])
    if incorrect_formats:
        print('Below are incorrect formats')
        print('-' * 50)
        print(f'Correct Column Count: {correct_format_count}')
        return pd.DataFrame(incorrect_formats, columns=['Column', 'Actual', 'Expected'])
    else:
        print('Validation Complete, no discrepancies')

In [75]:
def check_similarity(word1: str, word2: str) -> float:
    '''Check the similarity between two words using Jaccard Similarity'''
    # Convert string to a set of characters
    word_set1 = set(word1)
    word_set2 = set(word2)

    # Get the count of how many characters are in both sets (The intersection)
    intersection = word_set1.intersection(word_set2)
    intersection_count = len(intersection)

    # Get the count of total unique characters of both sets
    total_char_count = len(word_set1.union(word_set2))

    # Return the percentage of the intersection against total unique characters
    similarity = intersection_count / total_char_count
    return similarity

In [76]:
def check_mispelling(dataframe: pd.DataFrame, column: str, similarity_threshold: float) -> pd.DataFrame:
    '''
    Input a pandas dataframe, a specific column, and similarity threshold from 0-1 to get all values that are similar
    dataframe: Pandas Dataframe
    column: String representing a column from Dataframe
    similarity_threshold: Float representing values from 0 to 1 or 0 to 100% similarity
    '''
    all_unique_values = list(set(dataframe[column].tolist()))

    similarity_list = []
    for n in range(len(all_unique_values)):
        value1 = all_unique_values[n]
        for n2 in range(n + 1, len(all_unique_values)):
            value2 = all_unique_values[n2]
            similarity = round(check_similarity(value1, value2), 4)
            if similarity >= similarity_threshold:
                similarity_list.append([value1, value2, similarity])
    return pd.DataFrame(similarity_list, columns=['name1', 'name2', 'similarity'])

In [77]:
def strip_leading_trailing_spaces(df, columns: list):
    """
    Remove leading and trailing spaces from specified string columns.
    """
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip()
    return df

In [78]:
def remove_decimals(df, columns: list):
    """
    Remove decimals from specified columns in the DataFrame, preserving the sign.
    For positive numbers, returns the floor.
    For negative numbers, returns the ceil (towards zero).
    """
    for col in columns:
        if col in df.columns:
            def _remove_decimals(number):
                if isinstance(number, (int, np.integer)):
                    return number
                elif isinstance(number, (float, np.floating)):
                    return int(number)
                else:
                    return number  # Leave unchanged if not numeric
            df[col] = df[col].apply(_remove_decimals)
    return df

In [79]:
def swapcase_titlecase_columns(df, columns: list):
    """
    For each specified column, swap the case of each character and convert to title case.
    Example: 'mARIO kART' -> 'Mario Kart'
    """
    for col in columns:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: string.capwords(x.lower().swapcase()) if isinstance(x, str) else x)
    return df

## Apply Validations and Cleaning 🧼

In [80]:
df.head()

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34.0,0.0,Yes,No,1-Up Mushroom,64,26,4.26,Mario Tennis Aces
1,PeachK,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149.0,4.0,No,No,"Red Shell, Super Star",335,40,5.0,Mario Tennis Aces
2,Waluigi,,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174.0,1.0,No,Yes,Green Shell,182,57,5.5,Mario Kart 8 Deluxe
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1.0,5.0,No,Yes,1-Up Mushroom,333,84,6.0,Super Mario Bros.
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28.0,2.0,Yes,No,"Red Shell, Banana Peel, Fire Flower",461,55,7.0,Super Mario World


In [81]:
df.isnull().sum()

Player Name                      9040
Team                             2960
World                               0
Vehicle Type                        0
Companion                           0
Kart Racing Rank                 9280
Platforming Rank                    0
Boss Battle Rank                    0
Power-Ups Used                      0
Kart Role                           0
Team Points                         0
Lives Lost                          0
Participation in Battle Mode        0
Mushroom Cup Participation      12400
Power-Ups Owned                     0
Coins Spent in Toad Town            0
Levels Completed                    0
Times Hit by Enemies                0
Primary Game                        0
dtype: int64

In [82]:
df.isnull().mean() * 100

Player Name                     18.833333
Team                             6.166667
World                            0.000000
Vehicle Type                     0.000000
Companion                        0.000000
Kart Racing Rank                19.333333
Platforming Rank                 0.000000
Boss Battle Rank                 0.000000
Power-Ups Used                   0.000000
Kart Role                        0.000000
Team Points                      0.000000
Lives Lost                       0.000000
Participation in Battle Mode     0.000000
Mushroom Cup Participation      25.833333
Power-Ups Owned                  0.000000
Coins Spent in Toad Town         0.000000
Levels Completed                 0.000000
Times Hit by Enemies             0.000000
Primary Game                     0.000000
dtype: float64

In [83]:
df.columns

Index(['Player Name', 'Team', 'World', 'Vehicle Type', 'Companion',
       'Kart Racing Rank', 'Platforming Rank', 'Boss Battle Rank',
       'Power-Ups Used', 'Kart Role', 'Team Points', 'Lives Lost',
       'Participation in Battle Mode', 'Mushroom Cup Participation',
       'Power-Ups Owned', 'Coins Spent in Toad Town', 'Levels Completed',
       'Times Hit by Enemies', 'Primary Game'],
      dtype='object')

In [84]:
# drop_null_columns(df, threshold=.2)

In [85]:
{c: '' for c in list(df.columns[df.isnull().mean() * 100 > 0])}

{'Player Name': '',
 'Team': '',
 'Kart Racing Rank': '',
 'Mushroom Cup Participation': ''}

In [86]:
fill_missing_values(
    df, 
    {
        'Player Name': 'Unknown Player',
        'Team': 'No Team',
        'Kart Racing Rank': 'Unknown Track',
        'Mushroom Cup Participation': 'Unknown'
    }
)

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34.0,0.000000,Yes,No,1-Up Mushroom,64,26,4.26,Mario Tennis Aces
1,PeachK,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149.0,4.000000,No,No,"Red Shell, Super Star",335,40,5.00,Mario Tennis Aces
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174.0,1.000000,No,Yes,Green Shell,182,57,5.50,Mario Kart 8 Deluxe
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1.0,5.000000,No,Yes,1-Up Mushroom,333,84,6.00,Super Mario Bros.
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28.0,2.000000,Yes,No,"Red Shell, Banana Peel, Fire Flower",461,55,7.00,Super Mario World
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Toadette,Green Caps,Star World,Mach 8,Luma,B,A,S,8,Drifter,-84.0,0.000000,No,Yes,"1-Up Mushroom, Super Mushroom",172,43,5.00,Super Mario Bros.
47996,Bowser Jr.,Dino Buddies,Donut Plains,Circuit Special,Koopa Troopa,C,C,B,13,Speedster,110.0,1.083565,No,Unknown,Fire Flower,11,72,9.00,Super Mario World
47997,Luigi,Green Caps,dONUT pLAINS,Biddybuggy,Shy Guy,A,D,D,27,Drifter,-38.0,4.000000,Yes,Unknown,Super Star,460,25,2.00,Super Mario Odyssey
47998,Toadette,Koopa Clan,mUSHROOM kINGDOM,Circuit Special,Yoshi,B,A,B,8,Item Specialist,111.0,0.000000,Yes,Yes,"Super Star, Banana Peel, Green Shell",82,39,4.00,Mario Kart 8 Deluxe


In [87]:
df.isnull().mean() * 100

Player Name                     0.0
Team                            0.0
World                           0.0
Vehicle Type                    0.0
Companion                       0.0
Kart Racing Rank                0.0
Platforming Rank                0.0
Boss Battle Rank                0.0
Power-Ups Used                  0.0
Kart Role                       0.0
Team Points                     0.0
Lives Lost                      0.0
Participation in Battle Mode    0.0
Mushroom Cup Participation      0.0
Power-Ups Owned                 0.0
Coins Spent in Toad Town        0.0
Levels Completed                0.0
Times Hit by Enemies            0.0
Primary Game                    0.0
dtype: float64

In [88]:
df

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34.0,0.000000,Yes,No,1-Up Mushroom,64,26,4.26,Mario Tennis Aces
1,PeachK,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149.0,4.000000,No,No,"Red Shell, Super Star",335,40,5.00,Mario Tennis Aces
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174.0,1.000000,No,Yes,Green Shell,182,57,5.50,Mario Kart 8 Deluxe
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1.0,5.000000,No,Yes,1-Up Mushroom,333,84,6.00,Super Mario Bros.
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28.0,2.000000,Yes,No,"Red Shell, Banana Peel, Fire Flower",461,55,7.00,Super Mario World
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Toadette,Green Caps,Star World,Mach 8,Luma,B,A,S,8,Drifter,-84.0,0.000000,No,Yes,"1-Up Mushroom, Super Mushroom",172,43,5.00,Super Mario Bros.
47996,Bowser Jr.,Dino Buddies,Donut Plains,Circuit Special,Koopa Troopa,C,C,B,13,Speedster,110.0,1.083565,No,Unknown,Fire Flower,11,72,9.00,Super Mario World
47997,Luigi,Green Caps,dONUT pLAINS,Biddybuggy,Shy Guy,A,D,D,27,Drifter,-38.0,4.000000,Yes,Unknown,Super Star,460,25,2.00,Super Mario Odyssey
47998,Toadette,Koopa Clan,mUSHROOM kINGDOM,Circuit Special,Yoshi,B,A,B,8,Item Specialist,111.0,0.000000,Yes,Yes,"Super Star, Banana Peel, Green Shell",82,39,4.00,Mario Kart 8 Deluxe


In [89]:
remove_punctuation(df, columns=['Kart Role'])

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34.0,0.000000,Yes,No,1-Up Mushroom,64,26,4.26,Mario Tennis Aces
1,PeachK,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149.0,4.000000,No,No,"Red Shell, Super Star",335,40,5.00,Mario Tennis Aces
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174.0,1.000000,No,Yes,Green Shell,182,57,5.50,Mario Kart 8 Deluxe
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1.0,5.000000,No,Yes,1-Up Mushroom,333,84,6.00,Super Mario Bros.
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28.0,2.000000,Yes,No,"Red Shell, Banana Peel, Fire Flower",461,55,7.00,Super Mario World
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Toadette,Green Caps,Star World,Mach 8,Luma,B,A,S,8,Drifter,-84.0,0.000000,No,Yes,"1-Up Mushroom, Super Mushroom",172,43,5.00,Super Mario Bros.
47996,Bowser Jr.,Dino Buddies,Donut Plains,Circuit Special,Koopa Troopa,C,C,B,13,Speedster,110.0,1.083565,No,Unknown,Fire Flower,11,72,9.00,Super Mario World
47997,Luigi,Green Caps,dONUT pLAINS,Biddybuggy,Shy Guy,A,D,D,27,Drifter,-38.0,4.000000,Yes,Unknown,Super Star,460,25,2.00,Super Mario Odyssey
47998,Toadette,Koopa Clan,mUSHROOM kINGDOM,Circuit Special,Yoshi,B,A,B,8,Item Specialist,111.0,0.000000,Yes,Yes,"Super Star, Banana Peel, Green Shell",82,39,4.00,Mario Kart 8 Deluxe


In [90]:
# df.loc[:, 'Team'] = df['Team'].str.capitalize()

In [91]:
df.head()

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34.0,0.0,Yes,No,1-Up Mushroom,64,26,4.26,Mario Tennis Aces
1,PeachK,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149.0,4.0,No,No,"Red Shell, Super Star",335,40,5.0,Mario Tennis Aces
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174.0,1.0,No,Yes,Green Shell,182,57,5.5,Mario Kart 8 Deluxe
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1.0,5.0,No,Yes,1-Up Mushroom,333,84,6.0,Super Mario Bros.
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28.0,2.0,Yes,No,"Red Shell, Banana Peel, Fire Flower",461,55,7.0,Super Mario World


In [92]:
# 'datetime64[ns]'
expected_formats = {
    'Boss Battle Rank': 'string',
    'Coins Spent in Toad Town': 'int32',
    'Companion': 'string',
    'Kart Racing Rank': 'string',
    'Kart Role': 'string',
    'Levels Completed': 'int32',
    'Lives Lost': 'int32',
    'Mushroom Cup Participation': 'bool',
    'Participation in Battle Mode': 'bool',
    'Platforming Rank': 'string',
    'Player Name': 'string',
    'Power-Ups Owned': 'string',
    'Power-Ups Used': 'int32',
    'Primary Game': 'string',
    'Team': 'string',
    'Team Points': 'int32',
    'Times Hit by Enemies': 'int32',
    'Vehicle Type': 'string',
    'World': 'string'
}

In [93]:
check_formats(df, expected_formats=expected_formats)

Below are incorrect formats
--------------------------------------------------
Correct Column Count: 0


Unnamed: 0,Column,Actual,Expected
0,Player Name,object,string
1,Team,object,string
2,World,object,string
3,Vehicle Type,object,string
4,Companion,object,string
5,Kart Racing Rank,object,string
6,Platforming Rank,object,string
7,Boss Battle Rank,object,string
8,Power-Ups Used,int64,int32
9,Kart Role,object,string


In [94]:
convert_column_types(df, columns_types=expected_formats)

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34,0,True,True,1-Up Mushroom,64,26,4,Mario Tennis Aces
1,PeachK,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149,4,True,True,"Red Shell, Super Star",335,40,5,Mario Tennis Aces
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174,1,True,True,Green Shell,182,57,5,Mario Kart 8 Deluxe
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1,5,True,True,1-Up Mushroom,333,84,6,Super Mario Bros.
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28,2,True,True,"Red Shell, Banana Peel, Fire Flower",461,55,7,Super Mario World
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Toadette,Green Caps,Star World,Mach 8,Luma,B,A,S,8,Drifter,-84,0,True,True,"1-Up Mushroom, Super Mushroom",172,43,5,Super Mario Bros.
47996,Bowser Jr.,Dino Buddies,Donut Plains,Circuit Special,Koopa Troopa,C,C,B,13,Speedster,110,1,True,True,Fire Flower,11,72,9,Super Mario World
47997,Luigi,Green Caps,dONUT pLAINS,Biddybuggy,Shy Guy,A,D,D,27,Drifter,-38,4,True,True,Super Star,460,25,2,Super Mario Odyssey
47998,Toadette,Koopa Clan,mUSHROOM kINGDOM,Circuit Special,Yoshi,B,A,B,8,Item Specialist,111,0,True,True,"Super Star, Banana Peel, Green Shell",82,39,4,Mario Kart 8 Deluxe


In [95]:
check_formats(df, expected_formats=expected_formats)

Validation Complete, no discrepancies


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48000 entries, 0 to 47999
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Player Name                   48000 non-null  string
 1   Team                          48000 non-null  string
 2   World                         48000 non-null  string
 3   Vehicle Type                  48000 non-null  string
 4   Companion                     48000 non-null  string
 5   Kart Racing Rank              48000 non-null  string
 6   Platforming Rank              48000 non-null  string
 7   Boss Battle Rank              48000 non-null  string
 8   Power-Ups Used                48000 non-null  int32 
 9   Kart Role                     48000 non-null  string
 10  Team Points                   48000 non-null  int32 
 11  Lives Lost                    48000 non-null  int32 
 12  Participation in Battle Mode  48000 non-null  bool  
 13  Mushroom Cup Par

In [97]:
check_mispelling(df, 'Player Name', .8)

Unnamed: 0,name1,name2,similarity
0,Wapluigi,Wualuigi,0.8571
1,Wapluigi,Waluigia,0.8571
2,Wapluigi,Waluligi,0.8571
3,Wapluigi,Wailuigi,0.8571
4,Wapluigi,Walguigi,0.8571
...,...,...,...
21346,Bowser SJr.,Bowser Jr.a,0.8182
21347,Bowser SJr.,BowseRr Jr.,0.8182
21348,Bowser JFr.,Bowser Jr.a,0.8182
21349,Bowser JFr.,BowseRr Jr.,0.8182


In [98]:
NAME_CHECK = 'Mario'
COL_NAME = 'Player Name'
df[df[COL_NAME].apply(lambda x: check_similarity(x, NAME_CHECK)) > .80][COL_NAME].value_counts()

Player Name
Mario     3060
Margio       3
MariXo       3
Marioz       2
Maryio       2
          ... 
MaWrio       1
MarioV       1
Marxio       1
MarioS       1
MarWio       1
Name: count, Length: 97, dtype: Int64

In [99]:
NAME_CHECK = 'Mario'
COL_NAME = 'Player Name'
df['mario_sim'] = df.loc[:,COL_NAME].apply(check_similarity,args=('Mario',))
df.loc[df['mario_sim'] > .82, ['Player Name', 'mario_sim']].value_counts()

Player Name  mario_sim
Mario        1.000000     3060
Margio       0.833333        3
MariXo       0.833333        3
MaLrio       0.833333        2
Mariro       1.000000        2
                          ... 
Mnario       0.833333        1
Mqario       0.833333        1
Mvario       0.833333        1
Mxario       0.833333        1
Myario       0.833333        1
Name: count, Length: 97, dtype: int64

In [100]:
NAME_CHECK = 'Mario'
COL_NAME = 'Player Name'
df.loc[df[COL_NAME].apply(lambda x: check_similarity(x, NAME_CHECK)) > .80, COL_NAME] = 'Mario'

In [101]:
df[df[COL_NAME].apply(lambda x: check_similarity(x, NAME_CHECK)) > .80][COL_NAME].value_counts()

Player Name
Mario    3181
Name: count, dtype: Int64

In [102]:
# Want to rename values?
df2 = df.copy()
rename_values = {
    'DAaisy': 'Daisy'
}

df2['Player Name'] = df2['Player Name'].replace(rename_values)
sorted(df2['Player Name'].unique())

['BAowser Jr.',
 'BCowser',
 'BDowser Jr.',
 'BFowser',
 'BKowser',
 'BMowser Jr.',
 'BNowser',
 'BPowser',
 'BQowser',
 'BTowser Jr.',
 'BXowser',
 'BYowser Jr.',
 'Baowser',
 'Bbowser',
 'Beowser',
 'Beowser Jr.',
 'Bgowser',
 'Biowser',
 'Bkowser',
 'Bkowser Jr.',
 'Bmowser Jr.',
 'Bnowser',
 'BoAwser',
 'BoAwser Jr.',
 'BoCwser',
 'BoFwser',
 'BoGwser',
 'BoHwser',
 'BoKwser',
 'BoKwser Jr.',
 'BoLwser',
 'BoMwser',
 'BoMwser Jr.',
 'BoOwser',
 'BoQwser Jr.',
 'BoSwser',
 'BoVwser',
 'BoWwser',
 'Boawser',
 'Bobwser',
 'Bocwser Jr.',
 'Bodwser',
 'Boewser',
 'Bohwser',
 'Bojwser',
 'Bonwser',
 'Boowser',
 'Bopwser',
 'Borwser Jr.',
 'Boswser',
 'BowBser',
 'BowJser',
 'BowLser',
 'BowNser',
 'BowPser',
 'BowPser Jr.',
 'BowQser Jr.',
 'BowRser Jr.',
 'BowUser',
 'BowVser',
 'BowYser Jr.',
 'Bowaser',
 'Bowcser Jr.',
 'Boweser',
 'Boweser Jr.',
 'Bowfser',
 'Bowgser',
 'Bowkser',
 'Bownser',
 'Bowpser',
 'Bowqser',
 'BowsBer',
 'BowsCer',
 'BowsDer',
 'BowsGer',
 'BowsGer Jr.',
 'Bo

In [103]:
char = 'Peach'
COL_NAME = 'Player Name'
df.loc[:,COL_NAME].apply(lambda x : char if check_similarity(char, x) > .81 else x)

0             Yoshi
1             Peach
2           Waluigi
3             Yoshi
4        Bowser Jr.
            ...    
47995      Toadette
47996    Bowser Jr.
47997         Luigi
47998      Toadette
47999      Rosalina
Name: Player Name, Length: 48000, dtype: object

In [104]:
# Correct Names
CHARACTERS = [
    "Mario","Luigi","Peach","Daisy","Yoshi","Toad","Toadette",
    "Rosalina","Wario","Waluigi","Bowser","Bowser Jr."
]

COL_NAME = 'Player Name'
for char in CHARACTERS:
    df.loc[:,COL_NAME] = df.loc[:,COL_NAME].apply(lambda x : char if check_similarity(char, x) > .78 else x)

In [105]:
assert df['Player Name'].nunique() -1 == len(CHARACTERS), f'Player Names not cleaned, {len(CHARACTERS)} total, {df['Player Name'].nunique()} uniques'

In [106]:
# Clean leading and trailing spaces from relevant columns
columns_to_strip = ['Vehicle Type', 'World', 'Primary Game']
df = strip_leading_trailing_spaces(df, columns_to_strip)
df.head()

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game,mario_sim
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34,0,True,True,1-Up Mushroom,64,26,4,Mario Tennis Aces,0.25
1,Peach,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149,4,True,True,"Red Shell, Super Star",335,40,5,Mario Tennis Aces,0.1
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174,1,True,True,Green Shell,182,57,5,Mario Kart 8 Deluxe,0.222222
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1,5,True,True,1-Up Mushroom,333,84,6,Super Mario Bros.,0.25
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28,2,True,True,"Red Shell, Banana Peel, Fire Flower",461,55,7,Super Mario World,0.166667


In [107]:
# This will be automatically done during type casting
# df = remove_decimals(df, columns=['Team Points', 'Lives Lost', 'Times Hit by Enemies'])
# df.head()

In [108]:
swapcase_titlecase_columns(df, columns=['Team','Companion', 'World'])

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game,mario_sim
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,Polterpup,A,A,A,12,Drifter,-34,0,True,True,1-Up Mushroom,64,26,4,Mario Tennis Aces,0.250000
1,Peach,Green Caps,Donut Plains,Circuit Special,Koopa Troopa,C,A,B,16,Drifter,149,4,True,True,"Red Shell, Super Star",335,40,5,Mario Tennis Aces,0.100000
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174,1,True,True,Green Shell,182,57,5,Mario Kart 8 Deluxe,0.222222
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1,5,True,True,1-Up Mushroom,333,84,6,Super Mario Bros.,0.250000
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,Toad,C,C,B,10,Blocker,28,2,True,True,"Red Shell, Banana Peel, Fire Flower",461,55,7,Super Mario World,0.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Toadette,Green Caps,Star World,Mach 8,Luma,B,A,S,8,Drifter,-84,0,True,True,"1-Up Mushroom, Super Mushroom",172,43,5,Super Mario Bros.,0.222222
47996,Bowser Jr.,Dino Buddies,Donut Plains,Circuit Special,Koopa Troopa,C,C,B,13,Speedster,110,1,True,True,Fire Flower,11,72,9,Super Mario World,0.166667
47997,Luigi,Green Caps,Donut Plains,Biddybuggy,Shy Guy,A,D,D,27,Drifter,-38,4,True,True,Super Star,460,25,2,Super Mario Odyssey,0.125000
47998,Toadette,Koopa Clan,Mushroom Kingdom,Circuit Special,Yoshi,B,A,B,8,Item Specialist,111,0,True,True,"Super Star, Banana Peel, Green Shell",82,39,4,Mario Kart 8 Deluxe,0.222222


In [109]:
df.head()

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game,mario_sim
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,Polterpup,A,A,A,12,Drifter,-34,0,True,True,1-Up Mushroom,64,26,4,Mario Tennis Aces,0.25
1,Peach,Green Caps,Donut Plains,Circuit Special,Koopa Troopa,C,A,B,16,Drifter,149,4,True,True,"Red Shell, Super Star",335,40,5,Mario Tennis Aces,0.1
2,Waluigi,No Team,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174,1,True,True,Green Shell,182,57,5,Mario Kart 8 Deluxe,0.222222
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1,5,True,True,1-Up Mushroom,333,84,6,Super Mario Bros.,0.25
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,Toad,C,C,B,10,Blocker,28,2,True,True,"Red Shell, Banana Peel, Fire Flower",461,55,7,Super Mario World,0.166667


## Save as Parquet format to store column format meta data

In [110]:
df.to_parquet('../Medallion Architecture/silver/silver_transactions.parquet', index=False)

In [114]:
# --- Data Integrity Checks After Cleaning ---
import json
import numpy as np
import pandas as pd

def convert_for_json(obj):
    if isinstance(obj, (np.integer, np.floating)):
        return obj.item()
    if isinstance(obj, (np.ndarray,)):
        return obj.tolist()
    return obj

def get_metrics(df):
    metrics = {}
    metrics['rows'] = int(df.shape[0])
    metrics['columns'] = int(df.shape[1])
    metrics['unique_counts'] = {}
    metrics['sums'] = {}
    for col in df.columns:
        col_dtype = df[col].dtype
        if pd.api.types.is_categorical_dtype(col_dtype) or pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            metrics['unique_counts'][col] = int(df[col].nunique())
        elif pd.api.types.is_numeric_dtype(col_dtype):
            val = df[col].sum()
            if isinstance(val, (np.integer, np.floating)):
                val = val.item()
            metrics['sums'][col] = val
    return metrics

# Save metrics after cleaning
post_clean_metrics = get_metrics(df)

with open('silver_post_clean_metrics.json', 'w') as f_json:
    json.dump(post_clean_metrics, f_json, indent=2, default=convert_for_json)

metrics_flat = []
for col, val in post_clean_metrics['unique_counts'].items():
    metrics_flat.append({'column': col, 'type': 'unique_count', 'value': val})
for col, val in post_clean_metrics['sums'].items():
    metrics_flat.append({'column': col, 'type': 'sum', 'value': val})
metrics_flat.append({'column': 'ALL', 'type': 'rows', 'value': post_clean_metrics['rows']})
metrics_flat.append({'column': 'ALL', 'type': 'columns', 'value': post_clean_metrics['columns']})
df_metrics = pd.DataFrame(metrics_flat)
df_metrics.to_csv('silver_post_clean_metrics.csv', index=False)
print('Post-cleaning metrics saved to silver_post_clean_metrics.json and silver_post_clean_metrics.csv')

Post-cleaning metrics saved to silver_post_clean_metrics.json and silver_post_clean_metrics.csv


  if pd.api.types.is_categorical_dtype(col_dtype) or pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
