In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the fighters dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/KyleTy1er/Build-Week-Unit-2/master/fighters_df.csv',
    index_col=0,
)

In [33]:
# my_lambdata/my_mod.py

# Check data frame for nulls:

import pandas as pd
import numpy as np


def null_checker(a, limit=50, ascending=True):
    """Count the null values in every column of ``a`` that contains any.

    Parameters
    ----------
    a : pandas.DataFrame
        Frame to inspect. It is not modified.
    limit : int or None, default 50
        Maximum number of columns to report, taken from the start of the
        sorted result. ``None`` reports every column that contains a null.
    ascending : bool, default True
        Sort direction for the counts. With the default, the columns with
        the FEWEST nulls come first, so ``limit`` keeps the least-affected
        columns; pass ``False`` to list the most-affected columns first.

    Returns
    -------
    pandas.Series
        Null counts indexed by column name, sorted by count. Columns
        without nulls are omitted, so the result is empty when ``a`` has
        no null values.
    """
    # Labels of the columns that contain at least one null value.
    null_columns = a.columns[a.isnull().any()]
    # Count the nulls in just those columns and order them by count.
    null_counts = a[null_columns].isnull().sum().sort_values(ascending=ascending)
    if limit is None:
        return null_counts
    return null_counts.head(limit)


# Function to set multiple notebook display options:

def display_mod():
    """Remove pandas' display truncation limits.

    Sets three pandas display options to ``None`` (no limit) so that
    output is shown in full. The change is global to the pandas options
    registry; nothing is returned.
    """
    unlimited_options = (
        'display.max_columns',    # output all columns of a pd.DataFrame
        'display.max_rows',       # output all rows of a pd.DataFrame
        'display.max_seq_items',  # full column list when using df.columns
    )
    for option_name in unlimited_options:
        pd.set_option(option_name, None)


# Function to retain numeric/low cardinality features while
# dropping high cardinality features and the target variable:

def feature_keeper(dataframe, target, max_cardinality=50):
    """List the columns worth keeping as model features.

    The target column is removed first. Every numeric column is kept, and
    a non-numeric column is kept only when its number of distinct values
    does not exceed ``max_cardinality``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data, including the target column. It is not modified.
    target : label
        Name of the target column to exclude from the features.
    max_cardinality : int, default 50
        Largest number of unique values a non-numeric column may have and
        still be kept.

    Returns
    -------
    list
        Numeric column names followed by the retained non-numeric column
        names, each group in the dataframe's column order.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``dataframe``.
    """
    # Work on a copy without the target so it can never appear as a feature.
    train_features = dataframe.drop(columns=[target])

    numeric_features = train_features.select_dtypes(include='number').columns.tolist()

    # Distinct-value count for every non-numeric column.
    cardinality = train_features.select_dtypes(exclude='number').nunique()
    categorical_features = cardinality[cardinality <= max_cardinality].index.tolist()

    return numeric_features + categorical_features

# Class that performs some common model preparation tasks


class Model_Prep():
    """Common feature-selection helpers for a dataframe and its target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to prepare. It is stored as given; no method modifies it.
    target : label, optional
        Name of the target column. When omitted, every column is treated
        as a potential feature.
    """

    def __init__(self, dataframe, target=None):
        self.dataframe = dataframe
        self.target = target

    def drop_target(self):
        """Return a new dataframe without the target column.

        When no target was configured there is nothing to drop, so a copy
        of the dataframe is returned (previously this failed with a
        ``KeyError`` from ``drop(columns=[None])``).

        Raises
        ------
        KeyError
            If a target is configured but is not a column of the dataframe.
        """
        if self.target is None:
            return self.dataframe.copy()
        return self.dataframe.drop(columns=[self.target])

    def numeric_features(self):
        """Return the names of all numeric columns of the dataframe.

        The whole dataframe is inspected, so a numeric target column is
        included in the result.
        """
        return self.dataframe.select_dtypes(include='number').columns.tolist()

    def cardinality(self, max_cardinality=50):
        """Return the non-numeric columns with few distinct values.

        Parameters
        ----------
        max_cardinality : int, default 50
            Largest number of unique values a column may have and still
            be returned.

        The whole dataframe is inspected, so a non-numeric target column
        is included when it is within the threshold.
        """
        unique_counts = self.dataframe.select_dtypes(exclude='number').nunique()
        return unique_counts[unique_counts <= max_cardinality].index.tolist()

    def numeric_cat_combine(self, max_cardinality=50):
        """Return the model feature names: numeric, then low-cardinality.

        Parameters
        ----------
        max_cardinality : int, default 50
            Threshold forwarded to :meth:`cardinality`.

        Returns
        -------
        list
            Numeric column names followed by the retained non-numeric
            column names. The configured target is excluded so that it
            cannot leak into the feature list.
        """
        features = self.numeric_features() + self.cardinality(max_cardinality)
        if self.target is None:
            return features
        # Filtering (rather than dropping) keeps this safe even when the
        # configured target is not a column of the dataframe.
        return [column for column in features if column != self.target]


if __name__ == "__main__":
    # Smoke test: load the fighters data and print the result of each
    # Model_Prep method once.
    df = pd.read_csv('https://raw.githubusercontent.com/KyleTy1er/Build-Week-Unit-2/master/fighters_df.csv', index_col=0)
    df_test = Model_Prep(df, 'is_winner')

    print(df_test.drop_target())
    print(df_test.numeric_features())
    print(df_test.cardinality())
    # Previously numeric_features() was printed a second time here, leaving
    # numeric_cat_combine() unexercised.
    print(df_test.numeric_cat_combine())


In [34]:
# Wrap the fighters frame, naming `is_winner` as the target column.
df_test = Model_Prep(df, target='is_winner')

In [35]:
# Show the frame with the `is_winner` target column removed.
df_test.drop_target()

Unnamed: 0,Height_cms,Reach_cms,Referee,Stance,Weight_lbs,age,avg_BODY_att,avg_BODY_landed,avg_CLINCH_att,avg_CLINCH_landed,...,total_rounds_fought_ratio,total_time_fought(seconds)_ratio,total_title_bouts_ratio,win_by_Decision_Majority_ratio,win_by_Decision_Split_ratio,win_by_Decision_Unanimous_ratio,win_by_KO/TKO_ratio,win_by_Submission_ratio,win_by_TKO_Doctor_Stoppage_ratio,wins_ratio
0,162.56,162.56,Marc Goddard,Orthodox,135.0,32.0,21.900000,16.400000,17.000000,11.000000,...,2.800000,1.768792,4.0,1.0,1.500000,5.000000,1.000000,0.5,1.0,1.800
1,165.10,167.64,Robert Madrigal,Southpaw,125.0,31.0,12.000000,7.714286,9.285714,6.857143,...,0.866667,1.250588,3.0,1.0,0.666667,1.500000,1.000000,3.0,0.5,1.200
2,180.34,193.04,Dan Miragliotta,Orthodox,155.0,35.0,13.866667,8.666667,2.866667,1.733333,...,0.492754,1.038652,1.5,1.0,2.000000,0.500000,0.363636,1.0,2.0,0.625
3,162.56,172.72,Kevin MacDonald,Orthodox,135.0,29.0,18.250000,10.250000,5.875000,4.125000,...,2.100000,1.058576,1.0,1.0,2.000000,1.666667,0.666667,1.0,1.0,1.400
4,187.96,190.50,Dan Miragliotta,Southpaw,264.0,26.0,7.750000,6.750000,11.000000,7.250000,...,0.888889,0.367818,1.0,1.0,1.000000,1.000000,3.000000,1.0,1.0,2.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5139,193.04,,Joao Alberto Barreto,Orthodox,275.0,,4.000000,3.000000,9.000000,4.000000,...,1.000000,9.666667,1.0,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.000
5140,187.96,,Joao Alberto Barreto,Orthodox,225.0,30.0,,,,,...,1.000000,,1.0,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.000
5141,185.42,,Joao Alberto Barreto,Orthodox,196.0,30.0,,,,,...,1.000000,,1.0,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.000
5142,195.58,,Joao Alberto Barreto,Orthodox,250.0,,,,,,...,1.000000,,1.0,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.000


In [36]:
# List the names of the numeric columns.
df_test.numeric_features()

['Height_cms',
 'Reach_cms',
 'Weight_lbs',
 'age',
 'avg_BODY_att',
 'avg_BODY_landed',
 'avg_CLINCH_att',
 'avg_CLINCH_landed',
 'avg_DISTANCE_att',
 'avg_DISTANCE_landed',
 'avg_GROUND_att',
 'avg_GROUND_landed',
 'avg_HEAD_att',
 'avg_HEAD_landed',
 'avg_KD',
 'avg_LEG_att',
 'avg_LEG_landed',
 'avg_PASS',
 'avg_REV',
 'avg_SIG_STR_att',
 'avg_SIG_STR_landed',
 'avg_SIG_STR_pct',
 'avg_SIG_STatt',
 'avg_SIG_STlanded',
 'avg_SIG_STpct',
 'avg_SUATT',
 'avg_SUB_ATT',
 'avg_TD_att',
 'avg_TD_landed',
 'avg_TD_pct',
 'avg_TOTAL_STR_att',
 'avg_TOTAL_STR_landed',
 'avg_TOTAL_STatt',
 'avg_TOTAL_STlanded',
 'avg_opp_BODY_att',
 'avg_opp_BODY_landed',
 'avg_opp_CLINCH_att',
 'avg_opp_CLINCH_landed',
 'avg_opp_DISTANCE_att',
 'avg_opp_DISTANCE_landed',
 'avg_opp_GROUND_att',
 'avg_opp_GROUND_landed',
 'avg_opp_HEAD_att',
 'avg_opp_HEAD_landed',
 'avg_opp_KD',
 'avg_opp_LEG_att',
 'avg_opp_LEG_landed',
 'avg_opp_PASS',
 'avg_opp_REV',
 'avg_opp_SIG_STR_att',
 'avg_opp_SIG_STR_landed',
 'avg

In [37]:
# List the non-numeric columns that have at most 50 unique values.
df_test.cardinality()

['Stance', 'is_winner', 'Stance_opponent', 'title_bout', 'weight_class']

In [38]:
# List the numeric columns followed by the low-cardinality non-numeric ones.
df_test.numeric_cat_combine()

['Height_cms',
 'Reach_cms',
 'Weight_lbs',
 'age',
 'avg_BODY_att',
 'avg_BODY_landed',
 'avg_CLINCH_att',
 'avg_CLINCH_landed',
 'avg_DISTANCE_att',
 'avg_DISTANCE_landed',
 'avg_GROUND_att',
 'avg_GROUND_landed',
 'avg_HEAD_att',
 'avg_HEAD_landed',
 'avg_KD',
 'avg_LEG_att',
 'avg_LEG_landed',
 'avg_PASS',
 'avg_REV',
 'avg_SIG_STR_att',
 'avg_SIG_STR_landed',
 'avg_SIG_STR_pct',
 'avg_SIG_STatt',
 'avg_SIG_STlanded',
 'avg_SIG_STpct',
 'avg_SUATT',
 'avg_SUB_ATT',
 'avg_TD_att',
 'avg_TD_landed',
 'avg_TD_pct',
 'avg_TOTAL_STR_att',
 'avg_TOTAL_STR_landed',
 'avg_TOTAL_STatt',
 'avg_TOTAL_STlanded',
 'avg_opp_BODY_att',
 'avg_opp_BODY_landed',
 'avg_opp_CLINCH_att',
 'avg_opp_CLINCH_landed',
 'avg_opp_DISTANCE_att',
 'avg_opp_DISTANCE_landed',
 'avg_opp_GROUND_att',
 'avg_opp_GROUND_landed',
 'avg_opp_HEAD_att',
 'avg_opp_HEAD_landed',
 'avg_opp_KD',
 'avg_opp_LEG_att',
 'avg_opp_LEG_landed',
 'avg_opp_PASS',
 'avg_opp_REV',
 'avg_opp_SIG_STR_att',
 'avg_opp_SIG_STR_landed',
 'avg