**Note: In order to use these functions, you have to import the corresponding packages.**

## Data Science Functions

Author: Junze He

### Exploratory data analysis

Packages: matplotlib, seaborn, statsmodels, ConfusionMatrixDisplay

#### Graphs

In [24]:
# Render a confusion matrix as a blue-shaded plot.
def confusionMatrixPlot(cm, labels):
    """Draw the confusion matrix ``cm`` using ``labels`` as the display labels.

    ``cm`` may come from, for example, ``model.pred_table(threshold=0.5)``.
    Nothing is returned; the plot is produced as a side effect.
    """
    ConfusionMatrixDisplay(cm, display_labels=labels).plot(cmap=plt.cm.Blues)

# Draw one count plot per column of a dataframe, arranged on a grid of subplots.
def maxtrixCountPlot(input_data):
    """Plot the value counts of every column of ``input_data`` on a subplot grid.

    The grid is three plots wide and gets as many rows as the number of
    columns requires, so any column count is supported. Grid cells left
    over on the last row are removed from the figure.

    Parameters
    ----------
    input_data : DataFrame-like
        Object exposing ``.columns`` and column access by label; each column
        is drawn with ``sns.countplot``.

    Returns
    -------
    None
        The figure is produced as a side effect.
    """
    cols = 3
    column_names = list(input_data.columns)
    if not column_names:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    rows = (len(column_names) + cols - 1) // cols  # ceiling division
    # 3.75 inches per row reproduces the previous 20x15 figure for a 4-row grid.
    # squeeze=False keeps `axe` two-dimensional even when there is a single row.
    fig, axe = plt.subplots(rows, cols, figsize=(20, 3.75 * rows), squeeze=False)
    cells = axe.ravel()  # row-major order: fill left to right, then top to bottom

    for cell, name in zip(cells, column_names):
        sns.countplot(x=input_data[name],
                      ax=cell,
                      hue=input_data[name],
                      palette="flare")

    # Drop only the cells that received no plot.
    for unused_cell in cells[len(column_names):]:
        fig.delaxes(unused_cell)

#### Statsmodels

In [21]:
# Logistic regression through statsmodels, with an added constant column.
def statsLogModel(x, y):
    """Fit ``sm.Logit`` of ``y`` on ``x`` plus a constant and return the fit result."""
    design_matrix = sm.add_constant(x)
    return sm.Logit(y, design_matrix).fit()

### Data Preprocessing

Packages: StandardScaler, RandomOverSampler, RandomUnderSampler

In [13]:
# Separate a dataframe into its numerical and its categorical columns.
def split_types(data):
    """Return ``(numerical, categorical)`` sub-frames of ``data``.

    Numerical columns are those with dtype int64 or float64; categorical
    columns are those with dtype object or category. Columns of any other
    dtype end up in neither frame.
    """
    numeric_dtypes = ["int64", "float64"]
    categorical_dtypes = ["object", "category"]
    return (
        data.select_dtypes(include=numeric_dtypes),
        data.select_dtypes(include=categorical_dtypes),
    )

# Standardize features with a freshly fitted StandardScaler.
def scale(x):
    """Fit a ``StandardScaler`` on ``x`` and return ``x`` transformed by it."""
    fitted_scaler = StandardScaler().fit(x)
    return fitted_scaler.transform(x)

Author: Giovanni Cinque

In [20]:
# Resample the data with a RandomOverSampler.
def oversampling(x, y):
    """Return ``(x, y)`` resampled by a default-configured ``RandomOverSampler``."""
    sampler = RandomOverSampler()
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

# Resample the data with a RandomUnderSampler.
def undersampling(x, y):
    """Return ``(x, y)`` resampled by a default-configured ``RandomUnderSampler``."""
    sampler = RandomUnderSampler()
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

Author: Junze He

### Feature Engineering

Author: Junze He

Packages: OrdinalEncoder, OneHotEncoder, numpy, pandas

In [25]:
# Encoding Functions
# Encode the categorical features into ordinal numbers.
def ordinal_catfeatures_encoder(data):
    """Ordinal-encode every column of ``data`` and return the result as a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of categorical columns; its ``columns`` and ``index`` are
        reused for the returned frame.

    Returns
    -------
    pandas.DataFrame
        Encoded values with the same column names and the same row index
        as ``data``.
    """
    encoder = OrdinalEncoder()
    encoded_data = encoder.fit_transform(data)
    # Reuse the input's row index: a fresh 0..n-1 index would misalign the
    # result when it is joined back to a frame that was filtered or split.
    return pd.DataFrame(encoded_data, columns=data.columns, index=data.index)

  # one-hot encoding
    # encode the categorical features into dummy variables and return encoder as well
def onehot_eoncoder(data):
    """One-hot encode ``data`` and return the dummy-variable frame with its encoder.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Categorical features to encode.

    Returns
    -------
    (pandas.DataFrame, OneHotEncoder)
        The dense encoded frame (integer column labels) and the fitted
        encoder, which was created with ``handle_unknown='ignore'``.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_data = encoder.fit_transform(data)
    # Keep the input's row labels when it has any, so the dummies stay aligned
    # with the source rows; plain arrays have no index and get the default one.
    row_index = getattr(data, "index", None)
    encoded_dataframe = pd.DataFrame(encoded_data.toarray(), index=row_index)

    return encoded_dataframe, encoder

  # Frequency encoding
def frequency_encoder(data, is_normalize=True):
    """Replace every value by how often it occurs in its column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are encoded; it is not modified.
    is_normalize : bool, default True
        When True each value is mapped to its relative frequency
        (fraction of the column); when False it is mapped to its raw count.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with each column replaced by its frequencies.
    """
    new_data = data.copy()

    for column in new_data.columns:
        # Honour the flag; it was previously ignored and always normalized.
        frequency = new_data[column].value_counts(normalize=is_normalize)
        new_data[column] = new_data[column].map(frequency)

    return new_data

### Models

Packages: Pipeline, ColumnTransformer, confusion_matrix, ConfusionMatrixDisplay,

  accuracy_score, classification_report, SimpleImputer, cross_val_score, ShuffleSplit, train_test_split

In [28]:
# The pipelineModel function returns a model, confusion matrix, summary table, and validation score.

def pipelineModel(model, x, y, ordinalEncoder=True,
                  numerical_cols=None, categorical_cols=None):
    """Fit an impute -> encode -> scale -> model pipeline and evaluate it.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline.
    x : pandas.DataFrame
        Feature matrix. It must be a DataFrame when the column lists below
        are left as None, because they are then inferred from its dtypes.
    y : array-like
        Target values.
    ordinalEncoder : bool, default True
        Encode categorical columns with ``OrdinalEncoder`` when True,
        otherwise with ``OneHotEncoder(handle_unknown='ignore')``.
    numerical_cols : list, optional
        Columns that get median imputation. Defaults to the int64/float64
        columns of ``x``.
    categorical_cols : list, optional
        Columns that get most-frequent imputation and encoding. Defaults to
        the object/category columns of ``x``.

    Returns
    -------
    tuple
        ``(fitted pipeline, confusion matrix, classification report,
        cross-validation scores)``. The matrix and report are computed on a
        20% hold-out split; the scores come from 5 shuffle splits of the
        training part.
    """
    # The column lists are explicit arguments rather than names looked up in
    # the enclosing module, so the function works outside the notebook too.
    if numerical_cols is None:
        numerical_cols = list(x.select_dtypes(["int64", "float64"]).columns)
    if categorical_cols is None:
        categorical_cols = list(x.select_dtypes(["object", "category"]).columns)

    # shuffle and set cross validation to n = 5 folds
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    # divide data into train and test
    train_x, test_x, train_y, test_y = train_test_split(
        x,
        y,
        test_size=0.2,
        shuffle=True,
    )

    # impute missing values with median in numerical variables
    numerical_transformer = SimpleImputer(strategy="median")

    # impute missing values with the most frequent value in categorical
    # variables, then encode them (ordinal or one-hot, per `ordinalEncoder`)
    if ordinalEncoder:
        encoder = OrdinalEncoder()
        encoder_title = "Oridnal Encoding"
    else:
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoder_title = "One Hot Encoding"

    categorical_transformer = Pipeline(
        steps=[
            ("Imputer", SimpleImputer(strategy="most_frequent")),
            (encoder_title, encoder),
        ]
    )

    # use ColumnTransformer to combine the preprocessing steps of numerical
    # and categorical variables; sparse_threshold=0 forces dense output,
    # because StandardScaler with centering cannot process the sparse matrix
    # that one-hot encoding could otherwise produce
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_cols),
            ("cat", categorical_transformer, categorical_cols),
        ],
        sparse_threshold=0,
    )

    # after preprocessing, scale variables into the same unit, then fit the model
    my_pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ('scaler', StandardScaler()),
            ("model", model),
        ]
    )

    my_pipeline.fit(train_x, train_y)

    # get predictions
    predictions = my_pipeline.predict(test_x)
    # scikit-learn metrics take (y_true, y_pred); the reverse order transposes
    # the confusion matrix and swaps precision with recall in the report
    cm = confusion_matrix(test_y, predictions)
    # produce a summary of the model
    report = classification_report(test_y, predictions)
    # test the model by cross validation
    validation_score = cross_val_score(my_pipeline, train_x, train_y, cv=cv)

    return my_pipeline, cm, report, validation_score