In [1]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Pin NumPy's global random state so every run of the notebook gives the same output
np.random.seed(seed=0)

In [2]:
# Read the raw test set from CSV and preview its first five rows
test_df = pd.read_csv(filepath_or_buffer="test-before.csv")
test_df.head(n=5)

Unnamed: 0,a1,a2,a3,a4,a5,a6,class
0,5.88,0.4874,0.541,1.515,16.55,0.3458,class1
1,76.47,0.7286,0.6721,1.919,13.0,0.3308,class1
2,29.41,0.5879,?,0.0,0.0,0.5082,class1
3,29.41,0.5477,0.6148,2.626,0.0,0.5365,class1
4,17.65,0.794,0.623,3.636,28.96,?,class2


In [3]:
# Turn the '?' placeholders into real missing values (NaN)
test_df = test_df.replace(to_replace='?', value=np.nan)

In [4]:
# Pre-process dataset

# Every column except the last one is treated as a feature; the last column
# holds the class label.
feature_cols = test_df.columns[:-1]

# Replacing missing values with mean value of the column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(test_df[feature_cols])
# Assign by column label instead of `test_df.iloc[:, :-1] = ...`: the feature
# columns that contained '?' are object dtype, and writing through `.iloc`
# makes pandas warn and (on newer versions) write into the existing object
# columns. Label-based assignment replaces the columns with the numeric arrays.
test_df[feature_cols] = imputer.transform(test_df[feature_cols])

# Normalising the values between [0,1]
scaler = MinMaxScaler()
scaler.fit(test_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

# Changing the class values to 0 and 1 respectively.
# Only the label column is touched, so feature values can never be rewritten.
test_df["class"] = test_df["class"].replace({'class1': '0', 'class2': '1'}).astype(int)
test_df.head(25)

  test_df.iloc[:, :-1] = imputer.transform(test_df.iloc[:, :-1])


Unnamed: 0,a1,a2,a3,a4,a5,a6,class
0,0.062078,0.499949,0.541,0.207933,0.259404,0.061258,0
1,0.807327,0.747359,0.6721,0.263382,0.203762,0.058601,0
2,0.310494,0.603036,0.418738,0.0,0.0,0.090027,0
3,0.310494,0.561801,0.6148,0.360417,0.0,0.09504,0
4,0.186339,0.814443,0.623,0.499039,0.453918,0.159728,1
5,0.186339,0.603854,0.4754,0.152484,0.1,0.065474,1
6,0.683171,0.711355,0.623,0.0,0.0,0.087653,1
7,0.55891,0.525798,0.623,0.512901,0.0,0.086856,0
8,0.124155,0.463945,0.5574,0.582212,0.0,0.10085,1
9,0.248416,0.572161,0.5902,0.651523,0.383542,0.097945,0


In [5]:
# Show the first ten pre-processed rows to 4 decimal places, as the assignment spec asks.
# The helper function defined below does the printing.

# Split the frame into a label vector and a feature matrix (NumPy arrays)
y = test_df['class'].to_numpy()
x = test_df.drop(columns=['class']).to_numpy()

def print_data(X, y, n_rows=10):
    """Print the first ``n_rows`` examples as comma-separated lines.

    Each line contains the example's feature values formatted to four
    decimal places, followed by its target value. When the printed example
    is the final example of ``X``, no trailing newline is written after it.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: number of rows to print; capped at the number of examples
            in X, so asking for more rows than exist prints all of them
    """
    total_rows = len(X)
    # Cap the count: iterating over range(n_rows) directly would raise
    # IndexError whenever n_rows exceeds the number of available examples.
    for example_num in range(min(n_rows, total_rows)):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        # The last example of the whole dataset is written without a newline.
        if example_num == total_rows - 1:
            print(y[example_num], end="")
        else:
            print(y[example_num])
            

print_data(x, y, n_rows=10)

0.0621,0.4999,0.5410,0.2079,0.2594,0.0613,0
0.8073,0.7474,0.6721,0.2634,0.2038,0.0586,0
0.3105,0.6030,0.4187,0.0000,0.0000,0.0900,0
0.3105,0.5618,0.6148,0.3604,0.0000,0.0950,0
0.1863,0.8144,0.6230,0.4990,0.4539,0.1597,1
0.1863,0.6039,0.4754,0.1525,0.1000,0.0655,1
0.6832,0.7114,0.6230,0.0000,0.0000,0.0877,1
0.5589,0.5258,0.6230,0.5129,0.0000,0.0869,0
0.1242,0.4639,0.5574,0.5822,0.0000,0.1009,1
0.2484,0.5722,0.5902,0.6515,0.3835,0.0979,0


In [6]:
## 10-fold stratified cross-validation, shuffled with a fixed random state
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# Pass cvKFold as the `cv` argument when scoring each classifier

In [7]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

def logregClassifier(X, y):
    """Return the mean cross-validation score of a logistic regression model.

    A ``LogisticRegression`` using the 'liblinear' solver is scored with
    ``cross_val_score`` on the folds produced by the notebook-level
    ``cvKFold`` splitter, and the per-fold scores are averaged.

    Arguments:
        X: feature data passed through to ``cross_val_score``
        y: target labels passed through to ``cross_val_score``
    """
    fold_scores = cross_val_score(
        LogisticRegression(solver='liblinear'), X, y, cv=cvKFold
    )
    return fold_scores.mean()

# Labels are the last column; every other column is a feature
y = test_df.iloc[:, -1]
x = test_df.iloc[:, :-1]

logreg_score = logregClassifier(x, y)
print("Average cross-validation score for logistic regression: {:.4f}".format(logreg_score))

Average cross-validation score for logistic regression: 0.6510


In [8]:
#Naïve Bayes
from sklearn.naive_bayes import GaussianNB


def nbClassifier(X, y):
    """Return the mean cross-validation score of a Gaussian naive Bayes model.

    A ``GaussianNB`` estimator is scored with ``cross_val_score`` on the
    folds produced by the notebook-level ``cvKFold`` splitter, and the
    per-fold scores are averaged.

    Arguments:
        X: feature data passed through to ``cross_val_score``
        y: target labels passed through to ``cross_val_score``
    """
    fold_scores = cross_val_score(GaussianNB(), X, y, cv=cvKFold)
    return fold_scores.mean()

# Labels are the last column; every other column is a feature
y = test_df.iloc[:, -1]
x = test_df.iloc[:, :-1]

nb_score = nbClassifier(x, y)
print("Average cross-validation score for naive bayes: {:.4f}".format(nb_score))

Average cross-validation score for naive bayes: 0.6555
