In [1]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Seed NumPy's global random number generator so that this notebook's output
# is stable across runs.
np.random.seed(0)

In [2]:
# Load test dataset
# The path is relative, so the CSV must sit in the notebook's working directory.
# The preview below shows that some cells hold a '?' placeholder instead of a number.
test_df = pd.read_csv("test-before.csv")
test_df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,class
0,5.88,0.4874,0.541,1.515,16.55,0.3458,class1
1,76.47,0.7286,0.6721,1.919,13.0,0.3308,class1
2,29.41,0.5879,?,0.0,0.0,0.5082,class1
3,29.41,0.5477,0.6148,2.626,0.0,0.5365,class1
4,17.65,0.794,0.623,3.636,28.96,?,class2


In [3]:
# Recode the '?' placeholders from the raw file as 0; the imputer in the
# pre-processing cell is configured with missing_values=0 and fills them in.
# NOTE(review): after this step a feature value that is genuinely 0 cannot be
# told apart from a placeholder - confirm that treating both as missing is intended.
test_df = test_df.replace(to_replace='?', value=0)

In [4]:
# Pre-process dataset

# Every column except the last one ('class') is a feature.
feature_cols = list(test_df.columns[:-1])

# Cast the features to float explicitly. Columns that contained '?' in the raw
# file were read as strings, and writing floats back into them block-wise via
# `test_df.iloc[:, :-1] = ...` makes pandas emit a warning; converting first and
# assigning by column name avoids that and leaves proper float columns.
features = test_df[feature_cols].astype(float)

# Replacing missing values with mean value of the column.
# 0 is the missing-value marker here, so every 0 in a feature column is imputed.
imputer = SimpleImputer(missing_values=0, strategy='mean')
features = imputer.fit_transform(features)

# Normalising the values between [0,1]
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

test_df[feature_cols] = features

# Changing the class values to 0 and 1 respectively.
# Only the 'class' column is touched; an unexpected label is reported explicitly.
class_codes = test_df["class"].map({'class1': 0, 'class2': 1})
if class_codes.isna().any():
    unknown = sorted(set(test_df.loc[class_codes.isna(), "class"].astype(str)))
    raise ValueError("Unexpected class label(s): {}".format(unknown))
test_df["class"] = class_codes.astype(int)
test_df.head(25)

  test_df.iloc[:, :-1] = imputer.transform(test_df.iloc[:, :-1])


Unnamed: 0,a1,a2,a3,a4,a5,a6,class
0,0.061495,0.291939,0.50608,0.207062,0.259102,0.058472,0
1,0.807207,0.642266,0.647154,0.262572,0.203437,0.055807,0
2,0.310066,0.437908,0.445,0.087128,0.03666,0.087327,0
3,0.310066,0.379521,0.585494,0.359714,0.03666,0.092355,0
4,0.185833,0.737255,0.594318,0.498489,0.453696,0.163708,1
5,0.185833,0.455998,0.435489,0.151553,0.099633,0.062701,1
6,0.682975,0.591285,0.594318,0.087128,0.03666,0.084946,1
7,0.558636,0.32854,0.594318,0.512366,0.03666,0.084146,0
8,0.123611,0.240959,0.523728,0.581753,0.03666,0.098182,1
9,0.24795,0.39419,0.559023,0.65114,0.383291,0.095269,0


In [5]:
# Show the first ten rows of the pre-processed dataset, formatted to 4 decimal
# places as per the assignment spec. The helper defined below does the printing.

# Feature array: every column except 'class'. Label array: the 'class' column.
x = test_df.drop(columns=['class']).values
y = test_df.loc[:, 'class'].values

def print_data(X, y, n_rows=10):
    """Print the first rows of a dataset as comma-separated lines.

    Each line holds the feature values of one example formatted to four
    decimal places, followed by that example's target value. When the row
    being printed is the very last row of ``X`` no trailing newline is
    written; every other row ends with a newline.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: number of rows to print; if X has fewer rows, all of them
            are printed
    """
    last_index = len(X) - 1
    # Cap the row count so that data shorter than n_rows does not raise IndexError.
    for example_num in range(min(n_rows, len(X))):
        fields = ["{:.4f}".format(feature) for feature in X[example_num]]
        fields.append(str(y[example_num]))
        line_end = "" if example_num == last_index else "\n"
        print(",".join(fields), end=line_end)
            

# n_rows is left at its default of 10, so the first ten rows are printed.
print_data(x, y)

0.0615,0.2919,0.5061,0.2071,0.2591,0.0585,0
0.8072,0.6423,0.6472,0.2626,0.2034,0.0558,0
0.3101,0.4379,0.4450,0.0871,0.0367,0.0873,0
0.3101,0.3795,0.5855,0.3597,0.0367,0.0924,0
0.1858,0.7373,0.5943,0.4985,0.4537,0.1637,1
0.1858,0.4560,0.4355,0.1516,0.0996,0.0627,1
0.6830,0.5913,0.5943,0.0871,0.0367,0.0849,1
0.5586,0.3285,0.5943,0.5124,0.0367,0.0841,0
0.1236,0.2410,0.5237,0.5818,0.0367,0.0982,1
0.2479,0.3942,0.5590,0.6511,0.3833,0.0953,0


In [7]:
## 10-fold stratified cross-validation splitter
# Shuffling is enabled with a fixed random_state so the folds are reproducible.
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# The classifier functions should receive these stratified folds (cv=cvKFold).

In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

def logregClassifier(X, y):
    """Return the mean cross-validation score of a logistic regression model.

    Arguments:
        X: feature data handed to ``cross_val_score``
        y: target labels handed to ``cross_val_score``

    The model uses the 'liblinear' solver and is evaluated on the folds
    produced by the notebook-level ``cvKFold`` splitter.
    """
    fold_scores = cross_val_score(
        LogisticRegression(solver='liblinear'),
        X,
        y,
        cv=cvKFold,
    )
    return fold_scores.mean()

# Features are all columns but the last; the label is the last column.
x, y = test_df.iloc[:, :-1], test_df.iloc[:, -1]

print(
    "Average cross-validation score for logistic regression: {:.4f}".format(
        logregClassifier(x, y)
    )
)

Average cross-validation score for logistic regression: 0.65


In [9]:
#Naïve Bayes
from sklearn.naive_bayes import GaussianNB


def nbClassifier(X, y):
    """Return the mean cross-validation score of a Gaussian naive Bayes model.

    Arguments:
        X: feature data handed to ``cross_val_score``
        y: target labels handed to ``cross_val_score``

    The model is evaluated on the folds produced by the notebook-level
    ``cvKFold`` splitter.
    """
    fold_scores = cross_val_score(
        GaussianNB(),
        X,
        y,
        cv=cvKFold,
    )
    return fold_scores.mean()

# Features are all columns but the last; the label is the last column.
x, y = test_df.iloc[:, :-1], test_df.iloc[:, -1]

print(
    "Average cross-validation score for naive bayes: {:.4f}".format(
        nbClassifier(x, y)
    )
)

Average cross-validation score for naive bayes: 0.6602
