# Task 2 : Binary Classification Problem

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## Loading Data Sets

In [2]:
# Both splits are read with identical parser settings: ';' as the field
# separator and the header row inferred from the file.
csv_options = dict(sep=";", header="infer")

train_data = pd.read_csv("training.csv", **csv_options)
valid_data = pd.read_csv("validation.csv", **csv_options)

In [3]:
# Preview the first rows of the training frame to eyeball column names and values.
train_data.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,a,1792,0.00054,u,g,c,v,175,f,t,1,t,g,80.0,5,800000.0,t,0,no.
1,b,1692,0.00335,y,p,k,v,29,f,f,0,f,s,200.0,0,2000000.0,,0,no.
2,b,3125,1125.0,u,g,ff,ff,0,f,t,1,f,g,96.0,19,960000.0,t,0,no.
3,a,4817,1335.0,u,g,i,o,335,f,f,0,f,g,0.0,120,0.0,,0,no.
4,b,3233,35.0,u,g,k,v,5,f,f,0,t,g,232.0,0,2320000.0,f,0,no.


In [4]:
# Preview the first rows of the validation frame; it should share the training schema.
valid_data.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,b,3233,75.0,u,g,e,bb,1585,t,f,0,t,s,420.0,0,4200000.0,,1,no.
1,b,2358,179.0,u,g,c,v,54,f,f,0,t,g,136.0,1,1360000.0,,0,no.
2,b,3642,0.00075,y,p,d,v,585,f,f,0,f,g,240.0,3,2400000.0,,1,no.
3,b,1842,10415.0,y,p,aa,v,125,t,f,0,f,g,120.0,375,1200000.0,,0,no.
4,b,245,13335.0,y,p,aa,v,4,f,f,0,t,g,120.0,475,1200000.0,f,1,no.


## Preprocessing

In [5]:
# Check whether either frame contains at least one missing value.
# `isnull().values.any()` collapses the whole frame to a single boolean.
# print(...) with one argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(train_data.isnull().values.any())
print(valid_data.isnull().values.any())

True
True


In [6]:
# Inspect the dtype pandas inferred for each training column; columns reported
# as `object` are not numeric yet.
train_data.dtypes

variable1      object
variable2      object
variable3      object
variable4      object
variable5      object
variable6      object
variable7      object
variable8      object
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

In [7]:
# Same dtype inspection for the validation frame.
valid_data.dtypes

variable1      object
variable2      object
variable3      object
variable4      object
variable5      object
variable6      object
variable7      object
variable8      object
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

In [8]:
# Columns that hold numbers but were loaded as `object`.
num_convert = ["variable2", "variable3", "variable8"]

# Apply the same conversion to both frames: cast to text, replace ',' with '.',
# then parse as numbers. Anything that still cannot be parsed becomes NaN
# (errors="coerce") instead of raising.
for frame in (train_data, valid_data):
    for col in num_convert:
        as_text = frame[col].astype("str").str.replace(",", ".")
        frame[col] = pd.to_numeric(as_text, errors="coerce")

In [9]:
# Re-inspect the training dtypes after the numeric conversion of
# variable2, variable3 and variable8.
train_data.dtypes

variable1      object
variable2     float64
variable3     float64
variable4      object
variable5      object
variable6      object
variable7      object
variable8     float64
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

In [10]:
# Re-inspect the validation dtypes after the same numeric conversion.
valid_data.dtypes

variable1      object
variable2     float64
variable3     float64
variable4      object
variable5      object
variable6      object
variable7      object
variable8     float64
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

In [11]:
# Columns treated as categorical (the raw label column is included for now).
categoricals = [
    "variable1", "variable4", "variable5", "variable6", "variable7",
    "variable9", "variable10", "variable12", "variable13", "variable18",
    "classLabel",
]

# Every remaining column is treated as numeric. np.setdiff1d returns the
# names as a sorted array, which fixes the numeric column order used later.
numerics = np.setdiff1d(list(train_data.columns), categoricals)

In [12]:
# Fill missing numeric values with the column mean computed on the TRAINING
# data only, and reuse that same value for the validation frame.
# Imputing the validation set with its own mean would preprocess the two
# splits differently and let validation statistics influence evaluation.
for col in numerics:
    train_mean = train_data[col].mean()  # NaNs are skipped by default
    train_data[col] = train_data[col].fillna(train_mean)
    valid_data[col] = valid_data[col].fillna(train_mean)

In [13]:
# Fill missing categorical values from neighbouring rows:
#   * training frame   -> forward fill (carry the previous row's value down)
#   * validation frame -> backward fill, because its first row contains NaNs
#     that a forward fill could not reach.
# `.ffill()` / `.bfill()` replace `fillna(method=...)`, whose `method`
# argument is deprecated in current pandas.
# Note: a forward fill cannot fill leading NaNs and a backward fill cannot
# fill trailing NaNs; the null check that follows guards against leftovers.
for col in categoricals:
    train_data[col] = train_data[col].ffill()
    valid_data[col] = valid_data[col].bfill()

In [14]:
# Verify that no missing values remain in either frame after imputation.
# Both lines are expected to print False.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(train_data.isnull().values.any())
print(valid_data.isnull().values.any())

False
False


In [15]:
def _encode_label(label):
    """Map the raw class label to an integer: 'no.' -> 0, anything else -> 1."""
    return 0 if label == 'no.' else 1


# Add an integer Target column derived from classLabel to both frames.
for frame in (train_data, valid_data):
    frame['Target'] = frame['classLabel'].apply(_encode_label)

In [16]:
# Keep the encoded target as a separate Series for each split, then drop both
# label columns (raw and encoded) so only features remain in the frames.
label_columns = ["classLabel", "Target"]

train_target = train_data["Target"]
train_data = train_data.drop(label_columns, axis = 1)

valid_target = valid_data["Target"]
valid_data = valid_data.drop(label_columns, axis = 1)

In [17]:
# classLabel no longer exists in the frames, so exclude it from the list of
# categorical columns. A filter is used instead of list.remove() so that
# re-running this cell does not raise ValueError once the entry is gone.
categoricals = [col for col in categoricals if col != "classLabel"]

# One-hot encode the categorical columns: every distinct string value becomes
# its own indicator column. Values are cast to str first so all columns are
# encoded uniformly.
train_categ = pd.get_dummies(train_data[categoricals].astype(str))
valid_categ = pd.get_dummies(valid_data[categoricals].astype(str))

In [18]:
# The two splits do not necessarily contain the same category values, so
# their dummy matrices can differ in columns. Any indicator column present
# only in the training matrix is added to the validation matrix as all zeros.
missing_cols = set(train_categ.columns).difference(valid_categ.columns)
for dummy_name in missing_cols:
    valid_categ[dummy_name] = 0

In [19]:
# Combine numeric and one-hot encoded columns row by row: an inner join on
# the row index of both operands.
index_join = dict(left_index = True, right_index = True, how = "inner")

train_merged = train_data[numerics].merge(train_categ, **index_join)
valid_merged = valid_data[numerics].merge(valid_categ, **index_join)

In [20]:
# Select the validation columns using the training column list so that both
# matrices have the same columns in the same order. Any column that exists
# only in the validation matrix is dropped by this selection.
valid_merged = valid_merged[train_merged.columns]

## Logistic Regression Model

In [21]:
# Fit a logistic regression classifier with default constructor arguments on
# the merged training features.
# NOTE(review): no feature scaling is applied in the visible preprocessing, and
# the recorded validation metrics (recall 1.0, precision equal to accuracy)
# suggest the model predicts the positive class for every row - confirm
# whether scaling or a leaking feature is responsible.
logistic_model = LogisticRegression()
logistic_model.fit(train_merged, train_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# Predict class labels (0/1) for the validation features with the fitted logistic model.
valid_predictions = logistic_model.predict(valid_merged)

In [23]:
# Score the logistic regression predictions against the validation targets.
logistic_accuracy = accuracy_score(valid_target, valid_predictions)
logistic_precision = precision_score(valid_target, valid_predictions)
logistic_recall = recall_score(valid_target, valid_predictions)
logistic_f1 = f1_score(valid_target, valid_predictions)

# print(...) with a single pre-formatted string runs on Python 2 and Python 3;
# the bare `print "..."` statement is a SyntaxError on Python 3.
print("Logistic Regression Model Accuracy = %f" % logistic_accuracy)
print("Logistic Regression Model Precision = %f" % logistic_precision)
print("Logistic Regression Model Recall = %f" % logistic_recall)
print("Logistic Regression Model F1 = %f" % logistic_f1)

Logistic Regression Model Accuracy = 0.465000
Logistic Regression Model Precision = 0.465000
Logistic Regression Model Recall = 1.000000
Logistic Regression Model F1 = 0.634812


## Nearest Neighbor Model

In [24]:
# Fit a k-nearest-neighbours classifier that votes among the 3 closest
# training rows.
# NOTE(review): the features are not scaled in the visible preprocessing, so
# columns with large magnitudes presumably dominate the distance - confirm
# whether that is intended.
nn_model = KNeighborsClassifier(n_neighbors = 3)
nn_model.fit(train_merged, train_target)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [25]:
# Predict class labels (0/1) for the validation features with the fitted nearest-neighbour model.
valid_predictions_2 = nn_model.predict(valid_merged)

In [26]:
# Score the nearest-neighbour predictions against the validation targets.
nn_accuracy = accuracy_score(valid_target, valid_predictions_2)
nn_precision = precision_score(valid_target, valid_predictions_2)
nn_recall = recall_score(valid_target, valid_predictions_2)
nn_f1 = f1_score(valid_target, valid_predictions_2)

# print(...) with a single pre-formatted string runs on Python 2 and Python 3;
# the bare `print "..."` statement is a SyntaxError on Python 3.
print("Nearest Neighbor Model Accuracy = %f" % nn_accuracy)
print("Nearest Neighbor Model Precision = %f" % nn_precision)
print("Nearest Neighbor Model Recall = %f" % nn_recall)
print("Nearest Neighbor Model F1 = %f" % nn_f1)

Nearest Neighbor Model Accuracy = 0.650000
Nearest Neighbor Model Precision = 0.596639
Nearest Neighbor Model Recall = 0.763441
Nearest Neighbor Model F1 = 0.669811
