In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [3]:
def lvec(a):
    n = a.shape[0]
    s = np.empty(0)
    for i in range(n):
        temp = a[i]*a[i:]
        temp[0] = temp[0]/2
        s = np.append(s,temp)
    return s

In [4]:
def create_z(x):
    s = lvec(x)
    z = np.empty(0)
    z = np.append(z,s)
    z = np.append(z,x)
    z = np.append(z,1)
    return z

In [5]:
def create_r(x):
    z = create_z(x)
    eta = lvec(z)
    s = lvec(x)
    r = np.empty(0)
    r = np.append(r,eta)
    r = np.append(r,s)
    return r

In [6]:
def new_feature(x):
    r = create_r(x)
    xnew = np.empty(0)
    xnew = np.append(xnew,r)
    xnew = np.append(xnew,x)
    xnew = np.append(xnew,1)
    return xnew

In [7]:
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped


def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped

In [8]:
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [9]:
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # average
    return dw

In [10]:
def sgd(features, outputs):
    max_epochs = 5000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights

In [11]:
def init1():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    data = pd.read_csv('Creditnew1.csv')

    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data.drop(data.columns[[ 0]], axis=1, inplace=True)
    
    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'Caucasian': 1.0, 'Asian': -1.0,'African American': -1.0}
    data['Ethnicity'] = data['Ethnicity'].map(diag_map)
    
    # put features & outputs in different data frames
    Y = data.loc[:, 'Ethnicity']
    X = data.iloc[:, 0:-1]
    # filter features
    #remove_correlated_features(X)
    Xnew = np.apply_along_axis(new_feature, 1, X)
    #X = pd.DataFrame(Xnew)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    X_normalized = MinMaxScaler().fit_transform(Xnew)
    X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)
    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    # testing the model
    print("testing the model...")
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))

In [12]:
regularization_strength = 10000
learning_rate = 0.000001

In [13]:
# init1 assure if a data point belongs to class 1 or not #
# For Class 1 #
init1()

reading dataset...
applying feature engineering...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 317.3605253080274
Epoch is: 2 and Cost is: 62.85214773998804
Epoch is: 4 and Cost is: 61.06884485160673
Epoch is: 8 and Cost is: 1.941834326590808
Epoch is: 16 and Cost is: 1.931917538521283
training finished.
weights are: [-0.0056393  -0.00456564 -0.00474859 ... -0.02272368 -0.03981922
  0.        ]
testing the model...
accuracy on test dataset: 1.0
recall on test dataset: 1.0
precision on test dataset: 1.0


In [14]:
def init2():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    data = pd.read_csv('Creditnew1.csv')

    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data.drop(data.columns[[ 0]], axis=1, inplace=True)
    
    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'Caucasian': -1.0, 'Asian': 1.0,'African American': -1.0}
    data['Ethnicity'] = data['Ethnicity'].map(diag_map)
    
    # put features & outputs in different data frames
    Y = data.loc[:, 'Ethnicity']
    X = data.iloc[:, 0:-1]
    # filter features
    #remove_correlated_features(X)
    Xnew = np.apply_along_axis(new_feature, 1, X)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    X_normalized = MinMaxScaler().fit_transform(Xnew)
    X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    # testing the model
    print("testing the model...")
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))

In [15]:
# init2 assure if a data point belongs to class 2 or not #
# For Class 2 #
init2()

reading dataset...
applying feature engineering...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 304.7947413165209
Epoch is: 2 and Cost is: 1.5976060648165524
Epoch is: 4 and Cost is: 1.5955624362330902
training finished.
weights are: [-0.01103153 -0.01049946 -0.01045284 ...  0.01029596  0.00999176
  0.        ]
testing the model...
accuracy on test dataset: 1.0
recall on test dataset: 1.0
precision on test dataset: 1.0


In [16]:
def init3():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    data = pd.read_csv('Creditnew1.csv')

    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data.drop(data.columns[[ 0]], axis=1, inplace=True)
    
    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'Caucasian': -1.0, 'Asian': -1.0,'African American': 1.0}
    data['Ethnicity'] = data['Ethnicity'].map(diag_map)
    
    # put features & outputs in different data frames
    Y = data.loc[:, 'Ethnicity']
    X = data.iloc[:, 0:-1]
    # filter features
    #remove_correlated_features(X)
    Xnew = np.apply_along_axis(new_feature, 1, X)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    X_normalized = MinMaxScaler().fit_transform(Xnew)
    X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    # testing the model
    print("testing the model...")
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))

In [17]:
# init3 assure if a data point belongs to class 3 or not #
# For Class 3 #
init3()

reading dataset...
applying feature engineering...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 654.62764797652
Epoch is: 2 and Cost is: 3223.3008269045304
Epoch is: 4 and Cost is: 86.34673400908575
Epoch is: 8 and Cost is: 178.32938213910364
Epoch is: 16 and Cost is: 2.838735191051662
Epoch is: 32 and Cost is: 2.80981485307501
Epoch is: 64 and Cost is: 2.752855075403106
Epoch is: 128 and Cost is: 2.6423761307141684
Epoch is: 256 and Cost is: 2.63509275512056
training finished.
weights are: [ 0.01808558  0.01762709  0.01747769 ... -0.01906528 -0.00880496
  0.        ]
testing the model...
accuracy on test dataset: 0.9875
recall on test dataset: 1.0
precision on test dataset: 1.0


In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [2]:
df_train = pd.read_csv('Creditnew1.csv')

In [3]:
df_train

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Ethnicity,Balance,Gender_ Male,Gender_Female
0,0,14.891,3606,283,2,34,11,0,1,Caucasian,333,1,0
1,1,106.025,6645,483,3,82,15,1,1,Asian,903,0,1
2,2,104.593,7075,514,4,71,11,0,0,Asian,580,1,0
3,3,148.924,9504,681,3,36,11,0,0,Asian,964,0,1
4,4,55.882,4897,357,2,68,16,0,1,Caucasian,331,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,12.096,4100,307,3,32,13,0,1,Caucasian,560,1,0
396,396,13.364,3838,296,5,65,17,0,0,African American,480,1,0
397,397,57.872,4171,321,5,67,12,0,1,Caucasian,138,0,1
398,398,37.728,2525,192,1,44,13,0,1,Caucasian,0,1,0


In [4]:
df_train.drop('Unnamed: 0',axis=1,inplace=True)

In [5]:
#Converting the target variable into numeric (Yes:1, No:0)
df_train['Ethnicity']=df_train['Ethnicity'].replace({"Caucasian":-1,"Asian":0,"African American":1})

In [6]:
df_train

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Ethnicity,Balance,Gender_ Male,Gender_Female
0,14.891,3606,283,2,34,11,0,1,-1,333,1,0
1,106.025,6645,483,3,82,15,1,1,0,903,0,1
2,104.593,7075,514,4,71,11,0,0,0,580,1,0
3,148.924,9504,681,3,36,11,0,0,0,964,0,1
4,55.882,4897,357,2,68,16,0,1,-1,331,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.096,4100,307,3,32,13,0,1,-1,560,1,0
396,13.364,3838,296,5,65,17,0,0,1,480,1,0
397,57.872,4171,321,5,67,12,0,1,-1,138,0,1
398,37.728,2525,192,1,44,13,0,1,-1,0,1,0


In [7]:
df_lable1 = df_train["Ethnicity"]

In [8]:
df_lable1

0     -1
1      0
2      0
3      0
4     -1
      ..
395   -1
396    1
397   -1
398   -1
399    0
Name: Ethnicity, Length: 400, dtype: int64

In [9]:
df_train.drop('Ethnicity',axis=1,inplace=True)

In [10]:
df_train

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_ Male,Gender_Female
0,14.891,3606,283,2,34,11,0,1,333,1,0
1,106.025,6645,483,3,82,15,1,1,903,0,1
2,104.593,7075,514,4,71,11,0,0,580,1,0
3,148.924,9504,681,3,36,11,0,0,964,0,1
4,55.882,4897,357,2,68,16,0,1,331,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,12.096,4100,307,3,32,13,0,1,560,1,0
396,13.364,3838,296,5,65,17,0,0,480,1,0
397,57.872,4171,321,5,67,12,0,1,138,0,1
398,37.728,2525,192,1,44,13,0,1,0,1,0


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
from sklearn.svm import SVC
X_train, X_test, Y_train, Y_test = train_test_split(df_train,df_lable1,test_size = 0.2)

In [16]:
model = SVC(kernel = 'poly')
model.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
model.score(X_test,Y_test)

0.5375

In [18]:
import numpy as np
from itertools import combinations_with_replacement
from sympy.core import Mul
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [19]:
def new_features(variables):
    variables=variables.tolist()
    max_degree = 4
    min_degree = 0
    if not variables or max_degree == 0:
        return np.array([1])
    variables = np.array(list(variables) + [1])
    monomials_list_comm = np.empty(0)
    for item in combinations_with_replacement(variables, max_degree):
        powers = dict()
        for variable in variables:
            powers[variable] = 0
        for variable in item:
            if variable != 1:
                powers[variable] += 1
        if max(powers.values()) >= min_degree:
            monomials_list_comm = np.append(monomials_list_comm,Mul(*item))
    return np.array(list(monomials_list_comm))

In [20]:
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [21]:
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))
    
    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # average
    return dw

In [22]:
def sgd(features, outputs):
    max_epochs = 5000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            
            prev_cost = cost
            nth += 1
    return weights

In [23]:
def init3():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    data = pd.read_csv('Creditnew1.csv')

    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data.drop(data.columns[[ 0]], axis=1, inplace=True)
    
    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'Caucasian': -1.0, 'Asian': -1.0,'African American': 1.0}
    data['Ethnicity'] = data['Ethnicity'].map(diag_map)
    
    # put features & outputs in different data frames
    Y = data.loc[:, 'Ethnicity']
    X = data.iloc[:, 0:-1]
    # filter features
    #remove_correlated_features(X)
    #Xnew = np.apply_along_axis(new_feature, 1, X)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    #X_normalized = MinMaxScaler().fit_transform(Xnew)
    #X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    # testing the model
    print("testing the model...")
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))

In [24]:
regularization_strength = 1000
learning_rate = 0.001

In [25]:
init3()

reading dataset...
applying feature engineering...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 3600410319.017869
Epoch is: 2 and Cost is: 9316216226.788927
Epoch is: 4 and Cost is: 1331155762.5603576
Epoch is: 8 and Cost is: 26437311078.480785
Epoch is: 16 and Cost is: 8053008148.322515
Epoch is: 32 and Cost is: 3002665116.311271
Epoch is: 64 and Cost is: 2826018719.1140966
Epoch is: 128 and Cost is: 2788055903.6052966
Epoch is: 256 and Cost is: 3220421020.2230706
Epoch is: 512 and Cost is: 3607690148.1050215
Epoch is: 1024 and Cost is: 1751197798.8027256
Epoch is: 2048 and Cost is: 1030268986.1046942
Epoch is: 4096 and Cost is: 1783687988.8525465
Epoch is: 4999 and Cost is: 1611789929.7831626
training finished.
weights are: [ 1.55817780e+02 -1.09649790e+03  7.82423389e+01  8.37793013e-01
 -7.70708518e+00 -3.73278546e+01 -6.07541925e+00 -3.09851441e+01
  3.75424082e+02 -1.95982887e+03 -3.89822091e+00]
testing the model...
accuracy on test 