In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [2]:
df_train = pd.read_csv('adult.csv')

In [3]:
df_train

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=["Total", "Percent"])
missing_data.head(20)

Unnamed: 0,Total,Percent
income,0,0.0
native-country,0,0.0
hours-per-week,0,0.0
capital-loss,0,0.0
capital-gain,0,0.0
gender,0,0.0
race,0,0.0
relationship,0,0.0
occupation,0,0.0
marital-status,0,0.0


In [5]:
df_train.drop(['native-country'], axis=1, inplace=True)
df_train.drop(['marital-status'], axis=1, inplace=True)
df_train.drop(['occupation'], axis=1, inplace=True)

In [6]:
df_train

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,relationship,race,gender,capital-gain,capital-loss,hours-per-week,income
0,25,Private,226802,11th,7,Own-child,Black,Male,0,0,40,<=50K
1,38,Private,89814,HS-grad,9,Husband,White,Male,0,0,50,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Husband,White,Male,0,0,40,>50K
3,44,Private,160323,Some-college,10,Husband,Black,Male,7688,0,40,>50K
4,18,?,103497,Some-college,10,Own-child,White,Female,0,0,30,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Wife,White,Female,0,0,38,<=50K
48838,40,Private,154374,HS-grad,9,Husband,White,Male,0,0,40,>50K
48839,58,Private,151910,HS-grad,9,Unmarried,White,Female,0,0,40,<=50K
48840,22,Private,201490,HS-grad,9,Own-child,White,Male,0,0,20,<=50K


In [7]:
print("income: ",df_train.income.unique())

income:  ['<=50K' '>50K']


In [8]:
df_trainworkclassDummies = pd.get_dummies(df_train['workclass'], prefix = 'workclass')
df_traineducationDummies = pd.get_dummies(df_train['education'], prefix = 'education')

df_trainrelationshipDummies = pd.get_dummies(df_train['relationship'], prefix = 'relationship')
df_trainraceDummies = pd.get_dummies(df_train['race'], prefix = 'race')


In [9]:
df_train.drop(['workclass'], axis=1, inplace=True)

df_train = pd.concat([df_train, df_trainworkclassDummies], axis=1)

df_train.drop(['education'], axis=1, inplace=True)

df_train = pd.concat([df_train, df_traineducationDummies], axis=1)

df_train.drop(['relationship'], axis=1, inplace=True)

df_train = pd.concat([df_train, df_trainrelationshipDummies], axis=1)

df_train.drop(['race'], axis=1, inplace=True)

df_train = pd.concat([df_train, df_trainraceDummies], axis=1)

In [10]:
df_train

Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,income,workclass_?,workclass_Federal-gov,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,25,226802,7,Male,0,0,40,<=50K,0,0,...,0,0,1,0,0,0,0,1,0,0
1,38,89814,9,Male,0,0,50,<=50K,0,0,...,0,0,0,0,0,0,0,0,0,1
2,28,336951,12,Male,0,0,40,>50K,0,0,...,0,0,0,0,0,0,0,0,0,1
3,44,160323,10,Male,7688,0,40,>50K,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,Female,0,0,30,<=50K,1,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,Female,0,0,38,<=50K,0,0,...,0,0,0,0,1,0,0,0,0,1
48838,40,154374,9,Male,0,0,40,>50K,0,0,...,0,0,0,0,0,0,0,0,0,1
48839,58,151910,9,Female,0,0,40,<=50K,0,0,...,0,0,0,1,0,0,0,0,0,1
48840,22,201490,9,Male,0,0,20,<=50K,0,0,...,0,0,1,0,0,0,0,0,0,1


In [11]:
df_train['gender']=df_train['gender'].replace({"Female":1,"Male":0})

In [12]:
df_train

Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,income,workclass_?,workclass_Federal-gov,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,25,226802,7,0,0,0,40,<=50K,0,0,...,0,0,1,0,0,0,0,1,0,0
1,38,89814,9,0,0,0,50,<=50K,0,0,...,0,0,0,0,0,0,0,0,0,1
2,28,336951,12,0,0,0,40,>50K,0,0,...,0,0,0,0,0,0,0,0,0,1
3,44,160323,10,0,7688,0,40,>50K,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,1,0,0,30,<=50K,1,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,1,0,0,38,<=50K,0,0,...,0,0,0,0,1,0,0,0,0,1
48838,40,154374,9,0,0,0,40,>50K,0,0,...,0,0,0,0,0,0,0,0,0,1
48839,58,151910,9,1,0,0,40,<=50K,0,0,...,0,0,0,1,0,0,0,0,0,1
48840,22,201490,9,0,0,0,20,<=50K,0,0,...,0,0,1,0,0,0,0,0,0,1


In [13]:
#save the model to disk

df_train.to_csv('adult1.csv')

In [14]:
df = pd.read_csv('adult1.csv')

In [15]:
df

Unnamed: 0.1,Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,income,workclass_?,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,0,25,226802,7,0,0,0,40,<=50K,0,...,0,0,1,0,0,0,0,1,0,0
1,1,38,89814,9,0,0,0,50,<=50K,0,...,0,0,0,0,0,0,0,0,0,1
2,2,28,336951,12,0,0,0,40,>50K,0,...,0,0,0,0,0,0,0,0,0,1
3,3,44,160323,10,0,7688,0,40,>50K,0,...,0,0,0,0,0,0,0,1,0,0
4,4,18,103497,10,1,0,0,30,<=50K,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48837,27,257302,12,1,0,0,38,<=50K,0,...,0,0,0,0,1,0,0,0,0,1
48838,48838,40,154374,9,0,0,0,40,>50K,0,...,0,0,0,0,0,0,0,0,0,1
48839,48839,58,151910,9,1,0,0,40,<=50K,0,...,0,0,0,1,0,0,0,0,0,1
48840,48840,22,201490,9,0,0,0,20,<=50K,0,...,0,0,1,0,0,0,0,0,0,1


In [16]:
import numpy as np
from itertools import combinations_with_replacement
from sympy.core import Mul
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [17]:
def new_features(variables):
    variables=variables.tolist()
    max_degree = 4
    min_degree = 0
    if not variables or max_degree == 0:
        return np.array([1])
    variables = np.array(list(variables) + [1])
    monomials_list_comm = np.empty(0)
    for item in combinations_with_replacement(variables, max_degree):
        powers = dict()
        for variable in variables:
            powers[variable] = 0
        for variable in item:
            if variable != 1:
                powers[variable] += 1
        if max(powers.values()) >= min_degree:
            monomials_list_comm = np.append(monomials_list_comm,Mul(*item))
    return np.array(list(monomials_list_comm))

In [18]:
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [19]:
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))
    
    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # average
    return dw

In [20]:
def sgd(features, outputs):
    max_epochs = 2000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            
            prev_cost = cost
            nth += 1
    return weights

In [21]:
def init3():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    data = pd.read_csv('adult1.csv')
    print(data.head())

    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data.drop(data.columns[[0]], axis=1, inplace=True)
    
    
    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'<=50K': -1.0, '>50K': 1.0}
    data['income'] = data['income'].map(diag_map)
    
    # put features & outputs in different data frames
    Y = data.loc[:, 'income']
    X = data.iloc[:, 0:-1]
    # filter features
    #remove_correlated_features(X)
    #Xnew = np.apply_along_axis(new_features, 1, X)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    #X_normalized = MinMaxScaler().fit_transform(Xnew)
    #X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    print(W.shape)
    # testing the model
    print("testing the model...")
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))

In [22]:
regularization_strength = 1000
learning_rate = 0.001

In [24]:
import time
start = time.time()
init3()
end = time.time()
print("TIME TAKEN: {} \n".format(end-start))

reading dataset...
   Unnamed: 0  age  fnlwgt  educational-num  gender  capital-gain  \
0           0   25  226802                7       0             0   
1           1   38   89814                9       0             0   
2           2   28  336951               12       0             0   
3           3   44  160323               10       0          7688   
4           4   18  103497               10       1             0   

   capital-loss  hours-per-week income  workclass_?  ...  \
0             0              40  <=50K            0  ...   
1             0              50  <=50K            0  ...   
2             0              40   >50K            0  ...   
3             0              40   >50K            0  ...   
4             0              30  <=50K            1  ...   

   relationship_Not-in-family  relationship_Other-relative  \
0                           0                            0   
1                           0                            0   
2                  

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [2]:
df_train = pd.read_csv('adult1.csv')

In [3]:
df_train

Unnamed: 0.1,Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,income,workclass_?,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,0,25,226802,7,0,0,0,40,<=50K,0,...,0,0,1,0,0,0,0,1,0,0
1,1,38,89814,9,0,0,0,50,<=50K,0,...,0,0,0,0,0,0,0,0,0,1
2,2,28,336951,12,0,0,0,40,>50K,0,...,0,0,0,0,0,0,0,0,0,1
3,3,44,160323,10,0,7688,0,40,>50K,0,...,0,0,0,0,0,0,0,1,0,0
4,4,18,103497,10,1,0,0,30,<=50K,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48837,27,257302,12,1,0,0,38,<=50K,0,...,0,0,0,0,1,0,0,0,0,1
48838,48838,40,154374,9,0,0,0,40,>50K,0,...,0,0,0,0,0,0,0,0,0,1
48839,48839,58,151910,9,1,0,0,40,<=50K,0,...,0,0,0,1,0,0,0,0,0,1
48840,48840,22,201490,9,0,0,0,20,<=50K,0,...,0,0,1,0,0,0,0,0,0,1


In [4]:
# drop last column (extra column added by pd)
    # and unnecessary first column (id)
df_train.drop(df_train.columns[[0]], axis=1, inplace=True)
    
    
print("applying feature engineering...")
    # convert categorical labels to numbers
diag_map = {'<=50K': -1.0, '>50K': 1.0}
df_train['income'] = df_train['income'].map(diag_map)
    
    

applying feature engineering...


In [5]:
df_lable1 = df_train["income"]

In [6]:
df_lable1

0       -1.0
1       -1.0
2        1.0
3        1.0
4       -1.0
        ... 
48837   -1.0
48838    1.0
48839   -1.0
48840   -1.0
48841    1.0
Name: income, Length: 48842, dtype: float64

In [7]:
df_train.drop('income',axis=1,inplace=True)

In [8]:
df_train

Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,25,226802,7,0,0,0,40,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,38,89814,9,0,0,0,50,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,28,336951,12,0,0,0,40,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,44,160323,10,0,7688,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,1,0,0,30,1,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,1,0,0,38,0,0,0,...,0,0,0,0,1,0,0,0,0,1
48838,40,154374,9,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,1
48839,58,151910,9,1,0,0,40,0,0,0,...,0,0,0,1,0,0,0,0,0,1
48840,22,201490,9,0,0,0,20,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [9]:
from sklearn.model_selection import train_test_split 

In [10]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(df_train,df_lable1,test_size = 0.5)

In [None]:
model = SVC(kernel = 'poly')
model.fit(X_train, y_train)

In [14]:
model.score(X_test,y_test)

0.7986568936570984

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [2]:
def lvec(a):
    n = a.shape[0]
    s = np.empty(0)
    for i in range(n):
        temp = a[i]*a[i:]
        temp[0] = temp[0]/2
        s = np.append(s,temp)
    return s

In [3]:
def create_z(x):
    s = lvec(x)
    z = np.empty(0)
    z = np.append(z,s)
    z = np.append(z,x)
    z = np.append(z,1)
    return z

In [4]:
def create_r(x):
    z = create_z(x)
    eta = lvec(z)
    s = lvec(x)
    r = np.empty(0)
    r = np.append(r,eta)
    r = np.append(r,s)
    return r

In [5]:
def new_feature(x):
    r = create_r(x)
    xnew = np.empty(0)
    xnew = np.append(xnew,r)
    xnew = np.append(xnew,x)
    xnew = np.append(xnew,1)
    return xnew

In [6]:
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped


def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped

In [7]:
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [8]:
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # average
    return dw

In [9]:
def sgd(features, outputs):
    max_epochs = 2000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights

In [10]:
def init3():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    data = pd.read_csv('adult1.csv')
    print(data.head())

    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data.drop(data.columns[[0]], axis=1, inplace=True)
    
    
    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'<=50K': -1.0, '>50K': 1.0}
    data['income'] = data['income'].map(diag_map)
    
    # put features & outputs in different data frames
    Y = data.loc[:, 'income']
    X = data.iloc[:, 0:-1]
    # filter features
    #remove_correlated_features(X)
    #Xnew = np.apply_along_axis(new_features, 1, X)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    #X_normalized = MinMaxScaler().fit_transform(Xnew)
    #X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    print(W.shape)
    # testing the model
    print("testing the model...")
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))

In [11]:
regularization_strength = 1000
learning_rate = 0.001

In [None]:
import time
start = time.time()
init3()
end = time.time()
print("TIME TAKEN: {} \n".format(end-start))

reading dataset...
   Unnamed: 0  age  fnlwgt  educational-num  gender  capital-gain  \
0           0   25  226802                7       0             0   
1           1   38   89814                9       0             0   
2           2   28  336951               12       0             0   
3           3   44  160323               10       0          7688   
4           4   18  103497               10       1             0   

   capital-loss  hours-per-week income  workclass_?  ...  \
0             0              40  <=50K            0  ...   
1             0              50  <=50K            0  ...   
2             0              40   >50K            0  ...   
3             0              40   >50K            0  ...   
4             0              30  <=50K            1  ...   

   relationship_Not-in-family  relationship_Other-relative  \
0                           0                            0   
1                           0                            0   
2                  