In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

class Logistic_Regression():

  # declaring learning rate & number of iterations (Hyperparametes)
  def __init__(self, learning_rate, iterations):

    self.learning_rate = learning_rate
    self.iterations = iterations

  # fit function to train the model with dataset
  def fit(self, X, Y):

    # number of data points in the dataset (number of rows)  -->  row
    # number of input features in the dataset (number of columns)  --> col
    self.row, self.col = X.shape

    #initiating weight & bias value

    self.w = np.zeros(self.col)

    self.b = 0

    self.X = X

    self.Y = Y

    # implementing Gradient Descent for Optimization

    for i in range(self.iterations):
      self.update_weights()

  def update_weights(self):

    # Y_ formula (sigmoid function)

    Y_ = 1 / (1 + np.exp( - (self.X.dot(self.w) + self.b ) ))

    # derivaties

    dw = (1/self.row)*np.dot(self.X.T, (Y_ - self.Y))

    db = (1/self.row)*np.sum(Y_ - self.Y)

    # updating the weights & bias using gradient descent

    self.w = self.w - self.learning_rate * dw

    self.b = self.b - self.learning_rate * db

  # Sigmoid Equation & Decision Boundary

  def predict(self, X):

    Y_pred = 1 / (1 + np.exp( - (X.dot(self.w) + self.b ) ))
    Y_pred = np.where( Y_pred > 0.5, 1, 0)
    return Y_pred

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo 
# fetch dataset 
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes)  
X = spambase.data.features 
y = spambase.data.targets 
X.drop(X.columns[[2,4,5,11,18,20,44,48,49,50,51,52,53]],axis= 1,inplace =True) #Droppin the column containing stopwords and special characters.
print(spambase.variables)


                          name     role        type demographic  \
0               word_freq_make  Feature  Continuous        None   
1            word_freq_address  Feature  Continuous        None   
2                word_freq_all  Feature  Continuous        None   
3                 word_freq_3d  Feature  Continuous        None   
4                word_freq_our  Feature  Continuous        None   
5               word_freq_over  Feature  Continuous        None   
6             word_freq_remove  Feature  Continuous        None   
7           word_freq_internet  Feature  Continuous        None   
8              word_freq_order  Feature  Continuous        None   
9               word_freq_mail  Feature  Continuous        None   
10           word_freq_receive  Feature  Continuous        None   
11              word_freq_will  Feature  Continuous        None   
12            word_freq_people  Feature  Continuous        None   
13            word_freq_report  Feature  Continuous        Non

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(X.columns[[2,4,5,11,18,20,44,48,49,50,51,52,53]],axis= 1,inplace =True) #Droppin the column containing stopwords and special characters.


In [5]:
scaler = StandardScaler()
scaler.fit(X)
Standardized = scaler.transform(X)
print (Standardized)

[[-3.42433707e-01  3.30884903e-01 -4.68995838e-02 ... -4.52472762e-02
   4.52979198e-02 -8.72413388e-03]
 [ 3.45359395e-01  5.19091945e-02 -4.68995838e-02 ... -2.44326749e-03
   2.50562832e-01  1.22832407e+00]
 [-1.45921392e-01 -1.65071912e-01 -4.68995838e-02 ...  1.45920848e-01
   2.22110599e+00  3.25873251e+00]
 ...
 [ 6.40127868e-01 -1.65071912e-01 -4.68995838e-02 ... -1.19382054e-01
  -2.36941335e-01 -2.72627750e-01]
 [ 2.80176333e+00 -1.65071912e-01 -4.68995838e-02 ... -1.27482666e-01
  -2.42072958e-01 -3.38603654e-01]
 [-3.42433707e-01 -1.65071912e-01 -4.68995838e-02 ... -1.24236117e-01
  -2.42072958e-01 -4.01280763e-01]]


In [6]:
target = spambase.data.targets['Class']

In [7]:
target.shape

(4601,)

In [8]:
features = Standardized

In [9]:
features.shape

(4601, 44)

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.1,random_state=2)

In [13]:
model = Logistic_Regression( learning_rate = 0.001, iterations = 1000 )
model.fit( X_train, y_train )
y_pred = model.predict(X_test)


In [14]:
from sklearn import metrics
from sklearn.metrics import classification_report

In [36]:
metrics.confusion_matrix(y_test,y_pred)

array([[253,  18],
       [ 33, 157]], dtype=int64)

In [32]:
metrics.f1_score(y_test,y_pred)

0.8602739726027397

In [33]:
metrics.accuracy_score(y_test,y_pred)

0.8893709327548807

In [34]:
metrics.recall_score(y_test,y_pred)

0.8263157894736842

In [35]:
metrics.precision_score(y_test,y_pred)

0.8971428571428571