In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('data/train_tfidf_features.csv')



In [3]:

#split training df to train set and test set , 8:2 split

train_df = df.iloc[:int(len(df)*0.8), :]
test_df = df.iloc[int(len(df)*0.8):, :]

X_train, y_train = train_df.drop(columns=["id","label"]),train_df["label"]
X_test, y_test = test_df.drop(columns=["id","label"]),test_df["label"]


counts = train_df["label"].value_counts()
print(counts)



label
0    8507
1    5240
Name: count, dtype: int64


In [6]:

# Create a class obj of Logistic Regression for used in Task 3

class LogisticRegression:
    def __init__(self,seed=0,learning_rate = 0.01,batch_size = 100, epochs = 20,random_state = 0):
        # X --> training data feaatures.
        # y --> training data lable.   
        # bs --> Batch Size.
        # epochs --> Number of iterations.
        # lr --> Learning rate. 
        # size-> number of training examples
        # features-> number of features 
        self.size,self.features = 0,0
        self.y_hat = 0.0
        self.bs  = batch_size
        self.lr = learning_rate
        self.epochs =epochs
        self.class_weights = None 
        self.random = random_state
        
        self.loss = 0

    def sigmoid(self,z):
        return 1.0/(1 + np.exp(-z))
    
    def loss_log(self,y,y_hat,sample_weights):
        #included class weights
        base_LOSS =  -np.mean(sample_weights *  (y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)))
        l1_penalty = self.Lambda * np.sum(np.abs(self.weights)) 
        return base_LOSS + l1_penalty
    
    def gradients(self,X, y, y_hat,sample_weights):
        #included class weights
        m = X.shape[0]
        dw = 1/m * np.dot(X.T, sample_weights*(y_hat - y))+ self.Lambda * np.sign(self.weights) # dw + L1 regularization 
        db = np.mean(sample_weights*(y_hat - y))

        return dw, db   
    
    def train(self,X,y,Lambda=1,class_weights=None,):
        X = X.sample(frac=1, random_state=self.random).reset_index(drop=True)
        y = y.sample(frac=1, random_state=self.random).reset_index(drop=True)
        #Hyperparameter: 
        #Lambda --> L2 regularization 
        #class_weights --> sample weights
        
        self.Lambda = Lambda
        self.class_weights = class_weights
        #Added class weights to offset data class bias
        if self.class_weights is not None:
            sample_weights = np.vectorize(self.class_weights.get)(y.ravel()).reshape(-1, 1)

        else:
            sample_weights = np.ones_like(y)
        self.size,features =X.shape
        #Randomize initials bias and weights
        self.weights =0
        self.bias  =0
        # Reshaping y.
       
        y = y.values.reshape(self.size, 1)
        # Training loop.
        for epoch in range(self.epochs):
            for i in range((self.size-1)//self.bs + 1):
                # Defining batches. SGD.
                start_i = i*self.bs
                end_i = start_i + self.bs
                xb = X[start_i:end_i]
                yb = y[start_i:end_i]
                batch_weights = sample_weights[start_i:end_i]  
                # Calculating hypothesis/prediction.
                self.y_hat = self.sigmoid(np.dot(xb, self.weights) + self.bias)
                #print("Probs:", self.y_hat[:20])
                # Getting the gradients of loss w.r.t parameters.
                dw, db = self.gradients(xb, yb, self.y_hat, batch_weights)

                # Updating the parameters.
                self.weights -= self.lr*dw
                self.bias -= self.lr*db

            # Calculating/update loss log
            self.loss =self.loss_log(y, self.sigmoid(np.dot(X, self.weights) + self.bias), sample_weights)

           

        # returning weights, bias and losses(List).

        return self.weights, self.bias, self.loss
    
    def predict(self,X):
        #input X must be iterable 
        z = np.dot(X, self.weights) + self.bias
        y_hat = self.sigmoid(z)
        return np.round(y_hat)
    

lg = LogisticRegression(batch_size=100, learning_rate=0.9, epochs=30)

lg.train(X_train,y_train)
lg.predict(X_test)


ValueError: operands could not be broadcast together with shapes (100,) (100,5000) 

In [None]:
from sklearn import metrics 

y_pred =lg.predict(X_test)
print(metrics.classification_report(y_test, y_pred))


estimators = [
    ("1",LogisticRegression(batch_size=100, learning_rate=0.9, epochs=30,random_state=1)),
    ("2",LogisticRegression(batch_size=100, learning_rate=0.9, epochs=30,random_state =20)),
    ("3",LogisticRegression(batch_size=100, learning_rate=0.9, epochs=30,random_state=12)),
    ("4",LogisticRegression(batch_size=100, learning_rate=0.9, epochs=30,random_state=9)),
    ("5",LogisticRegression(batch_size=100, learning_rate=0.9, epochs=30,random_state=23)),
    ("6",LogisticRegression(batch_size=100, learning_rate=0.9, epochs=30,random_state=112))
    ]

              precision    recall  f1-score   support

           0       0.74      0.71      0.73      2126
           1       0.56      0.59      0.58      1311

    accuracy                           0.67      3437
   macro avg       0.65      0.65      0.65      3437
weighted avg       0.67      0.67      0.67      3437

