# **Important Libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# **Load and Preprocess the dataset**

In [2]:
df = pd.read_csv("/content/train.csv")
df.head()

Unnamed: 0,message_id,num_links,num_words,has_offer,sender_score,all_caps,is_spam
0,1,3,98,1,0.718607,0,0
1,2,0,170,0,0.698901,1,0
2,3,0,38,0,0.620466,0,0
3,4,0,116,0,0.701755,0,0
4,5,3,89,1,0.583621,1,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19100 entries, 0 to 19099
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   message_id    19100 non-null  int64  
 1   num_links     19100 non-null  int64  
 2   num_words     19100 non-null  int64  
 3   has_offer     19100 non-null  int64  
 4   sender_score  19100 non-null  float64
 5   all_caps      19100 non-null  int64  
 6   is_spam       19100 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 1.0 MB


In [4]:
# Check for null values
df.isna().sum()

Unnamed: 0,0
message_id,0
num_links,0
num_words,0
has_offer,0
sender_score,0
all_caps,0
is_spam,0


In [5]:
# Check for duplicates
df.duplicated().sum()

np.int64(0)

In [6]:
df['is_spam'].value_counts() # Dataset is imbalanced

Unnamed: 0_level_0,count
is_spam,Unnamed: 1_level_1
0,17354
1,1746


In [7]:
# Drop unnecessary columns
df.drop(['message_id'], axis=1,inplace=True)

In [8]:
# Feature Engineering
df['links_per_word'] = df['num_links'] / (df['num_words'] + 1)
df.head()

Unnamed: 0,num_links,num_words,has_offer,sender_score,all_caps,is_spam,links_per_word
0,3,98,1,0.718607,0,0,0.030303
1,0,170,0,0.698901,1,0,0.0
2,0,38,0,0.620466,0,0,0.0
3,0,116,0,0.701755,0,0,0.0
4,3,89,1,0.583621,1,1,0.033333


In [9]:
df.describe()

Unnamed: 0,num_links,num_words,has_offer,sender_score,all_caps,is_spam,links_per_word
count,19100.0,19100.0,19100.0,19100.0,19100.0,19100.0,19100.0
mean,1.498272,109.65178,0.303298,0.694174,0.097853,0.091414,0.018992
std,1.221115,51.974463,0.459695,0.188285,0.297124,0.288204,0.023753
min,0.0,20.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,65.0,0.0,0.56699,0.0,0.0,0.005435
50%,1.0,110.0,0.0,0.699204,0.0,0.0,0.012346
75%,2.0,155.0,1.0,0.8342,0.0,0.0,0.02439
max,9.0,199.0,1.0,1.0,1.0,1.0,0.318182


In [10]:
df.corr()

Unnamed: 0,num_links,num_words,has_offer,sender_score,all_caps,is_spam,links_per_word
num_links,1.0,0.003194,-0.004057,-0.002424,0.000682,0.322846,0.657699
num_words,0.003194,1.0,-0.005518,0.004276,0.001451,0.000849,-0.488613
has_offer,-0.004057,-0.005518,1.0,-0.002169,-0.001098,0.368503,-0.006697
sender_score,-0.002424,0.004276,-0.002169,1.0,0.018184,-0.08821,-0.007697
all_caps,0.000682,0.001451,-0.001098,0.018184,1.0,0.207979,-0.002816
is_spam,0.322846,0.000849,0.368503,-0.08821,0.207979,1.0,0.210271
links_per_word,0.657699,-0.488613,-0.006697,-0.007697,-0.002816,0.210271,1.0


In [11]:
X = df.drop(['is_spam'], axis=1).values
y = df['is_spam'].values

In [12]:
X

array([[3.00000000e+00, 9.80000000e+01, 1.00000000e+00, 7.18607000e-01,
        0.00000000e+00, 3.03030303e-02],
       [0.00000000e+00, 1.70000000e+02, 0.00000000e+00, 6.98901226e-01,
        1.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.80000000e+01, 0.00000000e+00, 6.20465534e-01,
        0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 1.45000000e+02, 0.00000000e+00, 6.00568655e-01,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.78000000e+02, 0.00000000e+00, 6.75468409e-01,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.00000000e+01, 0.00000000e+00, 6.97732338e-01,
        0.00000000e+00, 0.00000000e+00]])

In [13]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
# Stratified train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [15]:
# Verify stratification
print("Training set class distribution:", np.bincount(y_train))
print("Validation set class distribution:", np.bincount(y_val))

Training set class distribution: [13883  1397]
Validation set class distribution: [3471  349]


In [16]:
scaler = scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# **Logistic Regression**

In [20]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, max_iterations=1000, lambda_reg=0.1,
                 reg_type='l2', class_weight={0: 1.5, 1: 15}):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.lambda_reg = lambda_reg
        self.reg_type = reg_type.lower()
        self.class_weight = class_weight
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Sigmoid activation with clipping to avoid overflow."""
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def log_loss(self, y_true, y_pred):
        """Binary cross-entropy loss with regularization."""
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

        # Add regularization penalty
        if self.reg_type == 'l1':
            reg_term = self.lambda_reg * np.sum(np.abs(self.weights))
        elif self.reg_type == 'l2':
            reg_term = self.lambda_reg * np.sum(self.weights ** 2) / 2
        else:
            raise ValueError("reg_type must be 'l1' or 'l2'")

        return loss + reg_term

    def _compute_sample_weights(self, y):
        if self.class_weight == 'balanced':
            # Inverse frequency weighting
            classes, counts = np.unique(y, return_counts=True)
            total = len(y)
            weight_dict = {cls: total / (2 * count) for cls, count in zip(classes, counts)}
        elif isinstance(self.class_weight, dict):
            # Manual class weights
            weight_dict = self.class_weight
        else:
            # No weighting
            return np.ones_like(y)

        return np.array([weight_dict[label] for label in y])

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        sample_weights = self._compute_sample_weights(y)

        for _ in range(self.max_iterations):
            # Forward pass
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(linear_model)

            # Compute weighted error
            error = y_pred - y
            weighted_error = error * sample_weights

            # Regularization gradient
            if self.reg_type == 'l1':
                reg_grad = self.lambda_reg * np.sign(self.weights)
            elif self.reg_type == 'l2':
                reg_grad = self.lambda_reg * self.weights
            else:
                raise ValueError("reg_type must be 'l1' or 'l2'")

            # Gradients
            dw = (1 / n_samples) * np.dot(X.T, weighted_error) + (reg_grad / n_samples)
            db = (1 / n_samples) * np.sum(weighted_error)

            # Parameter update
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
      """Predict probabilities for input samples."""
      return self.sigmoid(np.dot(X, self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
      """Predict binary class labels based on a threshold."""
      return (self.predict_proba(X) >= threshold).astype(int)


In [21]:
def grid_search(X_train, y_train, X_val, y_val, param_grid):
    best_log_loss = float('inf')
    best_params = None
    best_model = None

    # Try every combination of parameters
    for lr in param_grid['learning_rate']:
        for lam in param_grid['lambda_reg']:
            for max_iter in param_grid['max_iterations']:
                for reg_type in param_grid['reg_type']:
                    for class_weight in param_grid['class_weights']:

                        # Create model with given hyperparameters
                        model = LogisticRegression(
                            learning_rate=lr,
                            max_iterations=max_iter,
                            lambda_reg=lam,
                            reg_type=reg_type,
                            class_weight = class_weight
                        )

                        # Train model
                        model.fit(X_train, y_train)

                        # Evaluate on validation set
                        y_pred_proba = model.predict_proba(X_val)
                        log_loss = model.log_loss(y_val, y_pred_proba)
                        print(f"LR: {lr}, Lambda: {lam}, MaxIter: {max_iter}, Reg: {reg_type}, Class Weight: {class_weight} ,LogLoss: {log_loss:.4f}")

                        # Track best model
                        if log_loss < best_log_loss:
                            best_log_loss = log_loss
                            best_params = {
                                'learning_rate': lr,
                                'lambda_reg': lam,
                                'max_iterations': max_iter,
                                'reg_type': reg_type,
                                'class_weight': class_weight
                            }
                            best_model = model

    return best_model, best_params, best_log_loss

In [22]:
param_grid = {
    'learning_rate': [ 0.03, 0.05, 0.01, 0.1,0.2],
    'lambda_reg': [ 0.0001, 0.001, 0.01],
    'max_iterations': [3000, 4000, 5000, 6000],
    'reg_type': ['L1','L2'],
    'class_weights': [ None, 'balanced', {0: 3, 1: 6}]
}

best_model, best_params, best_log_loss = grid_search(
    X_train_scaled, y_train, X_val_scaled, y_val, param_grid
)

print("\n Best Parameters:", best_params)
print(f" Best Validation LogLoss: {best_log_loss:.4f}")

LR: 0.03, Lambda: 0.0001, MaxIter: 3000, Reg: L1, Class Weight: None ,LogLoss: 0.1658
LR: 0.03, Lambda: 0.0001, MaxIter: 3000, Reg: L1, Class Weight: balanced ,LogLoss: 0.3167
LR: 0.03, Lambda: 0.0001, MaxIter: 3000, Reg: L1, Class Weight: {0: 3, 1: 6} ,LogLoss: 0.1732
LR: 0.03, Lambda: 0.0001, MaxIter: 3000, Reg: L2, Class Weight: None ,LogLoss: 0.1657
LR: 0.03, Lambda: 0.0001, MaxIter: 3000, Reg: L2, Class Weight: balanced ,LogLoss: 0.3165
LR: 0.03, Lambda: 0.0001, MaxIter: 3000, Reg: L2, Class Weight: {0: 3, 1: 6} ,LogLoss: 0.1731
LR: 0.03, Lambda: 0.0001, MaxIter: 4000, Reg: L1, Class Weight: None ,LogLoss: 0.1637
LR: 0.03, Lambda: 0.0001, MaxIter: 4000, Reg: L1, Class Weight: balanced ,LogLoss: 0.3124
LR: 0.03, Lambda: 0.0001, MaxIter: 4000, Reg: L1, Class Weight: {0: 3, 1: 6} ,LogLoss: 0.1730
LR: 0.03, Lambda: 0.0001, MaxIter: 4000, Reg: L2, Class Weight: None ,LogLoss: 0.1635
LR: 0.03, Lambda: 0.0001, MaxIter: 4000, Reg: L2, Class Weight: balanced ,LogLoss: 0.3122
LR: 0.03, Lamb

In [23]:
best_model.fit(X_train_scaled, y_train)

In [24]:
y_val_pred = best_model.predict_proba(X_val_scaled)
log_loss = best_model.log_loss(y_val, y_val_pred)

print(f"Log Loss:  {log_loss:.4f}")

Log Loss:  0.1614


In [25]:
test = pd.read_csv("/content/test.csv")
test.head()

Unnamed: 0,message_id,num_links,num_words,has_offer,sender_score,all_caps
0,20000,1,37,0,0.633935,0
1,20001,0,174,0,0.577815,0
2,20002,0,116,0,0.396098,0
3,20003,0,112,0,0.64645,0
4,20004,2,92,1,0.947398,1


In [26]:
features = test.drop(['message_id'],axis=1)
ids = test['message_id']
ids

Unnamed: 0,message_id
0,20000
1,20001
2,20002
3,20003
4,20004
...,...
895,20895
896,20896
897,20897
898,20898


In [27]:
features['links_per_word'] = features['num_links'] / (features['num_words'] + 1)

In [28]:
features_scaled = scaler.transform(features)



In [29]:
features_scaled

array([[-0.40651901, -1.40094477, -0.65955384, -0.32096753, -0.33078237,
         0.31583616],
       [-1.22512872,  1.23959789, -0.65955384, -0.62052284, -0.33078237,
        -0.80432018],
       [-1.22512872,  0.12170392, -0.65955384, -1.59048397, -0.33078237,
        -0.80432018],
       ...,
       [-0.40651901,  0.73847301,  1.51617645,  0.26417161, -0.33078237,
        -0.51864273],
       [ 0.4120907 ,  1.23959789, -0.65955384, -0.60219107,  3.02313574,
        -0.31785228],
       [ 0.4120907 , -0.398695  , -0.65955384,  1.37254115, -0.33078237,
         0.14158962]])

In [30]:
predictions = best_model.predict_proba(features_scaled)
predictions

array([1.67184600e-03, 5.76400249e-04, 9.10285488e-04, 4.55581945e-04,
       7.33886864e-01, 5.81763304e-04, 1.08321071e-03, 3.27108521e-03,
       2.13526680e-02, 6.97813683e-03, 3.09763133e-04, 5.34817725e-02,
       5.32368025e-03, 4.38913605e-03, 2.08873374e-02, 5.17182507e-03,
       4.48608147e-01, 4.04147829e-03, 3.75727925e-03, 1.52411750e-03,
       1.61907346e-03, 2.18654416e-02, 5.05042560e-01, 2.42287669e-02,
       7.72869945e-02, 3.67144954e-04, 4.45609178e-02, 1.38246155e-01,
       5.31307662e-01, 9.30610609e-03, 3.39514637e-03, 4.48481375e-04,
       1.65710660e-02, 5.80056447e-03, 1.32008089e-03, 7.30141604e-01,
       3.55499171e-04, 7.37651918e-03, 2.86654631e-03, 3.22174224e-04,
       2.38542983e-02, 1.65531564e-03, 1.71178116e-04, 4.23912969e-04,
       9.19761082e-01, 9.68935545e-03, 7.97952550e-01, 3.90560690e-02,
       4.90189819e-04, 9.20805857e-03, 1.41132160e-03, 1.87377305e-01,
       3.05056975e-04, 3.69629644e-03, 3.13068486e-04, 5.42043697e-02,
      

In [31]:
submission = pd.DataFrame({
    'message_id': ids,
    'Calories': predictions
})
submission.head()

Unnamed: 0,message_id,Calories
0,20000,0.001672
1,20001,0.000576
2,20002,0.00091
3,20003,0.000456
4,20004,0.733887


In [32]:
submission.to_csv('submission1.csv', index=False)