# Lab 2 - Probability in Machine Learning

## Part 1: Coin Flip Probability Example

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Simulating 1000 coin flips, 0 for 'tails' and 1 for 'heads'
coin_flips = np.random.choice(['heads', 'tails'], size=1000)
df_coin = pd.DataFrame({'flip_result': coin_flips})

In [3]:
flip_counts = df_coin['flip_result'].value_counts()
print(flip_counts)

flip_result
tails    513
heads    487
Name: count, dtype: int64


In [4]:
p_heads = flip_counts['heads'] / len(df_coin)
p_tails = flip_counts['tails'] / len(df_coin)
print(f"Probability of Heads: {p_heads}")
print(f"Probability of Tails: {p_tails}")

Probability of Heads: 0.487
Probability of Tails: 0.513


## Part 2: Bayesian Email Classifier

In [5]:
# The following code snippet creates a simulated email classification (spam and not spam) dataset with 1000 data points.

import pandas as pd
import numpy as np

# Sample size
n_samples = 1000

# Simulating data
np.random.seed(42)
data = {
    'email_length': np.random.normal(100, 20, n_samples).astype(int),
    'contains_free': np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3]),
    'contains_winner': np.random.choice([0, 1], size=n_samples, p=[0.8, 0.2]),
    'time_of_day': np.random.choice(['morning', 'afternoon', 'evening', 'night'], n_samples),
    'label': np.random.choice(['spam', 'ham'], n_samples, p=[0.4, 0.6])
}

df = pd.DataFrame(data)

# Saving the dataset
df.to_csv('simulated_email_dataset.csv', index=False)

In [7]:
df_emails = pd.read_csv('simulated_email_dataset.csv')
df_emails.head()

Unnamed: 0,email_length,contains_free,contains_winner,time_of_day,label
0,109,0,0,morning,ham
1,97,0,0,morning,spam
2,112,0,0,morning,spam
3,130,1,0,afternoon,ham
4,95,0,1,afternoon,spam


## Task 2: Data Preprocessing

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate features and target variable
X = df_emails.drop('label', axis=1)
y = df_emails['label']

# Define numerical and categorical features
numerical_features = ['email_length']
categorical_features = ['contains_free', 'contains_winner', 'time_of_day']

# Create transformers for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

# Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit and transform the data
X_preprocessed = preprocessor.fit_transform(X)

# Display the preprocessed data
pd.DataFrame(X_preprocessed, columns=numerical_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))

Unnamed: 0,email_length,contains_free_1,contains_winner_1,time_of_day_evening,time_of_day_morning,time_of_day_night
0,0.465685,0.0,0.0,0.0,1.0,0.0
1,-0.146723,0.0,0.0,0.0,1.0,0.0
2,0.618787,0.0,0.0,0.0,1.0,0.0
3,1.537399,1.0,0.0,0.0,0.0,0.0
4,-0.248791,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
995,-0.299825,0.0,1.0,0.0,0.0,1.0
996,1.792569,0.0,0.0,0.0,0.0,1.0
997,0.618787,0.0,0.0,1.0,0.0,0.0
998,-0.606029,0.0,1.0,0.0,0.0,0.0


## Task 3: Probability Calculation

In [10]:
label_counts = df_emails['label'].value_counts(normalize=True)

print("Probability of Spam Emails:", label_counts['spam'])
print("Probability of Ham Emails:", label_counts['ham'])

Probability of Spam Emails: 0.409
Probability of Ham Emails: 0.591


## Task 4: Implementing Bayes' Theorem

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numerical_features = ['email_length']
categorical_features = ['contains_free', 'contains_winner', 'time_of_day']

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_preprocessed = preprocessor.fit_transform(X)

## Task 5: Model Testing

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X = df_emails.drop('label', axis=1)
y = df_emails['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform the preprocessor on the training set
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Transform the testing set
X_test_preprocessed = preprocessor.transform(X_test)

# Load and train a logistic regression model on the training set
model = LogisticRegression(random_state=42)
model.fit(X_train_preprocessed, y_train)

# Predict labels on the testing set
y_pred = model.predict(X_test_preprocessed)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.61
Classification Report:
               precision    recall  f1-score   support

         ham       0.61      1.00      0.76       122
        spam       0.00      0.00      0.00        78

    accuracy                           0.61       200
   macro avg       0.30      0.50      0.38       200
weighted avg       0.37      0.61      0.46       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Task 6: Discussion

1. Probability Distribution for Email Classifier:
I would choose the Bernoulli distribution for an email classifier. This distribution is appropriate for binary outcomes, such as spam or not spam, aligning with features like contains_free and contains_winner. For continuous features like email_length, a Gaussian (normal) distribution could be suitable.


2. Bayesian Updating and Improved Accuracy:
Benefit: Bayesian updating continuously refines the model with new evidence, adapting to evolving patterns in spam emails.
How it works: The model's initial probabilities are updated as new emails are received. Bayesian updating allows the model to incorporate new information, adjust its predictions, and improve accuracy over time.
Flexibility: This approach is particularly useful in dynamic environments where spam characteristics may change, ensuring the model remains effective.


3. Limitations of the Model:
Simplistic Features: The model relies on relatively simple features like email length and keyword indicators.
Limited Sample Size: The dataset used for training and testing is small; a larger and more diverse dataset would enhance model robustness.
Static Features: The model does not dynamically adapt to changes in spam characteristics over time.
Assumption of Independence: The model assumes independence between features, which might not hold true in real-world scenarios.
Lack of Evaluation Metrics: The model's performance is assessed only based on accuracy; additional metrics like precision, recall, and F1-score would provide a more comprehensive evaluation.