Import Packages :

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import SelfTrainingClassifier

Dataset Load :

In [3]:
# Short feature names for the survey questions used by the later cells.
# Keys must equal the CSV headers exactly once surrounding whitespace
# has been stripped; headers that do not match are left untouched.
survey_column_names = {
    '2. How interested are you in the event topic?': 'interest',
    '3. How close are you to the event location?': 'proximity',
    '4. How many similar events have you attended in the past year?': 'past_attendance',
    '5. How much free time do you have during the event timing?': 'free_time',
    '6. Are you willing to attend this event?': 'willingness',
}

# Read the survey export, trim whitespace around every header, then swap
# the long question texts for the short names above.
df = (
    pd.read_csv("data.csv")
    .rename(columns=str.strip)
    .rename(columns=survey_column_names)
)

# Preview the first rows
df.head()


Unnamed: 0,Timestamp,1. Name,interest,proximity,past_attendance,free_time,willingness
0,6/21/2025 9:52:30,Luffy,2,1,5,2,Yes
1,6/21/2025 9:53:07,S.Shalini,2,2,1,1,Yes
2,6/21/2025 9:53:12,Anachi,2,1,100,2,Yes
3,6/21/2025 9:53:57,Madhumitha,2,1,2,2,Yes
4,6/21/2025 9:54:16,Anantha Krishnan R,2,1,2,2,Yes


Dataset Preprocess :

In [4]:
# Rescale the 1–2 survey answers onto 0–1: (value - min) / (max - min).
# CAUTION: this cell rewrites `df` in place and is not idempotent. Running
# it a second time shifts the scales again and turns the already-encoded
# 0/1 labels into NaN. Re-run the load cell first if you need to repeat it.
scale_columns = ['interest', 'proximity', 'free_time']
df[scale_columns] = (df[scale_columns] - 1) / (2 - 1)

# Encode the target: 'Yes' -> 1, 'No' -> 0. Any other value (including a
# missing answer) becomes NaN.
willingness_codes = {'Yes': 1, 'No': 0}
df['willingness'] = df['willingness'].map(willingness_codes)

# Feature matrix and label vector as plain NumPy arrays
feature_columns = ['interest', 'proximity', 'past_attendance', 'free_time']
X = df[feature_columns].to_numpy()
y = df['willingness'].to_numpy()


Mark unlabeled samples and scale features:

In [5]:
# SelfTrainingClassifier uses the label -1 to mean "unlabeled", so every
# missing (NaN) label is replaced with -1; real labels pass through as-is.
y_semi = np.where(np.isnan(y), -1, y)

# Standardise the features: learn per-column mean and variance from all
# rows (labeled and unlabeled), then apply the transformation.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

Train the model :

In [9]:
# Supervised learner that the self-training loop refits on each round
base_model = LogisticRegression()

# Wrap the learner and fit on the partially labeled data (-1 = unlabeled).
# With criterion='k_best', each round pseudo-labels the 3 unlabeled samples
# the learner is most confident about; the loop stops after 10 rounds at most.
self_training_model = SelfTrainingClassifier(
    base_model,
    criterion='k_best',
    k_best=3,
    max_iter=10,
).fit(X_scaled, y_semi)

print("✅ Self-training complete!")

✅ Self-training complete!


Prediction :

In [10]:
# Score every row (labeled and unlabeled) and keep the result next to the
# survey answers.
predicted = self_training_model.predict(X_scaled)
df['predicted_attendance'] = predicted

# Persist the fitted model together with its scaler (new inputs must be
# scaled with the same statistics before they are scored).
for fitted_object, pickle_path in (
    (self_training_model, "model.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(fitted_object, pickle_path)

# Export the annotated survey table without the row index
df.to_csv("predictions.csv", index=False)

print("✅ Model and predictions saved as model.pkl and predictions.csv")

✅ Model and predictions saved as model.pkl and predictions.csv


Test the model:

In [12]:
# Reload the artifacts written by the prediction cell.
# NOTE: joblib files are pickle-based, so only load files you created yourself.
model, scaler = (joblib.load(path) for path in ("model.pkl", "scaler.pkl"))

# 👇 Define your test input
# Format: [interest (0-1), proximity (0-1), past_attendance (integer), free_time (0-1)]
test_input = np.array([[0.0, 0.0, 0, 0.0]])  # You can change these values

# Apply the same scaling the training data went through
test_scaled = scaler.transform(test_input)

# Single-row input, so take the first (only) predicted label
prediction = model.predict(test_scaled)[0]

# Report the outcome
verdict = (
    "✅ The person is likely to ATTEND the event."
    if prediction == 1
    else "❌ The person is NOT likely to attend the event."
)
print(verdict)

✅ The person is likely to ATTEND the event.


Accuracy, Precision, Recall, F1 (for classification):

In [13]:
# In-sample evaluation: score the model on the rows that carried a real
# label (y_semi != -1). These same rows were used to fit the model, so the
# numbers are optimistic and say nothing about unseen respondents.
labeled_mask = y_semi != -1
X_labeled = X_scaled[labeled_mask]
y_true = y_semi[labeled_mask]
y_pred = self_training_model.predict(X_labeled)

# Classification metrics (positive class = 1, i.e. "willing to attend").
# Reading tip: recall of 1.0 with precision equal to accuracy means the
# model predicted class 1 for every labeled row; check the class balance
# before trusting the F1 score.
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Display results
print("📊 Model Evaluation on Labeled Data:")
print(f"✅ Accuracy:  {acc:.4f}")
print(f"✅ Precision: {prec:.4f}")
print(f"✅ Recall:    {rec:.4f}")
print(f"✅ F1 Score:  {f1:.4f}")

📊 Model Evaluation on Labeled Data:
✅ Accuracy:  0.7179
✅ Precision: 0.7179
✅ Recall:    1.0000
✅ F1 Score:  0.8358
