In [18]:
import pandas as pd
import numpy as np
import pickle
from collections import Counter
from math import log
import random

# Load and preprocess data
def preprocess_data(filename):
    """Read a CSV file and return its features and labels as arrays.

    The ``Skill`` and ``Interests`` columns are replaced by their pandas
    categorical codes, and ``Experience_Level`` is mapped to 0/1/2 for
    ``Beginner``/``Intermediate``/``Advanced``. Values of
    ``Experience_Level`` outside that mapping come back as NaN, because
    ``Series.map`` leaves unmapped entries missing.

    Args:
        filename: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        A tuple ``(X, y)``: ``X`` holds every column except the last one,
        ``y`` holds the last column.
    """
    experience_order = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}

    frame = pd.read_csv(filename)

    # Free-text categories become integer category codes.
    for column in ('Skill', 'Interests'):
        frame[column] = frame[column].astype('category').cat.codes

    # Experience is ordinal, so it gets an explicit, ordered encoding.
    frame['Experience_Level'] = frame['Experience_Level'].map(experience_order)

    # The last column is treated as the label; everything before it is a feature.
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1]
    return features.values, target.values

# Split data
def train_test_split(X, y, test_size=0.2, random_state=42):
    """Shuffle the samples and split them into train and test partitions.

    Note that this seeds NumPy's *global* random generator with
    ``random_state``, so later ``np.random`` calls continue from that seed.

    Args:
        X: Array of samples, indexable by an integer index array.
        y: Array of labels aligned with ``X``.
        test_size: Fraction of samples placed in the test partition; the
            count is truncated with ``int``.
        random_state: Seed passed to ``np.random.seed``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    np.random.seed(random_state)

    n_samples = len(X)
    order = np.arange(n_samples)
    np.random.shuffle(order)

    # The first ``n_test`` shuffled positions form the test set.
    n_test = int(n_samples * test_size)
    held_out, kept = order[:n_test], order[n_test:]

    return X[kept], X[held_out], y[kept], y[held_out]

# Accuracy
def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Inputs are converted to NumPy arrays first. Without that, comparing two
    plain lists with ``==`` yields a single bool rather than an
    element-wise result, which would make the score 0.0 or 1.0.

    Args:
        y_true: Sequence or array of ground-truth labels.
        y_pred: Sequence or array of predicted labels, same shape as
            ``y_true``.

    Returns:
        The mean of the element-wise equality, a float in ``[0, 1]``
        (``nan`` for empty inputs, as ``np.mean`` of an empty array).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

# ---------------------------------------------
# Naive Bayes (Updated for multiple classes)
class MultinomialNBScratch:
    """Multinomial naive Bayes with add-one smoothing, for any number of classes.

    After ``fit`` the instance holds:
      * ``classes`` - the unique labels seen in training,
      * ``class_log_prior`` - dict mapping label -> log prior,
      * ``feature_log_prob`` - dict mapping label -> per-feature log probability.
    """

    def fit(self, X, y):
        """Estimate class priors and smoothed per-feature log probabilities.

        Args:
            X: 2-D array of feature values, one row per sample.
            y: 1-D array of labels aligned with the rows of ``X``.
        """
        self.classes = np.unique(y)
        self.class_log_prior = {}
        self.feature_log_prob = {}

        n_samples = len(X)
        n_features = X.shape[1]

        for label in self.classes:
            rows = X[y == label]

            # Prior: share of the training rows carrying this label.
            self.class_log_prior[label] = log(len(rows) / n_samples)

            # Likelihood: per-feature totals with add-one smoothing, normalised
            # by the class total plus the number of features.
            per_feature = np.sum(rows, axis=0)
            class_total = np.sum(per_feature)
            smoothed = (per_feature + 1) / (class_total + n_features)
            self.feature_log_prob[label] = np.log(smoothed)

    def predict(self, X):
        """Return the highest-scoring class for every row of ``X``.

        A class score is its log prior plus the sum of each feature value
        multiplied by that feature's log probability.
        """
        chosen = []
        for row in X:
            scores = {
                label: self.class_log_prior[label]
                + np.sum(row * self.feature_log_prob[label])
                for label in self.classes
            }
            chosen.append(max(scores, key=scores.get))
        return np.array(chosen)

# ---------------------------------------------
# Logistic Regression (Updated for multiple classes using One-vs-Rest)
class LogisticRegressionScratch:
    """One-vs-rest logistic regression trained by batch gradient descent.

    One binary classifier is trained per class. Each classifier is a plain
    weight vector ``theta`` applied as ``X @ theta``; there is no separate
    intercept term.

    Attributes:
        lr: Gradient-descent step size.
        epochs: Number of full-batch updates per binary classifier.
        models: List of ``(class_label, theta)`` pairs, in the order of
            ``classes``. Empty until ``fit`` has run.
        classes: Sorted unique labels seen by the latest ``fit`` call.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.models = []
        self.classes = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Train one binary classifier per class, replacing any earlier fit.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of labels aligned with the rows of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes = np.unique(y)

        # Build into a fresh list so that refitting never leaves classifiers
        # from a previous call behind; ``predict`` relies on ``models`` and
        # ``classes`` having the same length and order.
        models = []

        # One-vs-Rest approach for multi-class classification
        for c in self.classes:
            # Binary target: 1 for this class, 0 for every other class.
            y_binary = np.where(y == c, 1, 0)

            theta = np.zeros(X.shape[1])

            for _ in range(self.epochs):
                h = self.sigmoid(np.dot(X, theta))
                # Gradient of the mean log-loss with respect to theta.
                gradient = np.dot(X.T, (h - y_binary)) / y.size
                theta -= self.lr * gradient

            models.append((c, theta))

        self.models = models

    def predict(self, X):
        """Return, for each row of ``X``, the class whose classifier scores highest.

        Raises:
            ValueError: If ``fit`` has not been called yet.
        """
        if not self.models:
            raise ValueError("Model not trained yet")

        X = np.asarray(X, dtype=float)

        # One probability column per class, in the same order as ``classes``.
        prob_matrix = np.column_stack(
            [self.sigmoid(np.dot(X, theta)) for _, theta in self.models]
        )
        return self.classes[np.argmax(prob_matrix, axis=1)]

# ---------------------------------------------
# Decision Tree (Updated for multiple classes)
class DecisionTreeScratch:
    """Binary decision-tree classifier that splits on Gini impurity.

    The fitted tree is stored in ``self.tree``. An internal node is a tuple
    ``(feature_index, threshold, left_subtree, right_subtree)``; anything
    that is not a tuple is a leaf holding the predicted label. Samples with
    ``x[feature_index] <= threshold`` go left, the rest go right.

    Args:
        max_depth: Depth at which growth stops and a leaf is emitted.
        min_samples_split: A node with fewer samples than this becomes a
            leaf; the same number is also the minimum size required on each
            side of a candidate split.
    """

    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and label array ``y``."""
        self.tree = self._build_tree(X, y)

    def _gini(self, y):
        """Return the Gini impurity of the label collection ``y``."""
        n = len(y)
        return 1 - sum((count / n) ** 2 for count in Counter(y).values())

    def _majority(self, y):
        """Return the most frequent label in ``y``."""
        return Counter(y).most_common(1)[0][0]

    def _best_split(self, X, y):
        """Find the ``(feature, threshold)`` pair with the largest Gini gain.

        Returns ``(None, None)`` when no candidate satisfies the size limits.
        """
        top_gain = -1
        chosen = (None, None)
        parent_impurity = self._gini(y)
        n_total = len(y)

        for col in range(X.shape[1]):
            column = X[:, col]
            # Every distinct value of the feature is tried as a threshold.
            for threshold in np.unique(column):
                goes_left = column <= threshold
                goes_right = column > threshold

                n_left = np.count_nonzero(goes_left)
                n_right = np.count_nonzero(goes_right)

                # Both sides must be non-empty and meet the minimum size.
                too_small = (n_left < self.min_samples_split
                             or n_right < self.min_samples_split)
                if too_small or n_left == 0 or n_right == 0:
                    continue

                left_labels = y[goes_left]
                right_labels = y[goes_right]

                # Impurity of the children, weighted by their share of samples.
                weighted = (
                    len(left_labels) / n_total * self._gini(left_labels)
                    + len(right_labels) / n_total * self._gini(right_labels))
                gain = parent_impurity - weighted

                if gain > top_gain:
                    top_gain = gain
                    chosen = (col, threshold)

        return chosen

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y``."""
        reached_limit = depth >= self.max_depth
        if reached_limit or len(set(y)) == 1 or len(y) < self.min_samples_split:
            return self._majority(y)

        feature, value = self._best_split(X, y)
        if feature is None:
            # No admissible split was found, so this node becomes a leaf.
            return self._majority(y)

        goes_left = X[:, feature] <= value
        goes_right = X[:, feature] > value

        left_branch = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_branch = self._build_tree(X[goes_right], y[goes_right], depth + 1)
        return (feature, value, left_branch, right_branch)

    def _predict_one(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        while isinstance(node, tuple):
            feature, value, left, right = node
            node = left if x[feature] <= value else right
        return node

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        root = self.tree
        return np.array([self._predict_one(row, root) for row in X])

# ---------------------------------------------
# Random Forest (Updated for multiple classes)
class RandomForestScratch:
    """Bagged ensemble of ``DecisionTreeScratch`` trees with majority voting.

    Every tree is trained on a bootstrap sample of the rows and on one
    random subset of the columns, chosen once per tree.

    Args:
        n_estimators: Number of trees to train.
        max_depth: ``max_depth`` passed to every tree.
        max_features: Number of columns each tree sees. ``None`` means
            ``int(sqrt(n_features))`` (at least 1), resolved on every
            ``fit`` call.
        random_state: Optional seed for a private random generator. With
            the default ``None`` the global ``np.random`` state is used.

    Attributes:
        trees: List of ``(tree, feature_idx)`` pairs from the latest fit.
        max_features_: Number of columns per tree used by the latest fit.
    """

    def __init__(self, n_estimators=10, max_depth=5, max_features=None,
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, replacing any earlier fit.

        Args:
            X: 2-D array-like of features, one row per sample.
            y: 1-D array-like of labels aligned with the rows of ``X``.

        Raises:
            ValueError: Raised by NumPy's ``choice`` when ``max_features``
                exceeds the number of columns in ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = len(X), X.shape[1]

        # Resolve the subset size into a separate attribute rather than
        # overwriting the hyperparameter; otherwise a refit on data with a
        # different number of columns would reuse the stale value.
        if self.max_features is None:
            n_sub = max(1, int(np.sqrt(n_features)))
        else:
            n_sub = self.max_features
        self.max_features_ = n_sub

        # ``np.random`` and ``RandomState`` both expose ``choice``.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        trees = []
        for _ in range(self.n_estimators):
            # Bootstrap sample: draw as many rows as there are, with replacement.
            idx = rng.choice(n_samples, n_samples, replace=True)
            X_sample, y_sample = X[idx], y[idx]

            # Random column subset, remembered so predict can apply it again.
            feature_idx = rng.choice(n_features, n_sub, replace=False)
            X_sample = X_sample[:, feature_idx]

            tree = DecisionTreeScratch(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            trees.append((tree, feature_idx))

        self.trees = trees

    def predict(self, X):
        """Return the majority-vote label of the trees for every row of ``X``.

        Raises:
            ValueError: If ``fit`` has not been called yet.
        """
        if not self.trees:
            raise ValueError("Model not trained yet")

        X = np.asarray(X)
        all_preds = [tree.predict(X[:, feature_idx])
                     for tree, feature_idx in self.trees]

        # zip(*all_preds) regroups the per-tree outputs into per-sample votes.
        return np.array([Counter(votes).most_common(1)[0][0]
                         for votes in zip(*all_preds)])

# ---------------------------------------------
# K-Nearest Neighbors (Updated for multiple classes)
class KNNScratch:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Args:
        k: Number of neighbours that vote. If ``k`` exceeds the number of
            training samples, all training samples vote.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k=5):
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X, y):
        """Store the training data.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of labels aligned with the rows of ``X``.

        Raises:
            ValueError: If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        # Floats avoid wrap-around when subtracting small or unsigned integer
        # dtypes; arrays (not lists) are needed for index-array lookups later.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X
        self.y_train = y

    def _euclidean(self, a, b):
        """Return the Euclidean distance between two points ``a`` and ``b``."""
        return np.sqrt(np.sum((a - b) ** 2))

    def predict(self, X):
        """Return the majority label among the ``k`` nearest training points.

        Args:
            X: 2-D array-like of numeric features, one row per query sample.

        Raises:
            ValueError: If ``fit`` has not been called yet.
        """
        if not hasattr(self, "X_train"):
            raise ValueError("Model not trained yet")

        X = np.asarray(X, dtype=float)
        preds = []
        for x in X:
            # Distances from this query to every training row in one
            # vectorised step instead of a Python loop over the rows.
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))

            k_indices = np.argsort(distances)[:self.k]
            k_labels = self.y_train[k_indices]

            preds.append(Counter(k_labels).most_common(1)[0][0])

        return np.array(preds)

# ---------------------------------------------
# Save model to file
def save_model(model, filename):
    """Pickle ``model`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        model: Any picklable object.
        filename: Destination path.
    """
    with open(filename, mode="wb") as sink:
        pickle.Pickler(sink).dump(model)

# ---------------------------------------------
# Main driver
if __name__ == "__main__":
    # End-to-end run: load the dataset, hold out 20% for testing, then train,
    # score and pickle each from-scratch model in turn.

    # Load and preprocess data
    # NOTE(review): "/content/..." looks like a Colab path - adjust when running elsewhere.
    X, y = preprocess_data("/content/career_data_new.csv")

    # Split data
    # train_test_split seeds NumPy's global RNG with random_state, so the
    # np.random draws made later by the random forest continue from that seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize models
    # The keys double as display names and as pickle file-name prefixes below.
    models = {
        "naive_bayes": MultinomialNBScratch(),
        "logistic_regression": LogisticRegressionScratch(lr=0.1, epochs=1000),
        "decision_tree": DecisionTreeScratch(max_depth=5),
        "random_forest": RandomForestScratch(n_estimators=20, max_depth=5),
        "knn": KNNScratch(k=5)
    }

    # Train and evaluate models
    results = {}  # model key -> accuracy on the held-out test split
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[name] = acc
        print(f"{name.replace('_', ' ').title()} Accuracy: {acc:.2f}")
        # Written relative to the current working directory as "<key>_model.pkl".
        save_model(model, f"{name}_model.pkl")

    # Print summary
    # Names are left-aligned in a 20-character column so the scores line up.
    print("\nModel Performance Summary:")
    for name, acc in results.items():
        print(f"{name.replace('_', ' ').title():<20}: {acc:.2f}")

Training naive_bayes...
Naive Bayes Accuracy: 0.51
Training logistic_regression...
Logistic Regression Accuracy: 0.48
Training decision_tree...
Decision Tree Accuracy: 0.92
Training random_forest...
Random Forest Accuracy: 0.56
Training knn...
Knn Accuracy: 0.99

Model Performance Summary:
Naive Bayes         : 0.51
Logistic Regression : 0.48
Decision Tree       : 0.92
Random Forest       : 0.56
Knn                 : 0.99
