In [181]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

### Decision Tree Classifier

In [184]:
class CustomDecisionTree:
    """Binary decision-tree classifier grown by entropy-based information gain.

    Internal nodes are dicts with the keys ``'feature'``, ``'threshold'``,
    ``'left'`` and ``'right'``; a leaf is the bare class label. Samples whose
    feature value is strictly below the threshold go left, all others go right.

    Labels must be non-negative integers because class counts are computed
    with ``np.bincount``.

    Parameters
    ----------
    max_depth : int or None
        Maximum number of splits on any root-to-leaf path. ``None`` grows the
        tree until no split yields a positive information gain.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None  # root node (dict), a bare label, or None before fit

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and integer labels ``y``.

        Inputs are coerced with ``np.asarray`` so plain lists and DataFrames
        work as well as arrays (column slicing and boolean masks need ndarrays).
        """
        self.tree = self._build_tree(np.asarray(X), np.asarray(y), depth=0)

    def predict(self, X):
        """Return an array holding the predicted label for each row of ``X``."""
        return np.array([self._classify(x, self.tree) for x in np.asarray(X)])

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns ``None`` when there are no samples, the majority label for a
        leaf, or a node dict for an internal split.
        """
        if len(y) == 0:
            return None

        majority = np.argmax(np.bincount(y))

        # Stop at the depth limit, or as soon as the node is pure: a pure node
        # cannot gain anything from a split, so scanning thresholds is wasted work.
        if depth == self.max_depth or np.all(y == y[0]):
            return majority

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            # No split improves on the parent's entropy.
            return majority

        left_idx = X[:, best_feature] < best_threshold
        right_idx = ~left_idx

        # Defensive guard: never recurse into an empty partition.
        if not left_idx.any() or not right_idx.any():
            return majority

        left_subtree = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right_subtree = self._build_tree(X[right_idx], y[right_idx], depth + 1)

        return {'feature': best_feature,
                'threshold': best_threshold,
                'left': left_subtree,
                'right': right_subtree}

    def _best_split(self, X, y):
        """Find the ``(feature, threshold)`` pair with the highest positive gain.

        Returns ``(None, None)`` when no candidate yields a strictly positive
        information gain.
        """
        best_gain = 0.0
        best_feature, best_threshold = None, None
        parent_entropy = self._entropy(y)  # identical for every candidate
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # The smallest observed value is skipped: nothing is strictly
            # below it, so that split always has an empty left side.
            for threshold in np.unique(column)[1:]:
                gain = self._information_gain(column, y, threshold, parent_entropy)
                if gain > best_gain:
                    best_gain, best_feature, best_threshold = gain, feature, threshold
        return best_feature, best_threshold

    def _information_gain(self, X_feature, y, threshold, parent_entropy=None):
        """Entropy reduction obtained by splitting ``y`` at ``X_feature < threshold``.

        ``parent_entropy`` may be supplied to avoid recomputing it for every
        candidate; when omitted it is computed from ``y``. Returns 0 when
        either side of the split would be empty.
        """
        left_idx = X_feature < threshold
        y_left, y_right = y[left_idx], y[~left_idx]
        if len(y_left) == 0 or len(y_right) == 0:
            return 0
        if parent_entropy is None:
            parent_entropy = self._entropy(y)
        n = len(y)
        child_entropy = ((len(y_left) / n) * self._entropy(y_left)
                         + (len(y_right) / n) * self._entropy(y_right))
        return parent_entropy - child_entropy

    def _entropy(self, y):
        """Shannon entropy (in bits) of the label distribution in ``y``."""
        counts = np.bincount(y)
        # Drop empty classes so log2(0) is never evaluated.
        proportions = counts[counts > 0] / len(y)
        return -np.sum(proportions * np.log2(proportions))

    def _classify(self, x, tree):
        """Walk ``tree`` from the given node down to a leaf for sample ``x``."""
        node = tree
        while isinstance(node, dict):
            branch = 'left' if x[node['feature']] < node['threshold'] else 'right'
            node = node[branch]
        return node

SyntaxError: expected ':' (2165519512.py, line 6)

In [161]:
# 2. Define Custom Random Forest Classifier
class CustomRandomForest:
    """Bagging ensemble of ``CustomDecisionTree`` models with majority voting.

    Each tree is trained on a bootstrap sample (rows drawn with replacement,
    same size as the training set). All features are visible to every tree.

    Parameters
    ----------
    n_estimators : int
        Number of trees to train.
    max_depth : int or None
        Depth limit forwarded to every tree.
    random_state : int or None
        Seed for the bootstrap sampling. ``None`` draws from NumPy's global
        random state, so ``np.random.seed(...)`` still controls the result.
    """

    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, replacing any previously fitted ones."""
        X, y = np.asarray(X), np.asarray(y)
        # Both the np.random module and a RandomState instance expose .choice,
        # so the sampling call below is the same in either case.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Build into a fresh list: appending to self.trees would make a second
        # fit() silently double the ensemble and keep the stale trees voting.
        trees = []
        n_samples = len(X)
        for _ in range(self.n_estimators):
            idxs = rng.choice(n_samples, size=n_samples, replace=True)
            tree = CustomDecisionTree(max_depth=self.max_depth)
            tree.fit(X[idxs], y[idxs])
            trees.append(tree)
        self.trees = trees

    def predict(self, X):
        """Return the majority-vote label for each row of ``X``.

        Raises
        ------
        ValueError
            If the forest holds no fitted trees.
        """
        if not self.trees:
            raise ValueError("CustomRandomForest has no fitted trees; call fit() first.")
        # Shape (n_trees, n_samples); each column holds the votes for one sample.
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.apply_along_axis(lambda votes: np.bincount(votes).argmax(),
                                   axis=0, arr=tree_predictions)

In [163]:
# 3. Define Custom Naive Bayes Classifier
class CustomNaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled per class as an independent normal distribution.
    Prediction picks the class with the highest log-posterior.

    Parameters
    ----------
    min_std : float
        Lower bound applied to every fitted standard deviation. A feature that
        is constant within a class has ``std == 0``, which would otherwise
        cause a division by zero and NaN scores for every class.
    """

    def __init__(self, min_std=1e-9):
        self.min_std = min_std
        self.class_probs = {}  # class label -> prior P(y)
        self.feature_probs = {}  # class label -> list of (mean, std) per feature

    def fit(self, X, y):
        """Estimate class priors and per-class feature means / standard deviations."""
        X, y = np.asarray(X), np.asarray(y)
        classes = np.unique(y)

        # Prior P(y): fraction of training samples in each class.
        self.class_probs = {cls: np.mean(y == cls) for cls in classes}

        # Likelihood parameters for P(x_i | y): one (mean, std) pair per feature.
        self.feature_probs = {}
        for cls in classes:
            X_class = X[y == cls]
            feature_probs_class = []
            for feature_idx in range(X.shape[1]):
                column = X_class[:, feature_idx]
                mean = np.mean(column)
                # Floor the spread so a constant feature cannot produce std == 0.
                std = max(np.std(column), self.min_std)
                feature_probs_class.append((mean, std))
            self.feature_probs[cls] = feature_probs_class

    def predict(self, X):
        """Return an array with the predicted class label for each row of ``X``."""
        return np.array([self._predict_sample(x) for x in np.asarray(X)])

    def _predict_sample(self, x):
        """Return the class with the highest log-posterior for one sample."""
        class_scores = {}
        for cls, prior in self.class_probs.items():
            score = np.log(prior)  # log P(y)
            # Sum log-densities directly instead of taking log(pdf): the pdf
            # underflows to 0 for samples far from the mean, and log(0) = -inf
            # would tie every class.
            for feature_idx, (mean, std) in enumerate(self.feature_probs[cls]):
                score += self._gaussian_log_pdf(x[feature_idx], mean, std)
            class_scores[cls] = score
        return max(class_scores, key=class_scores.get)

    def _gaussian_pdf(self, x, mean, std):
        """Density of the normal distribution N(mean, std**2) evaluated at ``x``."""
        return (1 / (std * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((x - mean) ** 2 / std ** 2))

    def _gaussian_log_pdf(self, x, mean, std):
        """Natural log of the normal density N(mean, std**2) at ``x``."""
        return -np.log(std * np.sqrt(2 * np.pi)) - 0.5 * ((x - mean) / std) ** 2

In [165]:
# 4. Load the dataset into a DataFrame
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the CSV exists at exactly this location.
CROP_DATA_CSV = "C:/Users/murar/Downloads/Crop_recommendation.csv"
df = pd.read_csv(filepath_or_buffer=CROP_DATA_CSV)

In [167]:
# Pull out the target column (y), then turn every remaining column into the
# NumPy feature matrix (X)
y = df["label"]
X = df.drop("label", axis="columns").to_numpy()

In [169]:
# Learn the label -> integer mapping, then apply it to the target column.
# The fitted encoder is kept so predictions can be decoded back to labels.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [171]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split identical on every run
split_arrays = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
xtrain, xtest, ytrain, ytest = split_arrays

In [173]:
# 5. Decision tree: fit on the training split, score on the held-out split
dt_model = CustomDecisionTree(max_depth=10)
dt_model.fit(xtrain, ytrain)
dt_predictions = dt_model.predict(xtest)

# Per-class precision / recall / F1
print("Decision Tree Classification Report:")
dt_report = classification_report(ytest, dt_predictions)
print(dt_report)

# Overall fraction of correct predictions
dt_accuracy = accuracy_score(ytest, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      0.96      0.98        26
           4       1.00      0.96      0.98        27
           5       1.00      1.00      1.00        17
           6       1.00      1.00      1.00        17
           7       1.00      1.00      1.00        14
           8       0.85      0.96      0.90        23
           9       1.00      0.95      0.97        20
          10       0.92      1.00      0.96        11
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        19
          13       0.96      0.96      0.96        24
          14       1.00      1.00      1.00        19
          15       1.00      1.00      1.00        17
          16       1.00      1.00      1.00 

In [175]:
# 6. Train and Evaluate Random Forest Model
# The forest draws its bootstrap samples from NumPy's global random state, so
# seed it first; otherwise the metrics below change on every run.
np.random.seed(42)
rf_model = CustomRandomForest(n_estimators=10, max_depth=10)
rf_model.fit(xtrain, ytrain)
rf_predictions = rf_model.predict(xtest)
print("Random Forest Classification Report:")
print(classification_report(ytest, rf_predictions))
print("Random Forest Accuracy:", accuracy_score(ytest, rf_predictions))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       1.00      0.90      0.95        20
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        27
           5       0.94      1.00      0.97        17
           6       1.00      1.00      1.00        17
           7       1.00      1.00      1.00        14
           8       0.90      0.83      0.86        23
           9       1.00      0.95      0.97        20
          10       0.92      1.00      0.96        11
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        19
          13       0.88      0.96      0.92        24
          14       1.00      1.00      1.00        19
          15       1.00      1.00      1.00        17
          16       1.00      1.00      1.00 

In [176]:
# 7. Naive Bayes: fit on the training split, score on the held-out split
nb_model = CustomNaiveBayes()
nb_model.fit(xtrain, ytrain)
nb_predictions = nb_model.predict(xtest)

# Per-class precision / recall / F1
print("Naive Bayes Classification Report:")
nb_report = classification_report(ytest, nb_predictions)
print(nb_report)

# Overall fraction of correct predictions
nb_accuracy = accuracy_score(ytest, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        27
           5       1.00      1.00      1.00        17
           6       1.00      1.00      1.00        17
           7       1.00      1.00      1.00        14
           8       0.92      1.00      0.96        23
           9       1.00      1.00      1.00        20
          10       1.00      1.00      1.00        11
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        19
          13       1.00      1.00      1.00        24
          14       1.00      1.00      1.00        19
          15       1.00      1.00      1.00        17
          16       1.00      1.00      1.00   

In [177]:
# Sanity-check the Naive Bayes model on one hand-written sample.
# The values must follow the same column order as the training features
# (presumably the CSV's non-label columns - verify against df.columns).
data = np.array([[104, 18, 30, 23.603016, 60.3, 6.7, 140.91]])
# Make predictions using the Naive Bayes model
prediction = nb_model.predict(data)
# predict() already returns a 1-D array of encoded labels, which is the shape
# inverse_transform expects; wrapping it in another list made it a 2-D column.
predicted_class = label_encoder.inverse_transform(prediction)  # Convert numeric prediction back to original class
print("Predicted Class for the given data:", predicted_class)

Predicted Class for the given data: ['coffee']
