In [1]:
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import pandas as pd
import numpy as np

data = pd.read_excel('dataset.xlsx')

# Extract the relevant columns (E to J)
X = data[['Header_and_Main_Declaration', 'Incomprehensible_Code',
          'Comprehensible_Code_with_logical_errors',
          'Comprehensible_code_with_syntax_errors',
          'Correct_code_and_output']]

y = data['Final_Marks']

# Handle any missing values if they exist
X.fillna(0, inplace=True)

# Splitting the data into training and testing sets
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.3, random_state=42)

# Defining hyperparameters
perceptron = Perceptron()
paramDistPerceptron = {
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': np.logspace(-5, 1, 10),
    'max_iter': [500, 1000, 1500, 2000],
    'tol': [1e-3, 1e-4, 1e-5],
}

# Perform RandomizedSearchCV on Perceptron
randomSearchPerceptron = RandomizedSearchCV(perceptron, param_distributions=paramDistPerceptron,
                                            n_iter=10, cv=5, random_state=42, n_jobs=-1)
randomSearchPerceptron.fit(xTrain, yTrain)

# Best hyperparameters for Perceptron
print("Best Perceptron Hyperparameters:", randomSearchPerceptron.best_params_)

# Define MLPClassifier model and hyperparameters
mlp = MLPClassifier()
paramDistMlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': np.logspace(-5, 1, 10),
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 300, 500]
}

# Perform RandomizedSearchCV on MLPClassifier
randomSearchMlp = RandomizedSearchCV(mlp, param_distributions=paramDistMlp,
                                     n_iter=10, cv=5, random_state=42, n_jobs=-1)
randomSearchMlp.fit(xTrain, yTrain)

# Best hyperparameters for MLP
print("Best MLP Hyperparameters:", randomSearchMlp.best_params_)

# Evaluate models on test set
from sklearn.metrics import accuracy_score

yPredPerceptron = randomSearchPerceptron.predict(xTest)
yPredMlp = randomSearchMlp.predict(xTest)

# Print accuracy
print("Perceptron Accuracy:", accuracy_score(yTest, yPredPerceptron))
print("MLP Accuracy:", accuracy_score(yTest, yPredMlp))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


Best Perceptron Hyperparameters: {'tol': 1e-05, 'penalty': 'l2', 'max_iter': 1500, 'alpha': 4.641588833612782e-05}
Best MLP Hyperparameters: {'solver': 'adam', 'max_iter': 500, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (150, 100, 50), 'alpha': 0.1, 'activation': 'tanh'}
Perceptron Accuracy: 0.3342776203966006
MLP Accuracy: 1.0




In [3]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


classifiers = {
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'Naive Bayes': GaussianNB()
}
# Initialize a DataFrame to store results
results = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1Score'])

# Loop over classifiers, fit and predict, and store results
for classifierName, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(xTrain, yTrain)

    # Predict on the test set
    yPred = classifier.predict(xTest)

    # Calculate performance metrics
    accuracy = accuracy_score(yTest, yPred)
    precision = precision_score(yTest, yPred, average='weighted', zero_division=0)
    recall = recall_score(yTest, yPred, average='weighted', zero_division=0)
    f1 = f1_score(yTest, yPred, average='weighted', zero_division=0)

    # Create a temporary DataFrame for this result
    tempResult = pd.DataFrame({
        'Classifier': [classifierName],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1Score': [f1]
    })

    # Append the result to the results DataFrame using pd.concat
    results = pd.concat([results, tempResult], ignore_index=True)

# Display the results
results

  results = pd.concat([results, tempResult], ignore_index=True)
Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1Score
0,Support Vector Machine,0.968839,0.964508,0.968839,0.966334
1,Decision Tree,0.929178,0.929461,0.929178,0.92906
2,Random Forest,0.929178,0.930304,0.929178,0.929217
3,AdaBoost,0.29745,0.289378,0.29745,0.230668
4,XGBoost,0.929178,0.929816,0.929178,0.929155
5,Naive Bayes,0.470255,0.410575,0.470255,0.429052


In [4]:
from catboost import CatBoostClassifier


ModuleNotFoundError: No module named 'catboost'

In [5]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [6]:
from catboost import CatBoostClassifier
