# Classification


In [5]:
# Step 1: Loading Data, Data Pre-processing, EDA
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training and testing splits from their CSV files
train_data = pd.read_csv('Dataset1/Training.csv')
test_data = pd.read_csv('Dataset1/Testing.csv')

# For each split, separate the 'prognosis' label column (target) from the
# remaining columns (features)
X_train, y_train = train_data.drop(columns='prognosis'), train_data['prognosis']
X_test, y_test = test_data.drop(columns='prognosis'), test_data['prognosis']

#Apply at least 2 algorithms for classification (Training and Testing)
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Algorithm 1: Example - Random Forest
from sklearn.ensemble import RandomForestClassifier

# Initialize the classifier.
# A fixed random_state makes the bootstrap sampling and feature selection
# repeatable, so the accuracy below is the same on every
# "Restart Kernel -> Run All" instead of drifting between runs.
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the training split
rf_classifier.fit(X_train, y_train)

# Predict labels for the test split
rf_predictions = rf_classifier.predict(X_test)

# Generate at least 2 Evaluation Metrics on each algorithm.
# Evaluation Metrics for Random Forest: fraction of test samples whose
# predicted label equals the true label
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Algorithm 2: Example - Logistic Regression
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression model with default settings and fit it on the
# training split (fit() returns the fitted estimator itself)
lr_classifier = LogisticRegression().fit(X_train, y_train)

# Predict labels for the test split
lr_predictions = lr_classifier.predict(X_test)

# Evaluation metric for Logistic Regression: accuracy on the test split
lr_accuracy = accuracy_score(y_test, lr_predictions)

# Compare the results: print one heading line and one accuracy line per model
for heading, score in (
    ("Random Forest Classifier Results:", rf_accuracy),
    ("\nLogistic Regression Results:", lr_accuracy),
):
    print(heading)
    print("Accuracy:", score)

# Fine-tune the best algorithm
# Random Forest is the model tuned here; check the accuracies printed above to
# confirm which algorithm actually performed best before relying on this choice.

# Example of hyperparameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of hyperparameters.
# 3 * 4 * 3 * 3 = 108 combinations, each scored with 5-fold cross-validation,
# so this cell is by far the most expensive one in the notebook.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the grid search.
# A fresh estimator with a fixed random_state is used so that every candidate
# is compared under the same seed and the selected parameters are reproducible.
# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)

# Perform grid search to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the full training data by
# default (refit=True), so the tuned model is already available. Reusing it
# avoids a redundant training run and guarantees that the model evaluated
# below is exactly the one the search selected.
best_rf_classifier = grid_search.best_estimator_

# Test the classifier with the best parameters
best_rf_predictions = best_rf_classifier.predict(X_test)

# Evaluate the classifier with the best parameters
best_rf_accuracy = accuracy_score(y_test, best_rf_predictions)

print("\nBest Random Forest Classifier Results after Fine Tuning:")
print("Accuracy:", best_rf_accuracy)



Random Forest Classifier Results:
Accuracy: 0.9761904761904762

Logistic Regression Results:
Accuracy: 1.0

Best Random Forest Classifier Results after Fine Tuning:
Accuracy: 0.9761904761904762


In [6]:
# Step 1: Loading Data, Data Pre-processing, EDA
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Read both CSV splits from disk
train_data = pd.read_csv('Dataset1/Training.csv')
test_data = pd.read_csv('Dataset1/Testing.csv')

# Training split: every column except 'prognosis' is a feature,
# and 'prognosis' itself is the target
y_train = train_data['prognosis']
X_train = train_data.drop(columns='prognosis')

# Testing split: same separation of target and features
y_test = test_data['prognosis']
X_test = test_data.drop(columns='prognosis')

#Apply at least 2 algorithms for classification (Training and Testing)
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Algorithm 1: Random Forest
from sklearn.ensemble import RandomForestClassifier

# Initialize the classifier.
# A fixed random_state keeps both the cross-validation scores and the final
# test accuracy identical across "Restart Kernel -> Run All" executions.
rf_classifier = RandomForestClassifier(random_state=42)

# Perform 5-fold cross-validation on the training data
# (cross_val_score fits clones, so rf_classifier itself stays unfitted here)
rf_cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)

# Display cross-validation scores
print("Random Forest Classifier Cross-Validation Scores:", rf_cv_scores)
print("Mean Accuracy:", np.mean(rf_cv_scores))

# Train the classifier on the entire training data
rf_classifier.fit(X_train, y_train)

# Predict labels for the test split
rf_predictions = rf_classifier.predict(X_test)

#Generate at least 2 Evaluation Metrics on each algorithm.
# Evaluation Metrics for Random Forest: accuracy on the test split
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Classifier Test Accuracy:", rf_accuracy)

# Algorithm 2: Logistic Regression
from sklearn.linear_model import LogisticRegression

# Create an unfitted logistic-regression model with default settings
lr_classifier = LogisticRegression()

# Score the model with 5-fold cross-validation on the training data and
# report the per-fold scores together with their mean
lr_cv_scores = cross_val_score(lr_classifier, X_train, y_train, cv=5)
print("\nLogistic Regression Cross-Validation Scores:", lr_cv_scores)
print("Mean Accuracy:", np.mean(lr_cv_scores))

# Fit on the entire training data, then predict labels for the test split
# (fit() returns the fitted estimator, so the two calls can be chained)
lr_predictions = lr_classifier.fit(X_train, y_train).predict(X_test)

# Evaluation metric for Logistic Regression: accuracy on the test split
lr_accuracy = accuracy_score(y_test, lr_predictions)
print("Logistic Regression Test Accuracy:", lr_accuracy)


Random Forest Classifier Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Accuracy: 1.0
Random Forest Classifier Test Accuracy: 0.9761904761904762

Logistic Regression Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Accuracy: 1.0
Logistic Regression Test Accuracy: 1.0
