# Classfication


In [92]:
# Task 1: Classification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')




In [93]:
# Step 1: Loading Data, Data Pre-processing, EDA
data = pd.read_csv("Dataset2/healthcare-dataset-stroke-data.csv")

data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [94]:
# Handle missing values
data['bmi'].fillna(data['bmi'].mean(), inplace=True)



In [95]:
# Step 2: Feature Engineering, Creating Train, and Test Datasets
# Encoding categorical variables
label_encoder = LabelEncoder()
data['gender'] = label_encoder.fit_transform(data['gender'])
data['ever_married'] = label_encoder.fit_transform(data['ever_married'])
data['work_type'] = label_encoder.fit_transform(data['work_type'])
data['Residence_type'] = label_encoder.fit_transform(data['Residence_type'])
data['smoking_status'] = label_encoder.fit_transform(data['smoking_status'])

# Splitting the data into features and target variable
X = data.drop(['id', 'stroke'], axis=1)
y = data['stroke']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [96]:
# Step 3: Apply at least 2 algorithms for classification (Training and Testing)
# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
rf_pred = rf_classifier.predict(X_test)



In [97]:
# Logistic Regression
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train, y_train)
lr_pred = lr_classifier.predict(X_test)



In [98]:
# Step 4: Generate at least 2 Evaluation Metrics on each algorithm.
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

rf_metrics = evaluate_model(y_test, rf_pred)
lr_metrics = evaluate_model(y_test, lr_pred)
print("Random Forest Accuracy: ",rf_metrics)
print("Logistis Regression Accuracy: ",lr_metrics)

Random Forest Accuracy:  0.9383561643835616
Logistis Regression Accuracy:  0.9403131115459883


In [99]:
# Step 5: Comparing the results.
print("Random Forest Metrics: Accuracy={}",rf_metrics)
print("Logistic Regression Metrics: Accuracy={}",lr_metrics)

# Step 6: Fine Tune the best algorithm (if needed).
# For fine-tuning, you can use techniques like GridSearchCV or RandomizedSearchCV.

Random Forest Metrics: Accuracy={} 0.9383561643835616
Logistic Regression Metrics: Accuracy={} 0.9403131115459883


In [100]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_lr_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_lr_pred = best_lr_model.predict(X_test)
best_lr_accuracy = accuracy_score(y_test, best_lr_pred)

print("Accuracy of Fine-Tuned Logistic Regression Model:", best_lr_accuracy)


Best Parameters: {'C': 100, 'penalty': 'l2'}
Accuracy of Fine-Tuned Logistic Regression Model: 0.9393346379647749


In [101]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define the parameter distributions to sample from
param_distributions = {
    'C': uniform(0.01, 100),
    'penalty': ['l1', 'l2']
}

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(LogisticRegression(random_state=42), param_distributions, n_iter=100, cv=5, scoring='accuracy', random_state=42)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_lr_model = random_search.best_estimator_

# Evaluate the best model on the test set
best_lr_pred = best_lr_model.predict(X_test)
best_lr_accuracy = accuracy_score(y_test, best_lr_pred)

print("Accuracy of Fine-Tuned Logistic Regression Model:", best_lr_accuracy)


Best Parameters: {'C': 50.27790232288615, 'penalty': 'l2'}
Accuracy of Fine-Tuned Logistic Regression Model: 0.9383561643835616
