In [None]:
#TASK 1:
#Load the heart disease dataset
# ● Perform necessary EDA and Data Wrangling and Implement Logistic Regression
# ● First train your model using penalty as l1 regularization
# ● Train your model with l2 regularization
# ● Train your model with penalty = elastic net
# ● Now compare the results of training and testing accuracy
# ● Discuss in a text cell, what error you have gone through while implementing this penalty,
# Have you changed other parameter to apply these Penalties??
# ● What additional parameter you have changed while implementing the mentioned penalty, what is the relationship between these?

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Location of the heart-disease CSV on the author's machine.
heart_csv_path = r'C:\Users\i\Downloads\heart.csv'

# Load the raw table; the bare last expression lets the notebook render
# the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer=heart_csv_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [25]:
# --- Quick EDA on the heart-disease frame --------------------------------
print("Missing values in each column:")
print(df.isnull().sum())

print("\nData types:")
# DataFrame.info() writes its report itself and returns None, so it is
# called bare; wrapping it in print() appends a stray "None" line.
df.info()

print("\nDataset description:")
print(df.describe())

print("\nTarget variable distribution:")
print(df['target'].value_counts())

# --- Features / label -----------------------------------------------------
X = df.drop(columns=['target'])
y = df['target']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage) and makes the test accuracy slightly optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any cell that still refers to the
# fully scaled matrix; it now uses the training-set statistics.
X_scaled = scaler.transform(X)

Missing values in each column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Data types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(

In [28]:
penalties = ['l1', 'l2', 'elasticnet']
results = []

for penalty in penalties:
    print(f"Training with penalty: {penalty}")

    # Elastic net is fitted with saga; l1 and l2 go through liblinear.
    use_elasticnet = penalty == 'elasticnet'
    solver = 'saga' if use_elasticnet else 'liblinear'
    # The L1/L2 mixing weight is only supplied for elastic net.
    l1_ratio = 0.5 if use_elasticnet else None

    model = LogisticRegression(
        penalty=penalty,
        solver=solver,
        l1_ratio=l1_ratio,
        random_state=42,
        max_iter=1000,
    )

    try:
        model.fit(X_train, y_train)

        # Score the fitted model on both splits.
        train_accuracy = accuracy_score(y_train, model.predict(X_train))
        test_accuracy = accuracy_score(y_test, model.predict(X_test))
        print(f"Training Accuracy: {train_accuracy}")
        print(f"Testing Accuracy: {test_accuracy}")

        # One record per penalty; turned into a table in a later cell.
        record = {
            'Penalty': penalty,
            'Train Accuracy': train_accuracy,
            'Test Accuracy': test_accuracy,
        }
        results.append(record)

    except Exception as e:
        # Best effort: report the failure and move on to the next penalty.
        print(f"Error with penalty {penalty}: {e}")

Training with penalty: l1
Training Accuracy: 0.8463414634146341
Testing Accuracy: 0.8097560975609757
Training with penalty: l2
Training Accuracy: 0.8463414634146341
Testing Accuracy: 0.8097560975609757
Training with penalty: elasticnet
Training Accuracy: 0.8463414634146341
Testing Accuracy: 0.8097560975609757


In [31]:
# Collect the per-penalty records into a single table.
results_df = pd.DataFrame(data=results)

# Heading and table in one call; sep="\n" reproduces the two-print layout.
print("\nComparison of penalties:", results_df, sep="\n")

# Persist the comparison without the row index.
results_df.to_csv(path_or_buf='logistic_regression_results.csv', index=False)


Comparison of penalties:
      Penalty  Train Accuracy  Test Accuracy
0          l1        0.846341       0.809756
1          l2        0.846341       0.809756
2  elasticnet        0.846341       0.809756


In [None]:
# Errors Encountered:
# Using elasticnet requires the solver to be saga; any other solver raises an error. l1_ratio is only relevant for elasticnet: it must be set when penalty='elasticnet' (omitting it raises an error), whereas for l1 and l2 it can be left as None because it is not used.

# Parameter Changes:
# When using elasticnet, the additional parameter l1_ratio defines the proportion of L1 and L2 regularization. Adjusting this impacts the model's performance. Solvers like saga support all penalties, while liblinear only supports l1 and l2.

# Relationship Between Parameters:
# Penalty and Solver: Specific solvers are compatible with specific penalties. Penalty and Regularization Strength: The penalty type impacts how coefficients are penalized, which affects model accuracy.

In [None]:
# TASK 2:
# Implement Logistic regression on the Iris dataset, now choose different solvers 'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'. Compare the results of Training and testing accuracy in a Dataframe

# • Briefly discuss the effect of solver on your dataset ,
# • Have you found the similarity as mentioned by Sklearn that which solver is best for small, medium or larger dataset.
# • Which solver is best in your case and why?
# • Now copy this file and apply a new Dataset (Heart Disease) and compare: is it really affected by the size of the dataset?

In [32]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the bundled Iris data and wrap the measurements in a labelled frame.
iris = load_iris()
X = pd.DataFrame(data=iris["data"], columns=iris["feature_names"])
y = iris["target"]

# Preview the first rows of the feature table.
print(X.head(n=5))

# Names of the classes encoded in y.
print(f"Target values: {iris['target_names']}")

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
Target values: ['setosa' 'versicolor' 'virginica']


In [34]:
# Split BEFORE scaling. Fitting the scaler on all rows would let the test
# rows influence the mean/std applied to the training rows (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any cell that still refers to the
# fully scaled matrix; it now uses the training-set statistics.
X_scaled = scaler.transform(X)

In [37]:
# Fit one LogisticRegression per solver on the Iris split and record the
# train/test accuracy of each so they can be tabulated in the next cell.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
results = []

for solver in solvers:
    print(f"Training with solver: {solver}")
    try:
        # multi_class is intentionally not passed: 'auto' is already the
        # default, and the parameter is deprecated in recent scikit-learn
        # releases. Once it is removed, passing it would raise inside this
        # try block and every solver would be reported as failed.
        model = LogisticRegression(solver=solver, max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        print(f"Training Accuracy: {train_accuracy}")
        print(f"Testing Accuracy: {test_accuracy}")

        results.append({'Solver': solver, 'Train Accuracy': train_accuracy, 'Test Accuracy': test_accuracy})

    except Exception as e:
        # Best effort: a solver that is unavailable in the installed
        # scikit-learn version is reported and skipped so the remaining
        # solvers are still compared.
        print(f"Error with solver {solver}: {e}")

Training with solver: lbfgs
Training Accuracy: 0.9583333333333334
Testing Accuracy: 0.9333333333333333
Training with solver: liblinear
Training Accuracy: 0.925
Testing Accuracy: 0.8333333333333334
Training with solver: newton-cg
Training Accuracy: 0.9583333333333334
Testing Accuracy: 0.9333333333333333
Training with solver: newton-cholesky
Training Accuracy: 0.95
Testing Accuracy: 0.9
Training with solver: sag
Training Accuracy: 0.9583333333333334
Testing Accuracy: 0.9333333333333333
Training with solver: saga
Training Accuracy: 0.9583333333333334
Testing Accuracy: 0.9333333333333333


In [38]:
# Collect the per-solver records into a single table.
results_df = pd.DataFrame(data=results)

# Heading and table in one call; sep="\n" reproduces the two-print layout.
print("\nComparison of solvers:", results_df, sep="\n")

# Persist the comparison without the row index.
results_df.to_csv(path_or_buf='solver_comparison_results.csv', index=False)


Comparison of solvers:
            Solver  Train Accuracy  Test Accuracy
0            lbfgs        0.958333       0.933333
1        liblinear        0.925000       0.833333
2        newton-cg        0.958333       0.933333
3  newton-cholesky        0.950000       0.900000
4              sag        0.958333       0.933333
5             saga        0.958333       0.933333


In [51]:
# Repeat the solver comparison on the Heart Disease data and stack the
# result under the Iris comparison.
df_hd = pd.read_csv(r'C:\Users\i\Downloads\heart.csv')

# Build the heart-disease split inside this cell. Previously X_hd_train,
# X_hd_test, y_hd_train and y_hd_test only existed as leftover kernel state,
# so on a fresh kernel every fit raised NameError (silently recorded by the
# except below) and df_hd was never actually used.
X_hd = df_hd.drop(columns=['target'])
y_hd = df_hd['target']
X_hd_train_raw, X_hd_test_raw, y_hd_train, y_hd_test = train_test_split(
    X_hd, y_hd, test_size=0.2, random_state=42, stratify=y_hd
)

# Scale with statistics learned from the training rows only, so the held-out
# rows do not leak into preprocessing.
scaler_hd = StandardScaler()
X_hd_train = scaler_hd.fit_transform(X_hd_train_raw)
X_hd_test = scaler_hd.transform(X_hd_test_raw)

solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
results_heart_disease = []

for solver in solvers:
    try:
        # multi_class is not passed: 'auto' is already the default and the
        # parameter is deprecated in recent scikit-learn releases.
        model_hd = LogisticRegression(solver=solver, max_iter=1000, random_state=42)
        model_hd.fit(X_hd_train, y_hd_train)

        y_hd_train_pred = model_hd.predict(X_hd_train)
        y_hd_test_pred = model_hd.predict(X_hd_test)

        train_acc = accuracy_score(y_hd_train, y_hd_train_pred)
        test_acc = accuracy_score(y_hd_test, y_hd_test_pred)

        results_heart_disease.append({'Solver': solver, 'Train Accuracy': train_acc, 'Test Accuracy': test_acc})
    except Exception as e:
        # Best effort: keep a row for the failing solver, with the message
        # in an 'Error' column, so the table still lists every solver.
        results_heart_disease.append({'Solver': solver, 'Train Accuracy': None, 'Test Accuracy': None, 'Error': str(e)})

results_hd_df = pd.DataFrame(results_heart_disease)
print("\nHeart Disease Dataset Results:")
print(results_hd_df)

# results_iris_df was never defined in the notebook. At this point results_df
# holds the Iris solver comparison built by the "Comparison of solvers" cell;
# the assert guards against running this cell with a different results_df
# (e.g. the penalty table) in the kernel.
assert 'Solver' in results_df.columns, "Run the Iris solver-comparison cells before this cell."
results_iris_df = results_df.copy()

results_iris_df['Dataset'] = 'Iris'
results_hd_df['Dataset'] = 'Heart Disease'
combined_results = pd.concat([results_iris_df, results_hd_df], ignore_index=True)

print("\nCombined Results:")
print(combined_results)

combined_results.to_csv('solver_comparison_combined_results.csv', index=False)


Heart Disease Dataset Results:
            Solver  Train Accuracy  Test Accuracy
0            lbfgs        0.846341       0.809756
1        liblinear        0.846341       0.809756
2        newton-cg        0.846341       0.809756
3  newton-cholesky        0.846341       0.809756
4              sag        0.846341       0.809756
5             saga        0.846341       0.809756

Combined Results:
             Solver  Train Accuracy  Test Accuracy        Dataset
0             lbfgs        0.958333       0.933333           Iris
1         liblinear        0.925000       0.833333           Iris
2         newton-cg        0.958333       0.933333           Iris
3   newton-cholesky        0.950000       0.900000           Iris
4               sag        0.958333       0.933333           Iris
5              saga        0.958333       0.933333           Iris
6             lbfgs        0.846341       0.809756  Heart Disease
7         liblinear        0.846341       0.809756  Heart Disease
8    

In [None]:
# Accuracy Comparison Across Datasets:
# For the Iris dataset, the training and testing accuracies are relatively higher (e.g., around 95.8% and 93.3%, respectively) across most solvers. This is expected because the Iris dataset is smaller (150 samples) and well-structured.
# For the Heart Disease dataset, the training and testing accuracies are lower (around 84.6% and 81.0%) across all solvers. The dataset is larger and potentially noisier, making it harder to achieve higher accuracy.

#Effect of Solvers on Performance:
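# On the Heart Disease dataset all six solvers (lbfgs, liblinear, newton-cg, newton-cholesky, sag, saga) produce identical accuracies. On Iris, lbfgs, newton-cg, sag and saga agree (95.8% train / 93.3% test), but liblinear (92.5% / 83.3%) and newton-cholesky (95.0% / 90.0%) score lower. liblinear cannot fit a true multinomial model and falls back to one-vs-rest on the three-class Iris problem, which likely explains its gap; the newton-cholesky gap may have a similar cause (to be confirmed for the installed scikit-learn version). In practice, solvers like sag and saga are optimized for large datasets, while liblinear works well with smaller datasets. Both datasets are too small to stress-test sag and saga, so no speed or convergence differences show up here.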

# Training vs. Testing Accuracy:
# For both datasets, the training accuracy is consistently higher than testing accuracy. This suggests slight overfitting, particularly on the Heart Disease dataset.

# Effect of Dataset Size:
# The Iris dataset is smaller (150 samples), and most solvers reach high accuracy on it (about 95.8% train / 93.3% test).
# The Heart Disease dataset is larger, and its real-world nature introduces challenges like noise and variability, leading to lower accuracy.

In [None]:
# TASK 3:
# • Implement Perceptron on Iris Data and then compare the results of the Logistic Regression (Sklearn) model with the Perceptron model. Compare Training and Testing results of Perceptron and LR
# • What is the difference between Perceptron and LR

In [55]:
from sklearn.linear_model import Perceptron, LogisticRegression

# Reload Iris so this task does not depend on variables left by earlier cells.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# Split BEFORE scaling. Fitting the scaler on all rows would let the test
# rows influence the mean/std applied to the training rows (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any cell that still refers to the
# fully scaled matrix; it now uses the training-set statistics.
X_scaled = scaler.transform(X)

In [57]:
# Fit a Perceptron and a LogisticRegression on the same Iris split and
# record the train/test accuracy of each for the comparison table.
results = []

# --- Perceptron -----------------------------------------------------------
perceptron_model = Perceptron(max_iter=1000, random_state=42)
perceptron_model.fit(X_train, y_train)
y_train_pred_p = perceptron_model.predict(X_train)
y_test_pred_p = perceptron_model.predict(X_test)
train_acc_p = accuracy_score(y_train, y_train_pred_p)
test_acc_p = accuracy_score(y_test, y_test_pred_p)
results.append({'Model': 'Perceptron', 'Train Accuracy': train_acc_p, 'Test Accuracy': test_acc_p})

# --- Logistic Regression --------------------------------------------------
# multi_class is not passed: 'auto' is already the default and the parameter
# is deprecated in recent scikit-learn releases, so omitting it keeps the
# same model while avoiding the deprecation warning / future TypeError.
log_reg_model = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs')
log_reg_model.fit(X_train, y_train)
y_train_pred_lr = log_reg_model.predict(X_train)
y_test_pred_lr = log_reg_model.predict(X_test)
train_acc_lr = accuracy_score(y_train, y_train_pred_lr)
test_acc_lr = accuracy_score(y_test, y_test_pred_lr)
results.append({'Model': 'Logistic Regression', 'Train Accuracy': train_acc_lr, 'Test Accuracy': test_acc_lr})

In [59]:
# Collect the per-model records into a single table.
results_df = pd.DataFrame(data=results)
# Heading and table in one call; sep="\n" reproduces the two-print layout.
print("\nComparison of Perceptron and Logistic Regression Models:", results_df, sep="\n")

# Persist the comparison without the row index.
results_df.to_csv(path_or_buf='perceptron_vs_logistic_regression.csv', index=False)


Comparison of Perceptron and Logistic Regression Models:
                 Model  Train Accuracy  Test Accuracy
0           Perceptron        0.933333       0.866667
1  Logistic Regression        0.958333       0.933333


In [None]:
# Model Differences:
# Perceptron: Works as a linear classifier, where it tries to find a hyperplane that separates the classes. It uses a step function for predictions, and its training procedure may never converge if the data isn't linearly separable.
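# Logistic Regression: A probabilistic model that predicts the probability of belonging to a class. It uses a sigmoid function (softmax for multiclass problems such as Iris) and still converges to a well-defined solution when the classes overlap, i.e. when the data is not linearly separable.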

# Performance Comparison: Logistic Regression typically outperforms Perceptron on datasets like Iris due to its probabilistic approach and better handling of overlap between classes.

# Key Observations:
# If the Perceptron achieves lower accuracy, it could be due to the Iris dataset having some non-linearly separable regions.
# Logistic Regression is a more stable and widely applicable model for classification tasks.