In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

In [60]:
def model_train(X_train,y_train,X_test,y_test,random_state=42):
    """Fit four classifiers and print their accuracy and precision on the test split.

    The models are Gaussian Naive Bayes, a 10-tree Random Forest, a linear-kernel
    SVM and Logistic Regression. SVM and Logistic Regression are trained on
    standardised features (the scaler is fitted on the training split only);
    Naive Bayes and Random Forest are trained on the features as given.

    Parameters
    ----------
    X_train, X_test : feature tables for the training and test splits.
    y_train, y_test : matching label vectors. Precision is computed with
        scikit-learn's default settings, so labels are expected to be binary
        with 1 as the positive class.
    random_state : seed passed to the Random Forest so repeated runs print the
        same numbers.

    Returns
    -------
    None. One line per model is printed, followed by the name and accuracy of
    the most accurate model (the first one listed wins a tie).
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(n_estimators=10, random_state=random_state),
        'SVM': SVC(kernel='linear'),
        'Logistic Regression': LogisticRegression(),
    }
    # Models that are fitted and evaluated on the standardised features.
    uses_scaled_features = {'SVM', 'Logistic Regression'}

    results = {}
    for name, model in models.items():
        if name in uses_scaled_features:
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0 keeps the score at 0.0 (without a warning) when a
        # model predicts no positive samples at all.
        precision = precision_score(y_test, y_pred, zero_division=0)
        results[name] = {'Accuracy': accuracy, 'Precision': precision}

    for model_name, metrics in results.items():
        print(f"{model_name} - Accuracy: {metrics['Accuracy']:.4f}, Precision: {metrics['Precision']:.4f}")

    # Pick the winner from the collected results rather than relying on a
    # sentinel and a leftover loop variable; max() returns the first maximum.
    best_name = max(results, key=lambda key: results[key]['Accuracy'])
    print("Best model=",best_name)
    print("Best accuracy=",results[best_name]['Accuracy'])
    


In [61]:
from scipy.io import arff

# Rice (Cammeo vs. Osmancik): loadarff returns (records, metadata); only the
# records are needed to build the DataFrame.
rice_records, _rice_meta = arff.loadarff("Rice_Cammeo_Osmancik.arff")
df = pd.DataFrame(rice_records)

# Decode the byte-string class labels to text.
df['Class'] = df['Class'].str.decode('utf-8')

rice_feature_columns = [
    'Area',
    'Perimeter',
    'Major_Axis_Length',
    'Minor_Axis_Length',
    'Eccentricity',
    'Convex_Area',
    'Extent',
]
X = df[rice_feature_columns]
# Encoding the target variable: 'Osmancik' -> 1, every other label -> 0.
y = df['Class'].eq('Osmancik').astype(int)

# 70/30 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

model_train(X_train, y_train, X_test, y_test)


Naive Bayes - Accuracy: 0.9151, Precision: 0.9112
Random Forest - Accuracy: 0.9134, Precision: 0.9161
SVM - Accuracy: 0.9283, Precision: 0.9236
Logistic Regression - Accuracy: 0.9300, Precision: 0.9238
Best model= Logistic Regression
Best accuracy= 0.9300087489063867


In [62]:
# Adult income data: the file is read without a header row, so the column
# names are supplied here.
df = pd.read_csv("adult.data", header=None, delimiter=',',
                 names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                        "occupation", "relationship", "race", "sex", "capital-gain",
                        "capital-loss", "hours-per-week", "native-country", "income"])
print(df.income.unique())

# Missing entries appear as ' ?' (values keep the space that follows each
# comma). Impute them with the most frequent *known* value of the column.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is chained assignment, which pandas warns about and
# which no longer modifies df under Copy-on-Write.
for column in ['workclass', 'occupation', 'native-country']:
    known_modes = df.loc[df[column] != ' ?', column].mode()
    if not known_modes.empty:
        df[column] = df[column].replace(' ?', known_modes.iloc[0])

# One-hot encode the categorical columns; drop_first removes one level per
# column to avoid a redundant indicator.
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'native-country']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

X = df.drop('income', axis=1)
# Target: 1 for ' >50K', 0 otherwise (labels carry a leading space).
y = df['income'].apply(lambda x: 1 if x == ' >50K' else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print(y.unique())

model_train(X_train,y_train,X_test,y_test)

[' <=50K' ' >50K']
[0 1]
Naive Bayes - Accuracy: 0.7992, Precision: 0.6600
Random Forest - Accuracy: 0.8468, Precision: 0.7176
SVM - Accuracy: 0.8482, Precision: 0.7341
Logistic Regression - Accuracy: 0.8519, Precision: 0.7297
Best model= Logistic Regression
Best accuracy= 0.8518783908281298


In [63]:

# Echocardiogram data: no header row in the file, so column names are given
# explicitly; malformed lines are skipped instead of aborting the read.
df = pd.read_csv("echocardiogram.data", header=None, delimiter=',',
                 names=['survival', 'still-alive', 'age-at-heart-attack', 'pericardial-effusion', 'fractional-shortening',
                        'epss', 'lvdd', 'wall-motion-score', 'wall-motion-index', 'mult', 'name', 'group', 'alive-at-1'],
                 on_bad_lines='skip')

# '?' marks a missing value in this file.
df = df.replace('?', np.nan)

# Coerce every numeric column, including 'pericardial-effusion', which is
# used as a feature below and would otherwise stay as strings whenever the
# raw column contained a '?'. Unparseable entries become NaN.
numeric_columns = ['survival', 'still-alive', 'age-at-heart-attack', 'pericardial-effusion',
                   'fractional-shortening', 'epss', 'lvdd', 'wall-motion-score',
                   'wall-motion-index', 'mult', 'alive-at-1']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

df = df.drop(columns=['group'])

# Keep only fully populated rows. This already covers rows with a missing
# target, so no separate dropna on 'alive-at-1' is needed.
df = df.dropna()

# Turn the text 'name' column into integer codes so it can be fed to the models.
label_encoder = LabelEncoder()
df['name'] = label_encoder.fit_transform(df['name'])

features = ['age-at-heart-attack', 'pericardial-effusion', 'fractional-shortening', 'epss',
            'lvdd', 'wall-motion-score', 'wall-motion-index', 'mult', 'name']
target = 'alive-at-1'

X = df[features]
y = df[target].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
model_train(X_train,y_train,X_test,y_test)

Naive Bayes - Accuracy: 0.7895, Precision: 0.7500
Random Forest - Accuracy: 0.7368, Precision: 0.6667
SVM - Accuracy: 0.7895, Precision: 0.7500
Logistic Regression - Accuracy: 0.6842, Precision: 0.5000
Best model= Naive Bayes
Best accuracy= 0.7894736842105263
