In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Read the dataset from data.csv (a relative path) into a DataFrame.
df=pd.read_csv('data.csv')

In [5]:
# Independent and dependent features
# Name of the column to predict; every other column is used as a feature.
TARGET_COLUMN = 'writing score'

# Fail early with an actionable message: a bare KeyError from drop() does not
# show which columns the loaded file actually contains.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in the dataset; "
        f"available columns: {list(df.columns)}"
    )

X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

KeyError: "['writing score'] not found in axis"

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: object columns are handled as
# categorical, everything else as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# One transformer per column group.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded and numeric columns standardized;
# the one-hot transformer is listed first, followed by the scaler.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [None]:
# Fit the ColumnTransformer on the feature frame and replace X with its output.
# NOTE(review): X is rebound here, so re-running this cell without first
# re-running the cell that builds X feeds the already-transformed data back in.
# NOTE(review): the fit happens before the train/test split further down, so
# test rows contribute to the fitted statistics — consider splitting first.
X = preprocessor.fit_transform(X)

_IncompleteInputError: incomplete input (748411014.py, line 1)

In [None]:
# Display the dimensions of the transformed feature matrix.
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Show the resulting shapes of both feature splits.
(X_train.shape, X_test.shape)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
# NOTE(review): X has already been passed through `preprocessor`, which
# includes a StandardScaler for the numeric columns, so this is a second
# scaling pass that also rescales the one-hot columns — confirm it is intended.
from sklearn.preprocessing import StandardScaler
processor=StandardScaler()
X_train_scaled=processor.fit_transform(X_train)
X_test_scaled=processor.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [None]:
# Candidate classifiers, each constructed with its library-default settings.
models = {
    "LogisticRegression": LogisticRegression(),
    "SVM": SVC(),
    "RandomForest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "GradientBoost": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "KNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
}

# Model name -> [train score, test score]; formatted for the plotting cells.
accuracy_dict = {}
recall_dict = {}


def _score_predictions(y_true, y_pred):
    """Compute the reported metrics for one set of predictions.

    Returns a dict keyed by the label used in the printed report, in print
    order: Accuracy, F1 score, Precision, Recall, Roc auc score.
    """
    # NOTE(review): F1 uses average="weighted" while precision and recall use
    # the default averaging — confirm the mix is intended.
    # NOTE(review): ROC AUC is computed from the hard labels returned by
    # predict(), not from probabilities or decision scores — confirm intended.
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1 score": f1_score(y_true, y_pred, average="weighted"),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "Roc auc score": roc_auc_score(y_true, y_pred),
    }


def _print_scores(heading, scores):
    """Print a heading followed by one '-<label>: <value>' line per metric."""
    print(heading)
    for label, value in scores.items():
        print("-{}: {:.4f}".format(label, value))


for model_name, model in models.items():
    # Training
    model.fit(X_train_scaled, y_train)

    # Making Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Score both splits before anything is printed.
    train_scores = _score_predictions(y_train, y_train_pred)
    test_scores = _score_predictions(y_test, y_test_pred)

    # Format data for future plots:
    accuracy_dict[model_name] = [train_scores["Accuracy"], test_scores["Accuracy"]]
    recall_dict[model_name] = [train_scores["Recall"], test_scores["Recall"]]

    print(model_name)
    _print_scores('Model performance for Training set', train_scores)

    print("."*40)

    _print_scores('Model performance for Test set', test_scores)

    print("="*40)
    print('\n')

In [None]:
# Plot Model Performance

# Related with Accuracy
# Each accuracy_dict value is indexed as [0] = train accuracy, [1] = test accuracy.
train_list = [scores[0] for scores in accuracy_dict.values()]
test_list = [scores[1] for scores in accuracy_dict.values()]

# One bar chart per split; the two figures differ only in data and y-axis label.
for split_scores, y_label in (
    (train_list, "Train Accuracy Score"),
    (test_list, "Test Accuracy Score"),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(accuracy_dict.keys()), y=split_scores)
    plt.xticks(rotation=45)
    plt.ylabel(y_label)
    plt.title("Model Performance Comparison")
    plt.show()




In [None]:
# Hyperparameter search space for the random forest.
params = {
    'max_depth': [None, 1, 2, 3, 4, 5, 6, 10, 15, 20],
    # 'sqrt' replaces 'auto': scikit-learn removed 'auto' for random forests in
    # 1.3, and for classifiers it meant the same thing as 'sqrt'.
    # NOTE(review): the integer options assume the feature matrix has at least
    # 8 columns — confirm against X.shape.
    'max_features': ['sqrt', 5, 6, 7, 8],
    'min_samples_split': [2, 8, 15, 20],
    # The original list contained 100 twice, which only skewed the sampling.
    # NOTE(review): the second 100 may have been meant as 1000 — confirm.
    'n_estimators': [100, 200, 500],
}

In [None]:
# Models list for Hyperparameter tuning
# Each entry is a (display name, unfitted estimator, parameter search space) tuple.
randomcv_models= [
    ('Random Forest', RandomForestClassifier(), params)
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Best hyperparameters found for each tuned model, keyed by display name.
model_param = {}

# The loop variables are deliberately not called `params`/`model`, so the
# module-level search space and any previously fitted model are not rebound.
for model_name, estimator, param_distributions in randomcv_models:
    # NOTE(review): the name `random` is kept in case later cells read it
    # (e.g. random.best_estimator_); it would shadow the stdlib module if that
    # were ever imported.
    random = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=100,
        cv=4,
        verbose=2,
        n_jobs=-1,
        # Seed the candidate sampling so the same configurations are tried on
        # every run (same seed as the train/test split).
        random_state=42,
    )

    random.fit(X_train_scaled, y_train)
    model_param[model_name] = random.best_params_

for model_name, best_params in model_param.items():
    print(f"Best parameters for {model_name} are:")
    print(best_params)

In [None]:
# Re-evaluate the random forest with fixed, hand-entered hyperparameters.
# NOTE(review): these values are presumably copied from an earlier
# RandomizedSearchCV run — confirm they still match model_param.
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=100, min_samples_split=2, max_features=7, max_depth=20
    ),
}

for model_name, model in models.items():
    # Training
    model.fit(X_train_scaled, y_train)

    # Making Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Score both splits (training first, then test) before anything is printed.
    # NOTE(review): ROC AUC is computed from the hard labels returned by
    # predict(), and F1 is weighted while precision/recall use the default
    # averaging — confirm both are intended.
    split_scores = {}
    for split_name, y_true, y_pred in (
        ("Training", y_train, y_train_pred),
        ("Test", y_test, y_test_pred),
    ):
        split_scores[split_name] = {
            "Accuracy": accuracy_score(y_true, y_pred),
            "F1 score": f1_score(y_true, y_pred, average="weighted"),
            "Precision": precision_score(y_true, y_pred),
            "Recall": recall_score(y_true, y_pred),
            "Roc auc score": roc_auc_score(y_true, y_pred),
        }

    print(model_name)
    # The training report is closed with a dotted rule, the test report with '='.
    for split_name, rule_char in (("Training", "."), ("Test", "=")):
        print('Model performance for {} set'.format(split_name))
        for metric_label, value in split_scores[split_name].items():
            print("-{}: {:.4f}".format(metric_label, value))
        print(rule_char*40)
    print('\n')