In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV # For hyperparameter tuning 

# # Preprocessing data for a supervised machine learning task.
# # Load the dataset
# titanic_data = pd.read_csv('csv_files/bios.csv', sep=',')
# # get_dummies is a crucial pandas function that converts categorical variables (strings) into numeric indicator columns, a format ML algorithms can consume to make better predictions.
# titanic_data = pd.get_dummies(titanic_data, columns=['born_city'], drop_first=True) # drop_first=True to avoid the dummy variable trap
# titanic_data.head()

# One seed shared by data generation and the split, so every run sees the
# same train/test partition and the reported accuracies are comparable.
RANDOM_STATE = 42

# Synthetic classification dataset: 90,000 samples with 18 features, of which
# 4 are declared informative and 12 redundant.
x, y = make_classification(
    n_samples=90000,
    n_features=18,
    n_informative=4,
    n_redundant=12,
    random_state=RANDOM_STATE,
)

# Hold out 30% of the samples for testing. Without a seed the split (and thus
# every score computed from it) would change on each execution.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=RANDOM_STATE
)

# Per-depth accuracy accumulators and the depth range (1-20) used by the
# depth sweep that is currently commented out.
train_scores, test_scores = [], []
depths = range(1, 21)

# for depth in depths:
#     model = DecisionTreeClassifier(max_depth=depth)
#     model.fit(xtrain, ytrain)
#     train_pred = model.predict(xtrain)
#     test_pred = model.predict(xtest)
#     train_accuracy = accuracy_score(ytrain, train_pred)
#     test_accuracy = accuracy_score(ytest, test_pred)
#     train_scores.append(train_accuracy)
#     test_scores.append(test_accuracy)
#     # print(f"Depth: {depth}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")

# plt.plot(depths, train_scores, label='Train Accuracy', marker='o' )
# plt.plot(depths, test_scores, label='Test Accuracy', marker='o')
# plt.legend()

# Hyperparameter grid: every combination of split criterion, maximum tree
# depth (1-20) and minimum samples required to split a node (2-10) is
# evaluated in turn to find the best-scoring combination.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 21),
    'min_samples_split': range(2, 11),
}

# 5-fold cross-validated search scored by accuracy; n_jobs=-1 asks for the
# candidate fits to be run in parallel. The tree is seeded so that tie-breaking
# between equally good splits is deterministic and the selected
# hyperparameters do not drift between runs.
new_model = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
new_model.fit(xtrain, ytrain)

# Estimator built with the winning parameter combination
# (NOTE(review): relies on GridSearchCV's default refit behaviour - confirm).
best_model = new_model.best_estimator_
print("Best Hyperparameters:", new_model.best_params_)

# Compare accuracy on the data the search saw versus the held-out test set;
# a large gap between the two indicates overfitting.
print(f"For training data, best model accuracy: {accuracy_score(ytrain, best_model.predict(xtrain)):.4f}")
print(f"For test data, best model accuracy: {accuracy_score(ytest, best_model.predict(xtest)):.4f}")

In [None]:
# Linear regression - A supervised learning algorithm used for predicting continuous values.