## II. Programming

#### 1)

In [None]:
import pandas as pd
from scipy.io.arff import loadarff
from sklearn.feature_selection import f_classif

# Read the ARFF file; the first element of the result holds the records
# that are turned into the working DataFrame.
data = loadarff('column_diagnosis.arff')
records = data[0]
df = pd.DataFrame(records)

# Preview the first rows (shown as the cell output).
df.head()

In [None]:
# Separate the feature columns from the target column.
X = df.drop('class', axis=1)
y = df['class']
y = y.astype(str)

# f_classif returns a pair of arrays; element 0 is used below as the
# per-feature F-score.
fimportance = f_classif(X, y)

feature_importance_df = pd.DataFrame({'Feature': X.columns, 'F-Score': fimportance[0]})

# idxmax()/idxmin() return index *labels*, so the matching rows must be
# fetched with the label-based .loc. Using the position-based .iloc only
# works while the frame keeps its default 0..n-1 index.
index_of_highest_fscore = feature_importance_df['F-Score'].idxmax()
index_of_lowest_fscore = feature_importance_df['F-Score'].idxmin()

highest_feature = feature_importance_df.loc[index_of_highest_fscore]
lowest_feature = feature_importance_df.loc[index_of_lowest_fscore]

print("Feature with Highest F-Score:")
print(highest_feature)

print("\nFeature with Lowest F-Score:")
print(lowest_feature)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of degree_spondylolisthesis, one box per value of the class column.
sns.boxplot(data=df, x='class', y='degree_spondylolisthesis')
plt.show()

In [None]:
# Disabled step: would drop the row holding the largest
# degree_spondylolisthesis value before plotting.
# max_value_index = df['degree_spondylolisthesis'].idxmax()
# df = df.drop(max_value_index)

highest_feature_name = highest_feature['Feature']
lowest_feature_name = lowest_feature['Feature']

plt.figure(figsize=(12, 6))

class_labels = df['class'].unique()

# One panel per feature: (subplot position, feature column, rank word used
# in the title). Highest F-score on the left, lowest on the right.
panels = [
    (1, highest_feature_name, 'Highest'),
    (2, lowest_feature_name, 'Lowest'),
]

for position, feature_name, rank in panels:
    plt.subplot(1, 2, position)
    # Draw one density curve per class on the current panel.
    for class_label in class_labels:
        class_rows = df[df['class'] == class_label]
        # The class values are decoded to text for the legend entry.
        sns.kdeplot(data=class_rows, x=feature_name, label=class_label.decode('utf-8'))
    plt.title(f'Class-Conditional PDF for {feature_name} ({rank} F-Score)')
    plt.legend()

plt.tight_layout()
plt.show()

#### 2)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree, metrics
from adjustText import adjust_text  # Import adjust_text function


# Depth limits to evaluate. A list (rather than a set) makes the iteration
# order explicit, so models are fitted and labelled in this exact order.
depths = [1, 2, 3, 4, 5, 6, 8, 10]

train_accuracies = []
test_accuracies = []

# 70/30 train/test split with a fixed seed so every run uses the same partition.
# NOTE(review): the split is not stratified -- confirm whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)

for depth in depths:
    # Seed the tree as well: without random_state the fitted tree (and thus
    # the reported accuracies) can differ between runs on identical data.
    predictor = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    predictor.fit(X_train, y_train)

    # Training accuracy
    y_train_pred = predictor.predict(X_train)
    train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
    train_accuracies.append(train_accuracy)

    # Testing accuracy
    y_test_pred = predictor.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_accuracy)

# One labelled point per depth: x = training accuracy, y = testing accuracy.
plt.figure(figsize=(8, 6))
for depth, train_accuracy, test_accuracy in zip(depths, train_accuracies, test_accuracies):
    plt.scatter(train_accuracy, test_accuracy, label=f'Depth {depth}')
    plt.annotate(f'Depth {depth}', (train_accuracy, test_accuracy), textcoords='offset points', xytext=(5,5), ha='center')

plt.title('Train vs. Test Accuracies for Different Depths')
plt.xlabel('Training Accuracy')
plt.ylabel('Testing Accuracy')
plt.legend()
plt.grid(True)
plt.show()
