# Project 4

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
import numpy as np

## Part 1

### Cleaning the data

In [None]:
# Load the raw data and give the target column ('num') a readable name.
df = pd.read_csv('cleveland.csv').rename(columns={'num': 'disease'})

# Cap the target at 1: any value above 1 is folded into class 1.
df['disease'] = df['disease'].map(lambda value: min(value, 1))

# In 'ca' and 'thal' a '?' entry is treated as missing; everything else is
# converted to float so both columns end up numeric.
for column in ('ca', 'thal'):
    isUnknown = df[column] == '?'
    df.loc[isUnknown, column] = None
    df[column] = df[column].map(lambda raw: None if raw is None else float(raw))

display(df)

# Correlation matrix over the rows that have no missing values.
df.dropna().corr()

### Testing predictions using KNN

In [None]:
def testDataset(dataframe, testCol, k=5, attributes=1, verbose=True):
    df = dataframe.copy()

    # Test a random set of attributes
    if type(attributes) == int:
        # If more attributes are specified than there are, just use all of them
        if attributes > len(df.columns)-1:
            attributes = len(df.columns)-1
        attributes = df[df.columns[df.columns != testCol]].sample(axis=1, n=attributes).columns

    # Test a specific set of attributes
    elif type(attributes) != list:
        print('attributes must be an integer or a list of attribute names')
        return


    # Clear out any records that don't have a valid value for one of the attributes in question
    for attribute in attributes:
        if attribute == testCol:
            print(f'Cannot predict {testCol} using {testCol}')
            return
        
        try:
            df = df.dropna(subset=attribute)
        except Exception as e:
            print(e)
            return
    
    print(f'Predicting based on the {k} nearest neighbors using {attributes}:') if verbose else None
    # display(df)

    # Standardize the data
    for attribute in attributes:
        df[attribute] = (df[attribute] - df[attribute].mean()) / df[attribute].std()

    f1Scores = []
    for j in range(10):
        # Use knn. First create a nearest neighbors object.
        nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
        
        X = df[attributes].values
        y = df[[testCol]].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)#, random_state=42)

        fit = nn.fit(X_train)

        distances, indices = fit.kneighbors(X_test)
        
        y_pred = []
        for i in range(len(indices)):
            nbrs = y_train[indices[i]]
            

            # Predict the class with the highest frequency among neighbors
            greatestValue = df[testCol].unique()[0]

            greatestValueCount = 0
            for value in df[testCol].unique():
                valueCount = [item for sublist in nbrs for item in sublist].count(value)

                if valueCount >= greatestValueCount:
                    greatestValueCount = valueCount
                    greatestValue = value

            # healthy = [item for sublist in nbrs for item in sublist].count(0)
            # sick = [item for sublist in nbrs for item in sublist].count(1)
            predict = greatestValue
            # print(healthy, sick, predict)

            y_pred.append(predict)
        
        (p,r,f,s) = precision_recall_fscore_support(y_test, y_pred, zero_division = 0)
        f1Scores.append(f)
        print(f'Test {j}: precision={p}, recall={r}, f-score={f}, support={s}') if verbose else None

    meanF1s = []
    for i in range(len(f1Scores[0])):
        f1Total = 0
        for score in f1Scores:
            f1Total += score[i]
        meanF1s.append(f1Total / len(f1Scores))

    print(f'Mean F1 scores: f-score={meanF1s}') if verbose else None

    f1sTotal = 0
    for f1 in meanF1s:
        f1sTotal += f1
    meanOfF1s = f1sTotal / len(meanF1s)
    meanF1s.append(meanOfF1s)

    print(f'Mean of mean F1 scores: f-score={meanF1s[-1]}') if verbose else None

    return meanF1s, attributes

#### Running random tests to find some promising combinations

In [None]:
# Hand-picked combinations tried earlier (kept for reference):
#   ['oldpeak', 'cp']
#   ['thalach', 'ca', 'thal']
#   ['oldpeak', 'age', 'cp']
#   ['slope', 'thal', 'cp']
# Current run: let testDataset draw three attributes at random.
testDataset(dataframe=df, testCol='disease', k=5, attributes=3)

#### Testing different k values on a promising combination

In [None]:
# Score the ['oldpeak', 'cp'] combination for every k from 1 to 199.
kValues = list(range(1, 200))
scores = {
    'k': kValues,
    # Element [0] of testDataset's result is the list of F1 scores.
    'scores': [
        testDataset(dataframe=df, testCol='disease', k=k, attributes=['oldpeak', 'cp'], verbose=False)[0]
        for k in kValues
    ],
}

# One curve is drawn per entry of the score list returned for each k.
plt.plot(scores['k'], scores['scores'])
plt.xlabel('k')
plt.ylabel('f score')

#### Testing combinations of k values and numbers of attributes

In [None]:
num_attributes = 11

# Create a color map: one colour per attribute count.
colors = viridis(np.linspace(0, 1, num_attributes))

# For every attribute count: the best k, its score, and the attribute set
# that produced that score. The three lists are index-aligned.
highestKs = []
highestFScores = []
bestAttributes = []

# Plot each curve with a different color based on the number of attributes
for j in range(1, num_attributes + 1):
    scores = {'k': [], 'scores': []}
    highestFScore = 0
    highestK = 0
    bestAttribute = []
    for k in range(1, 50):
        # With an integer `attributes`, testDataset draws a fresh random set
        # of j attributes on every call and returns it alongside the scores.
        new_scores, randomAttributes = testDataset(dataframe=df, testCol='disease', k=k, attributes=j, verbose=False)
        # The last entry is the mean over the per-class F1 scores.
        meanFScore = new_scores[-1]

        scores['k'].append(k)
        scores['scores'].append(meanFScore)

        if highestFScore < meanFScore:
            highestFScore = meanFScore
            highestK = k
            bestAttribute = randomAttributes

    # Use a different color for each curve
    plt.plot(scores['k'], scores['scores'], label=f"{j} attributes", color=colors[j-1])
    highestKs.append(highestK)
    highestFScores.append(highestFScore)
    # Record the attribute set that achieved highestFScore, not the set that
    # merely happened to be drawn for the final k.
    bestAttributes.append(bestAttribute)

plt.xlabel('k')
plt.ylabel('f score')
plt.title('Performance vs. k for Different Number of Attributes')
plt.legend()
plt.show()

display(highestKs)
display(highestFScores)
display(bestAttributes)







In [None]:

# Locate the first attribute count whose best score is the overall maximum.
index_of_max, max_value = max(enumerate(highestFScores), key=lambda entry: entry[1])

# Re-run the evaluation with the k and attribute set recorded at that position.
winningK = highestKs[index_of_max]
winningAttributes = list(bestAttributes[index_of_max])
display(testDataset(df, 'disease', winningK, winningAttributes))

#### Arriving at a winning combination

Across repeated runs of the test above, the best score usually occurs with about 10 or 11 attributes, so we hand-select 11 attributes. The best k usually falls between 20 and 35, so we use a value in that range (k = 25).

In [None]:
# Hand-selected feature columns, evaluated with k = 25 neighbours.
attributesToTest = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'ca', 'oldpeak', 'exang', 'thal',
]
display(testDataset(dataframe=df, testCol='disease', k=25, attributes=attributesToTest))

### Challenge

In [None]:
def challenge(testDataFrame, trainDataFrame, testCol, k=5, attributes=1, verbose=True):
    """Predict ``testCol`` for ``testDataFrame`` from its nearest rows in ``trainDataFrame``.

    Both frames are copied, rows with a missing value in any selected
    attribute are dropped, and the features of *both* frames are standardised
    with the training frame's mean and standard deviation so that distances
    are measured in one common feature space. Each test row is then assigned
    the most frequent label among its ``k`` nearest training rows.

    Parameters
    ----------
    testDataFrame : pandas.DataFrame
        Rows to predict; must also contain ``testCol`` so the predictions can
        be scored.
    trainDataFrame : pandas.DataFrame
        Labelled rows the neighbours are drawn from.
    testCol : str
        Name of the column to predict.
    k : int
        Number of neighbours that vote on each prediction.
    attributes : int or list
        Either a list of feature column names, or an integer ``n`` meaning
        "pick ``n`` feature columns of the test frame at random" (capped at
        the number of available feature columns).
    verbose : bool
        Print the scores when true.

    Returns
    -------
    tuple or None
        ``(scores, attributes)`` where ``scores`` is the list of per-class F1
        scores followed by their unweighted mean, and ``attributes`` is the
        feature selection that was used (the list that was passed in, or the
        randomly drawn pandas ``Index``).
        ``None`` is returned, after printing a message, when ``attributes``
        is neither an int nor a list, contains ``testCol``, or names a column
        that does not exist.
    """
    testDF = testDataFrame.copy()
    trainDF = trainDataFrame.copy()

    # Test a random set of attributes.
    # bool is a subclass of int; exclude it so True/False are still rejected below.
    if isinstance(attributes, int) and not isinstance(attributes, bool):
        candidates = testDF.columns[testDF.columns != testCol]
        # If more attributes are specified than there are, just use all of them.
        attributes = testDF[candidates].sample(axis=1, n=min(attributes, len(candidates))).columns

    # Test a specific set of attributes.
    elif not isinstance(attributes, list):
        print('attributes must be an integer or a list of attribute names')
        return

    columns = list(attributes)
    if testCol in columns:
        print(f'Cannot predict {testCol} using {testCol}')
        return

    # Clear out any records that don't have a valid value for one of the
    # attributes in question.
    try:
        testDF = testDF.dropna(subset=columns)
        trainDF = trainDF.dropna(subset=columns)
    except KeyError as e:
        print(e)
        return

    if verbose:
        print(f'Predicting based on the {k} nearest neighbors using {attributes}:')

    X_train = trainDF[columns].to_numpy(dtype=float)
    X_test = testDF[columns].to_numpy(dtype=float)
    y_train = trainDF[testCol].to_numpy()
    y_test = testDF[testCol].to_numpy()

    # Standardise BOTH sets with the training statistics. Scaling the test
    # set by its own mean/std would place the two sets in different feature
    # spaces and distort every distance. ddof=1 matches the sample standard
    # deviation pandas uses.
    center = X_train.mean(axis=0)
    scale = X_train.std(axis=0, ddof=1)
    # A constant (or single-row) column has no usable spread; leave it
    # unscaled instead of dividing by zero / NaN.
    scale[~(scale > 0)] = 1.0
    X_train = (X_train - center) / scale
    X_test = (X_test - center) / scale

    # Use knn. First create a nearest neighbors object.
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    nn.fit(X_train)
    _, indices = nn.kneighbors(X_test)

    # Classes in order of first appearance in the training labels.
    voteOrder = pd.unique(y_train)
    # Labels of each test row's neighbours: one row per test record.
    nbrs = y_train[indices]
    # counts[i, c] = how many neighbours of test row i carry class voteOrder[c].
    counts = np.stack([(nbrs == value).sum(axis=1) for value in voteOrder], axis=1)
    # Pick the LAST class with the maximal count (argmax on the reversed
    # columns), i.e. ties go to the class that appears latest in voteOrder.
    winners = counts.shape[1] - 1 - np.argmax(counts[:, ::-1], axis=1)
    y_pred = voteOrder[winners]

    (p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, zero_division=0)
    if verbose:
        print(f'precision={p}, recall={r}, f-score={f}, support={s}')

    meanF1 = float(np.mean(f))
    if verbose:
        print(f'Mean of F1 scores: f-score={meanF1}')

    # Per-class scores first, their mean as the final entry.
    f1Scores = f.tolist()
    f1Scores.append(meanF1)

    return f1Scores, attributes

#### Cleaning the challenge test dataset

In [None]:
# Load the challenge sample and give the target column ('num') a readable name.
challengeDataset = 'cleveland-test-sample.csv'
challengeDF = pd.read_csv(challengeDataset).rename(columns={'num': 'disease'})

# Cap the target at 1: any value above 1 is folded into class 1.
challengeDF['disease'] = challengeDF['disease'].map(lambda value: min(value, 1))

# In 'ca' and 'thal' a '?' entry is treated as missing; everything else is
# converted to float so both columns end up numeric.
for column in ('ca', 'thal'):
    isUnknown = challengeDF[column] == '?'
    challengeDF.loc[isUnknown, column] = None
    challengeDF[column] = challengeDF[column].map(lambda raw: None if raw is None else float(raw))

display(challengeDF)

#### Running a prediction on the challenge dataset using the optimal attributes and k value for a KNN model

In [None]:
# Feature columns and neighbour count handed to challenge().
attributesToTest = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'ca', 'oldpeak', 'exang', 'thal',
]
k = 25

# Train on the full Cleveland frame, score on the challenge sample.
display(
    challenge(
        testDataFrame=challengeDF,
        trainDataFrame=df,
        testCol='disease',
        k=k,
        attributes=attributesToTest,
        verbose=True,
    )
)

## Part 2