# Data Mining Practice Project

#### This project was written by:
- Kyrsti Fitts
- Shivani Merchant
- Kevin Reynolds
- Ryan Espejo

#### Run Setup
- Use the most recent versions of Pandas and Scikit-learn

<br><br><br>
##### Step 1: Load the data
- For the first step of this project, we will load the data from the data files.
- The data will be loaded into a pandas data frame.
- The data frame will be the data structure that holds our data.
- Assign the attribute names as the column headers of each data frame.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Column labels for the CSV files, in file order. Passing them through `names=`
# supplies the header (replace the paths below with the actual paths to your data files).
attribute_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'class',
]
training_data = pd.read_csv('adult/training_data.csv', names=attribute_names)
test_data = pd.read_csv('adult/test_data.csv', names=attribute_names)

# Remember the original dimensions; the row counts are reused later to report
# how many rows were removed during cleaning.
training_rows_length = len(training_data.index)
training_columns_length = len(training_data.columns)
test_rows_length = len(test_data.index)
test_columns_length = len(test_data.columns)

# Summarise what was read and preview the first training rows
print(f"Training Data Set: Read in {training_rows_length} rows and {training_columns_length} columns")
print(f"Test Data Set: Read in {test_rows_length} rows and {test_columns_length} columns")
print(training_data.head())

<br><br>
#### Step 1 Continued: Remove rows containing unknown data
- Replace '?' characters with pandas 'NA' objects.
- Use the "dropna()" method to remove the rows.

In [None]:
# Unknown values appear as ' ?' (with a leading space). Mark them as missing
# and discard every row that contains at least one of them.
training_data = training_data.replace(' ?', pd.NA).dropna()
test_data = test_data.replace(' ?', pd.NA).dropna()

# The income labels in the test data end with a period; strip it so both
# data sets use the same two label values.
test_data = test_data.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})

# Report how many rows the cleaning removed from each data set
print(f"Training Data Set: Removed {training_rows_length - training_data.shape[0]} rows containing unknown values")
print(f"Test Data Set: Removed {test_rows_length - test_data.shape[0]} rows containing unknown values")

#### Step 1 Continued: Remove All Continuous Attributes
- Continuous Attributes: age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week
- Drop these attributes from both pandas data frames

In [None]:
# The continuous attributes are listed once so that both data frames are
# guaranteed to lose exactly the same columns.
continuous_attributes = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

print("Training dataset columns before continuous attribute deletion")
print(training_data.columns)
print("Test dataset columns before continuous attribute deletion")
print(test_data.columns)

# A single drop per frame replaces the twelve individual in-place drops.
training_data = training_data.drop(columns=continuous_attributes)
test_data = test_data.drop(columns=continuous_attributes)

print("Training dataset columns after continuous atribute deletion")
print(training_data.columns)
print("Test dataset columns after continuous atribute deletion")
print(test_data.columns)


#### Step 1 Continued: Use one-hot encoding to transform data on each multi-domain categorical attribute
- Using pandas (`get_dummies`) to one-hot encode all the categorical data into numerical data so that it can be used in the algorithms for the next steps
- This is done on both the training data and the test data

In [None]:
# Replace every categorical column with indicator columns, one per category
# value. The two data sets are encoded independently of each other.
training_data, test_data = (
    pd.get_dummies(frame) for frame in (training_data, test_data)
)

# Preview the encoded training data
print("Training Data:", training_data.head(), sep="\n")

<br><br>
##### Step 2: Build a decision tree classifier
- Separate the targets (income) into two separate dataframes
- X_train = all columns except income
- Y_train = only the income columns
- Create and fit the decision tree classifier
- Print a classification report
- Print a confusion matrix

In [None]:
# The two one-hot income columns produced by the encoding step
income_columns = ["class_ <=50K", "class_ >50K"]

# Fixed label order for the confusion matrix: index 0 is ">50K" and index 1 is
# "<=50K", which is the layout the printed explanations below rely on. The
# order used to come from y_test.unique(), i.e. from whichever class happened
# to appear first in the test data.
cm_labels = ["class_ >50K", "class_ <=50K"]

# Separate the target variables in training_data
X_train = training_data.drop(columns=income_columns)
y_train = training_data[income_columns]

# Separate the target variables in test_data; idxmax turns the two indicator
# columns into a single column holding the name of the active class
X_test = test_data.drop(columns=income_columns)
y_test = test_data[income_columns].idxmax(axis=1)

# Give the test features exactly the training columns, in the same order.
# Any indicator column that exists only in the training data (a category
# value that never occurs in the test data) is added and filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Create a decision tree classifier and fit it to the training data.
# The seed makes the tree, and therefore the report, reproducible.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Predict the targets of the test data (one indicator value per income column)
y_test_prediction = clf.predict(X_test)

# Get the index of the maximum value in each row to get the predicted class labels
y_test_prediction = pd.DataFrame(y_test_prediction, columns=income_columns).idxmax(axis=1)

# Print classification report
print(classification_report(y_test, y_test_prediction))

# Create confusion matrix: rows are actual classes, columns are predicted
# classes, both in cm_labels order
cm = confusion_matrix(y_test, y_test_prediction, labels=cm_labels)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

print("\n" + str(cm[0][0])+ " -> Accurately predicted income >50K")
print(str(cm[1][1]) + " -> Accurately predicted income <=50K")
print(str(cm[1][0]) + " -> Incorrectly predicted income to be >50K when it was actually <=50K")
print(str(cm[0][1]) + " -> Incorrectly predicted income to be <=50K when is was actually >50k")

# More fancy way of displaying the matrix graphically, with the class names on the axes
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
disp.plot()
plt.show()

<br><br>
##### Step 3: Build a Naive Bayesian Classifier
- Separate the targets (income) into two separate dataframes
- X_train_nb = all columns except income
- Y_train_nb = only one income column
- Create and fit the naive bayesian classifier
- Print a classification report
- Print a confusion matrix

In [None]:
# The two one-hot income columns produced by the encoding step
income_columns = ["class_ <=50K", "class_ >50K"]

# Separate the target variables in training_data. The label is the "<=50K"
# indicator, so a true value means "<=50K" and a false value means ">50K".
X_train_nb = training_data.drop(columns=income_columns)
y_train_nb = training_data["class_ <=50K"]

# Separate the target variables in test_data
X_test_nb = test_data.drop(columns=income_columns)
y_test_nb = test_data["class_ <=50K"]

# Give the test features exactly the training columns, in the same order.
# Any indicator column that exists only in the training data (a category
# value that never occurs in the test data) is added and filled with 0.
X_test_nb = X_test_nb.reindex(columns=X_train_nb.columns, fill_value=0)

# Create a Naive Bayesian Classifier and fit it to the training data
clf = MultinomialNB()
clf.fit(X_train_nb, y_train_nb)

# Predict the targets of the test data
y_test_prediction_nb = clf.predict(X_test_nb)

# Print classification report
print(classification_report(y_test_nb, y_test_prediction_nb))

# Fixed label order for the confusion matrix: index 0 is False (">50K") and
# index 1 is True ("<=50K"), which is the layout the printed explanations
# below rely on. The order used to come from y_test_nb.unique(), i.e. from
# whichever class happened to appear first in the test data.
# Assumes the indicator column is boolean (or 0/1) as produced by get_dummies.
cm_labels_nb = [False, True]
cm_nb = confusion_matrix(y_test_nb, y_test_prediction_nb, labels=cm_labels_nb)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm_nb)

print("\n" + str(cm_nb[0][0])+ " -> Accurately predicted income >50K")
print(str(cm_nb[1][1]) + " -> Accurately predicted income <=50K")
print(str(cm_nb[1][0]) + " -> Incorrectly predicted income to be >50K when it was actually <=50K")
print(str(cm_nb[0][1]) + " -> Incorrectly predicted income to be <=50K when is was actually >50k")

# More fancy way of displaying the matrix graphically, with readable class names
# in the same order as cm_labels_nb
disp = ConfusionMatrixDisplay(confusion_matrix=cm_nb, display_labels=[">50K", "<=50K"])
disp.plot()
plt.show()


#### Step 4: Setting up for task 2 by loading data and removing records with unknown (?) values
- Replace '?' characters with pandas 'NA' objects.
- Use the "dropna()" method to remove the rows.

In [None]:
# Read fresh copies of both files for task 2, using the same column labels as before
training_data_2 = pd.read_csv('adult/training_data.csv', names=attribute_names)
test_data_2 = pd.read_csv('adult/test_data.csv', names=attribute_names)

# Record the dimensions as read, before any rows are removed
training_rows_length = len(training_data_2.index)
training_columns_length = len(training_data_2.columns)
test_rows_length = len(test_data_2.index)
test_columns_length = len(test_data_2.columns)

print(f"Training Data Set: Read in {training_rows_length} rows and {training_columns_length} columns")
print(f"Test Data Set: Read in {test_rows_length} rows and {test_columns_length} columns")

# Unknown values appear as ' ?' (with a leading space). Mark them as missing
# and discard every row that contains at least one of them.
training_data_2 = training_data_2.replace(' ?', pd.NA).dropna()
test_data_2 = test_data_2.replace(' ?', pd.NA).dropna()

# The income labels in the test data end with a period; strip it so both
# data sets use the same two label values.
test_data_2 = test_data_2.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})

# Show the cleaned test labels
print(test_data_2['class'])

# Report how many rows the cleaning removed from each data set
print(f"Training Data Set: Removed {training_rows_length - training_data_2.shape[0]} rows containing unknown values")
print(f"Test Data Set: Removed {test_rows_length - test_data_2.shape[0]} rows containing unknown values")

#### Step 4 Continued: Keep setting up for task 2 by using one-hot encoding and the mean value to transform the fresh data
- Using pandas (`get_dummies`) to one-hot encode all the categorical data into numerical data so that it can be used in the algorithms for the next steps
- Use scikit-learn's `Binarizer` with each continuous attribute's mean value as the threshold to convert the numerical values into binary (1 if above the mean, otherwise 0)

In [None]:
# One hot encoding on the training and test data
training_data_2 = pd.get_dummies(training_data_2)
test_data_2 = pd.get_dummies(test_data_2)

# Give the test data exactly the training columns, in the same order.
# Any indicator column that exists only in the training data (a category
# value that never occurs in the test data) is added and filled with 0.
test_data_2 = test_data_2.reindex(columns=training_data_2.columns, fill_value=0)

# Pick the columns to binarize
# NOTE(review): 'fnlwgt' is also continuous but is not in this list, so it
# keeps its raw values in the later steps - confirm that this is intended.
columns_to_binarize = ['age','education-num','capital-gain','capital-loss','hours-per-week']

# Thresholds come from the training data only. Using the same thresholds for
# the test data ensures that a given raw value is encoded identically in both
# sets; separate test-set means would shift the cut-off between the two.
training_means = training_data_2[columns_to_binarize].mean()

for column in columns_to_binarize:
    # Values strictly above the training mean become 1, all others become 0
    binarizer = Binarizer(threshold=training_means[column])

    # transform() returns a single-column 2-D array; flatten it before assigning
    training_data_2[column] = binarizer.transform(training_data_2[[column]].values).ravel()
    test_data_2[column] = binarizer.transform(test_data_2[[column]].values).ravel()

# Printing new data head with transformed data
print("Training Data:")
print(training_data_2.head())
print("Testing Data:")
print(test_data_2.head())

#### Step 5: K-means Clustering
- Use K-means to cluster data
- Print centroids of the clusters

In [None]:
# Cluster the full task 2 training frame once per cluster count and print the
# resulting cluster centres. The fixed seed keeps the runs repeatable.
k_values = [3, 5, 10]
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(training_data_2)
    centroids = kmeans.cluster_centers_
    print(f"\nCentroids for k={k}:", centroids, sep="\n")

#### Step 6: kNN Algorithm
- Separate the targets (income) into two separate dataframes
- X_train = all columns except income
- Y_train = only one income column
- Create and fit the kNN algorithm
- Print the prediction accuracy on the last 10 records of the test data for each of the three k values (3, 5, 10)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# The two one-hot income columns produced by the encoding step
income_columns = ["class_ <=50K", "class_ >50K"]

# The train/test arrays do not depend on k, so they are built once here
# instead of being rebuilt on every loop iteration.
X_train = training_data_2.drop(columns=income_columns).to_numpy()
y_train = training_data_2["class_ <=50K"].to_numpy()

# Only the last 10 records of the test data are classified
last_test_rows = test_data_2.tail(10)
X_test = last_test_rows.drop(columns=income_columns).to_numpy()
y_test = last_test_rows["class_ <=50K"].to_numpy()

k_values = [3, 5, 10]
for k in k_values:
    # Fit the kNN model with k neighbours
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Predict on the test data
    y_pred = knn.predict(X_test)

    # Calculate and print accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Prediction Accuracy for k={k}: {accuracy}")



#### Step 7: SVM Classifier
- Separate the targets (income) into two separate dataframes
- X_train_svm = all columns except income
- Y_train_svm = only one income column
- Create and fit to the SVM Classifier
- Print the prediction accuracy

In [None]:
from sklearn.svm import SVC


# Features are every column except the two one-hot income columns;
# the label is the "<=50K" indicator column.
svm_income_columns = ["class_ <=50K", "class_ >50K"]
X_train_svm = training_data_2.drop(columns=svm_income_columns)
y_train_svm = training_data_2["class_ <=50K"]
X_test_svm = test_data_2.drop(columns=svm_income_columns)
y_test_svm = test_data_2["class_ <=50K"]

# Train an SVM classifier with its default settings, then classify the test data
svm_classifier = SVC()
y_pred_svm = svm_classifier.fit(X_train_svm, y_train_svm).predict(X_test_svm)

# Share of test records whose predicted label matches the actual label
accuracy = accuracy_score(y_test_svm, y_pred_svm)
print(f"Accuracy: {accuracy * 100:.2f}%")

#### Step 8: Neural Network Classifier
- Define a simple neural network model (scikit-learn's `MLPClassifier`)
- Train the model on the training data.
- Make predictions on the test data.
- Calculate the accuracy.

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Features are every column except the two one-hot income columns;
# the label is the "<=50K" indicator column.
X_train_nn = training_data_2.drop(columns=["class_ <=50K", "class_ >50K"])
y_train_nn = training_data_2["class_ <=50K"]

X_test_nn = test_data_2.drop(columns=["class_ <=50K", "class_ >50K"])
y_test_nn = test_data_2["class_ <=50K"]

# Create a neural network classifier (seeded so the run is repeatable)
nn_classifier = MLPClassifier(max_iter=1000, random_state=42)

# Fit the classifier to the training data
nn_classifier.fit(X_train_nn, y_train_nn)

# Predict on the test data. predict() already returns class labels (not
# probabilities), so no 0.5 threshold is applied; the predictions keep the
# same label type as y_test_nn.
y_pred_nn = nn_classifier.predict(X_test_nn)

# Calculate and print the accuracy
accuracy_nn = accuracy_score(y_test_nn, y_pred_nn)
print(f"Accuracy of Neural Network Classifier: {accuracy_nn * 100:.2f}%")