# Optimal Quiz Maker
__Author:__ Nathan Swedlund

__Dataset:__ https://www.kaggle.com/miroslavsabo/young-people-survey

## Summary

In this project, I use several classifiers, together with a survey dataset, to guess how a user would answer a survey question. This is done through a quiz. First, the user enters how many questions they want to answer; then they enter which question from the survey they want the program to predict their answer to. The program then finds the most informative questions to ask and builds a classifier based on the answers to those questions. Finally, the user is prompted to answer these questions, the answers are recorded, and a prediction is made.

---
## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import neighbors
from sklearn import tree
from sklearn import naive_bayes

from sklearn import preprocessing

---
## Reading in Data

In [2]:
# Load the raw survey responses
df = pd.read_csv("data/responses.csv")

# Columns that are not Likert-style questions; they are dropped and the
# remaining Likert-only frame is written out to its own CSV file
non_likert_features = [
    "Religion",
    "God",
    "Smoking",
    "Alcohol",
    "Age",
    "Height",
    "Weight",
    "Number of siblings",
    "Left - right handed",
    "Education",
    "Only child",
    "Village - town",
    "House - block of flats",
    "Punctuality",
    "Lying",
    "Internet usage",
    "Gender",
]
df_likert = df.drop(non_likert_features, axis=1)
df_likert.to_csv("data/Likert.csv")

# Load the table of long-form question strings
questions = pd.read_csv("data/columns.csv")

---
## Creating Functions

In [3]:
def factorizeDF(df):
    '''Returns a copy of the given dataframe with every column replaced by its
    integer codes, plus a dict mapping each column name to that column's
    unique values (as produced by pd.factorize). The input is not modified.'''
    encoded = df.copy()
    uniques_by_column = {}
    for column in encoded.columns:
        codes, uniques = pd.factorize(encoded[column])
        encoded[column] = codes
        uniques_by_column[column] = uniques
    return encoded, uniques_by_column

def testTrainSplit(df, cl, ratio):
    '''Returns a test/train split over a factorized copy of the given dataframe.

    Every ratio-th row by position (rows 0, ratio, 2*ratio, ...) goes to the
    testing data; all other rows go to the training data.

    df:    dataframe holding the feature columns and the class column
    cl:    name of the class (target) column
    ratio: spacing between rows sent to the testing data

    Returns (train_X, train_y, test_X, test_y, classes), where the X/y values
    are arrays of factorized codes and classes maps each column name to the
    unique values returned by factorizeDF.

    Raises ValueError (from list.remove) if cl is not a column of df.'''
    features = list(df.columns)
    features.remove(cl)

    temp, classes = factorizeDF(df)

    # Split on row position rather than on index labels, so the split also
    # works for frames whose index is not a default 0..n-1 range (for example
    # after filtering rows, or with a non-numeric index).
    is_test = np.arange(len(temp)) % ratio == 0
    train = temp[~is_test]
    test = temp[is_test]

    train_X = train[features].values
    train_y = train[cl].values

    test_X = test[features].values
    test_y = test[cl].values

    return (train_X, train_y, test_X, test_y, classes)

def importantFeatures(df, cl):
    '''Returns a list of (importance, feature) pairs for every column of the
        given dataframe other than the class column, sorted in ascending order.
        Importances come from a depth-5 decision tree fitted to predict the class.'''
    # Only the training part of the split is used to fit the tree
    train_X, train_y, _test_X, _test_y, _classes = testTrainSplit(df, cl, 10)

    model = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
    model.fit(train_X, train_y)

    names = list(df.columns)
    names.remove(cl)

    ranked = list(zip(model.feature_importances_, names))
    ranked.sort()
    return ranked
    
    
def getClassifier(df, cl, ratio=2):
    '''Returns the most accurate classifier out of KNeighbors, Decision Tree, and GaussianNB.

    Each candidate is fitted on the training part of the split produced by
    testTrainSplit(df, cl, ratio) and scored on the testing part; the fitted
    candidate with the highest score is returned.

    df:    dataframe holding the feature columns and the class column
    cl:    name of the class (target) column
    ratio: passed to testTrainSplit to control the size of the testing data

    Returns (best_clf, classes). best_clf accepts unscaled feature rows:
    the KNeighbors candidates are pipelines that carry their own fitted
    scaler. classes is the per-column unique-value mapping returned by
    testTrainSplit.'''
    # Imported locally so this function does not depend on the import cell;
    # sklearn is already a dependency of this file.
    from sklearn import pipeline

    train_X, train_y, test_X, test_y, classes = testTrainSplit(df, cl, ratio)

    best_clf = None
    best_score = 0

    # Checking Decision Tree over odd depths 1, 3, ..., 29
    max_max_depth = 30
    for n in range(1, max_max_depth, 2):
        DT_clf = tree.DecisionTreeClassifier(max_depth=n, random_state=42)
        DT_clf.fit(train_X, train_y)
        new_score = DT_clf.score(test_X, test_y)

        # The first tree is always accepted so that a classifier is returned
        # even if every candidate scores 0
        if best_clf is None or new_score > best_score:
            best_score = new_score
            best_clf = DT_clf

    # Checking Gaussian Naive Bayes
    GA_clf = naive_bayes.GaussianNB()
    GA_clf.fit(train_X, train_y)
    new_score = GA_clf.score(test_X, test_y)
    if new_score > best_score:
        best_score = new_score
        best_clf = GA_clf

    # Checking KNeighbors over odd neighbor counts 1, 3, ..., 9.
    # The scaler lives in the same pipeline as the classifier, so it is fitted
    # on the training data only, applied unchanged to the testing data, and
    # applied again to whatever the caller later passes to predict().
    max_neighbors = 10
    for n in range(1, max_neighbors, 2):
        KN_clf = pipeline.make_pipeline(
            preprocessing.StandardScaler(),
            neighbors.KNeighborsClassifier(n_neighbors=n),
        )
        KN_clf.fit(train_X, train_y)

        new_score = KN_clf.score(test_X, test_y)

        if new_score > best_score:
            best_score = new_score
            best_clf = KN_clf

    return best_clf, classes
    
def getQuestion(short):
    '''Returns the long version of a given short question, looked up in the
    "short" and "original" columns of the questions table'''
    matches = questions.loc[questions["short"] == short, "original"]
    return matches.values[0]

---
## Main Program

In [6]:
# Getting variables for questions
print("How many questions do you want to answer? ")
num = int(input())
if num < 1:
    # A slice of [-0:] would select every feature instead of none
    raise ValueError("The number of questions must be at least 1")
print("What would you like this program to predict? ")
feature = input()

# importantFeatures sorts in ascending order of importance, so the last
# num entries are the most important features
imp = importantFeatures(df_likert, feature)[-num:]
features = [name for _, name in imp]

# Getting DF containing relevant information
df_imp = df_likert[features + [feature]]

# Getting classifier
print("Creating Classifier...")
clf, classes = getClassifier(df_imp, feature)

# Getting question answers
print("\nAnswer these questions 1 is disagree 5 is agree")

# Printing questions and gathering answers.
# The classifier was trained on factorized codes, which are numbered in order
# of first appearance in the data, so each answer is translated to the code of
# the matching value before it is used for prediction.
answers = []
for name in features:
    print(getQuestion(name))
    value = float(input())
    known_values = np.asarray(classes[name], dtype=np.float64)
    matches = np.flatnonzero(known_values == value)
    # -1 is the code pd.factorize gives to missing values; reuse it for an
    # answer that never occurs in the data
    answers.append(matches[0] if matches.size else -1)

# Getting variables for output
answers = np.array(answers, dtype=np.float64).reshape(1, -1)
code = clf.predict(answers)[0]
# A negative code is the "missing answer" class and has no survey value
pred = classes[feature][code] if code >= 0 else None
question = getQuestion(feature)

# Printing answer
if pred is None:
    print(f"Could not predict an answer for '{question}'")
elif pred > 3:
    print(f"You agree with '{question}'")
elif pred < 3:
    print(f"You disagree with '{question}'")
else:
    print(f"You are neutral on '{question}'")

How many questions do you want to answer? 
7
What would you like this program to predict? 
Music
Creating Classifier...

Answer these questions 1 is disagree 5 is agree
I live a very healthy lifestyle.
3
I don't like seeing animals suffering.
5
I like Physics
5
I will find a fault in myself if people don't like me.
4
I like Action movies
4
I always make sure I connect with the right people.
2
I like Socializing
4
You agree with 'I enjoy listening to music.'
