In [38]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import math
import random

def entropyFunct(column):
    """Return the Shannon entropy (base 2) of the values in *column*.

    Each distinct value reported by ``value_counts()`` contributes
    ``-p * log2(p)``, where ``p`` is its count divided by ``len(column)``.
    A column without any counted values yields 0.
    """
    size = len(column)
    # Relative frequency of every distinct value, in value_counts() order.
    shares = [occurrences / size for occurrences in column.value_counts()]
    return -sum(share * math.log2(share) for share in shares)

def informationGainFunct(data, feature, targetCol):
    """Return the information gain of splitting *data* on *feature*.

    The gain is the entropy of ``data[targetCol]`` minus the weighted sum of
    the target entropies of the row groups that share one value of
    *feature*; each group is weighted by its share of the rows.
    """
    baseline = entropyFunct(data[targetCol])

    rowCount = len(data)
    featureColumn = data[feature]

    remainder = 0
    for level in featureColumn.unique():
        group = data[featureColumn == level]
        remainder += (len(group) / rowCount) * entropyFunct(group[targetCol])

    return baseline - remainder

def questions(currentData, targetCol):
    """Pick the feature the next question should be about.

    Only features from the fixed candidate list that exist in *currentData*
    and still have more than one distinct value are eligible. Returns the
    chosen feature name, or None when no feature is eligible.
    """
    candidateNames = ['nationality', 'club', 'position', 'age', 'jersey number', 'height', 'weight', 'foot', 'start', 'end']

    eligible = []
    for name in candidateNames:
        if name not in currentData.columns:
            continue
        if len(currentData[name].unique()) > 1:
            eligible.append(name)

    if not eligible:
        return None

    # Information gain of every eligible feature, paired with its name.
    scored = [(name, informationGainFunct(currentData, name, targetCol)) for name in eligible]

    # NOTE(review): the gains are computed, but the pick below is uniformly
    # random and never looks at them - confirm whether that is intended.
    random.shuffle(scored)
    return random.choice(scored)[0]

def encodeCatCols(data, columns):
    """Return a copy of *data* with the object-dtype *columns* label-encoded.

    Columns whose dtype is not ``object`` are left as they are. The input
    frame itself is not modified.
    """
    result = data.copy()
    encoder = LabelEncoder()

    for name in columns:
        series = result[name]
        if series.dtype != 'object':
            continue
        # fit_transform re-fits the encoder, so it can be reused per column.
        result[name] = encoder.fit_transform(series)

    return result

def guessingGameModel(data, targetCol, chosenCols):
    """Train and return a decision tree that predicts *targetCol*.

    The *chosenCols* of *data* are label-encoded with ``encodeCatCols`` and
    used as features; the tree is built with ``random_state=42``.
    """
    prepared = encodeCatCols(data, chosenCols)

    # Feature matrix and target vector come from the same encoded frame.
    inputs = prepared[chosenCols]
    labels = prepared[targetCol]

    classifier = DecisionTreeClassifier(random_state=42)
    classifier.fit(inputs, labels)
    return classifier

def prediction(model, data, chosenCols):
    """Return the model's prediction for the first row of *data*.

    Args:
        model: Object with a ``predict`` method that accepts a one-row frame.
        data: Frame holding the (already encoded) candidate rows, or None.
        chosenCols: Feature columns to hand to the model, in training order.

    Returns:
        The first element of ``model.predict(...)``, or None when *data* is
        None or has no rows.
    """
    if data is None or len(data) == 0:
        # Nothing to predict from; previously an empty frame raised IndexError.
        return None

    # iloc[[0]] keeps a one-row DataFrame, so the column names (and per-column
    # dtypes) reach the model instead of being lost in a bare ndarray.
    firstRow = data[chosenCols].iloc[[0]]
    return model.predict(firstRow)[0]

def filterDataset(data, question, answer):
    """Narrow *data* according to the answer given for column *question*.

    The question is always about the value in the first row of that column:
    'y' / 'py' keep the rows equal to it, 'n' / 'pn' keep the rows that
    differ. Any other answer, or a None *data*, returns None.
    """
    if data is None:
        return None

    if answer not in ('y', 'py', 'n', 'pn'):
        # Covers "don't know" style answers: nothing to filter on.
        return None

    column = data[question]
    reference = column.iloc[0]

    if answer in ('y', 'py'):
        keep = column == reference
    else:
        keep = column != reference
    return data[keep]

def mainGame(data, model, chosenCols, targetCol='short_name', maxQuestions=20):
    """Play the interactive guessing game on the console.

    Asks up to *maxQuestions* yes/no questions, narrowing the candidate rows
    after every answer. The game ends early when exactly one candidate is
    left or when no further question can be asked. If the question budget
    runs out, the model makes a final guess from the first remaining
    candidate.

    Args:
        data: Frame with one row per player.
        model: Classifier used for the final guess; expected to have been
            trained on *data* encoded with ``encodeCatCols`` over *chosenCols*.
        chosenCols: Feature columns the model was trained on.
        targetCol: Column holding the name that is printed as the guess.
        maxQuestions: Maximum number of questions to ask.
    """
    validAnswers = ('y', 'n', 'py', 'pn', 'idk')
    answerPrompt = "Respond with either yes(y), no(n), probably yes(py), probably no(pn), I don't know (idk):"

    currentData = data.copy()

    for noOfQuestions in range(1, maxQuestions + 1):
        footballQuest = questions(currentData, targetCol)

        if footballQuest is None or len(currentData) == 1:
            print("The player could not be guessed.")
            return

        # Every question is about the value held by the first remaining row.
        columnValue = currentData[footballQuest].iloc[0]
        print(f"Question {noOfQuestions}: Is the player's {footballQuest} {columnValue}?")

        # strip() so that stray whitespace around the answer is tolerated.
        userAnswer = input(answerPrompt).strip().lower()
        while userAnswer not in validAnswers:
            print("Please try again with a valid response.")
            userAnswer = input(answerPrompt).strip().lower()

        if userAnswer == 'idk':
            # Nothing learned; the question still counts towards the budget.
            continue

        currentData = filterDataset(currentData, footballQuest, userAnswer)

        if len(currentData) == 1:
            print(f"The player guessed is {currentData[targetCol].iloc[0]}!")
            return

    # Question budget used up without a unique candidate. Handling this after
    # the loop means an "idk" on the last question still leads to a guess.
    if len(currentData) == 0:
        print("The player could not be guessed.")
        return

    # Encode the full data set so the label codes match the ones produced when
    # the model was trained; encoding only the filtered rows would renumber
    # the categories. Boolean-mask filtering keeps the index labels, so the
    # remaining candidates can be looked up by label.
    encodedData = encodeCatCols(data, chosenCols)
    encodedDataForPrediction = encodedData.loc[currentData.index]
    prediction_result = prediction(model, encodedDataForPrediction, chosenCols)
    print(f"The final guess is {prediction_result}.")


# Load the players data set; 'players.csv' is resolved relative to the working directory.
df = pd.read_csv('players.csv')

# Feature columns handed to the classifier and to the game's final prediction.
chosenDataCols = ['nationality', 'club', 'position', 'age', 'jersey number', 'height', 'weight', 'foot', 'start', 'end']  # questions() hard-codes its own copy of this list - keep the two in sync

# Train the decision tree that predicts 'short_name' from the chosen columns.
guessingModel = guessingGameModel(df, 'short_name', chosenDataCols)

# Start the interactive game (reads answers from standard input).
mainGame(df, guessingModel, chosenDataCols)

Question 1: Is the player's age 35?


Respond with either yes(y), no(n), probably yes(py), probably no(pn), I don't know (idk): y


Question 2: Is the player's club Paris Saint-Germain?


Respond with either yes(y), no(n), probably yes(py), probably no(pn), I don't know (idk): y


Question 3: Is the player's foot Left?


Respond with either yes(y), no(n), probably yes(py), probably no(pn), I don't know (idk): y


The player guessed is L. Messi!
