In [49]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd


height_data = pd.read_csv('athlete_events.csv')
height_data_new = height_data[["Sex", "Age","Height","Weight", "Sport"]].dropna()


# Encode categorical data (Sex, and possibly Sport)
label_encoder_sex = LabelEncoder()
height_data_new["Sex"] = label_encoder_sex.fit_transform(height_data_new["Sex"])


# Encode the target variable 'Sport' (if necessary)
label_encoder_sport = LabelEncoder()
height_data_new["Sport"] = label_encoder_sport.fit_transform(height_data_new["Sport"])

X_data = height_data_new[["Sex", "Age","Height","Weight"]]
Y_data = height_data_new["Sport"]


X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model Accuracy: {accuracy:.2f}")



'''# Train the model
clf = clf.fit(X_data, Y_data)

label_encoder_sport.inverse_transform(clf.predict([[0,21,180,80]]))
'''



Model Accuracy: 0.33


'# Train the model\nclf = clf.fit(X_data, Y_data)\n\nlabel_encoder_sport.inverse_transform(clf.predict([[0,21,180,80]]))\n'

In [80]:
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np


height_data = pd.read_csv('athlete_events.csv')
height_data_new = height_data[["Sex", "Age","Height","Weight", "Sport"]].dropna()


# Encode categorical
label_encoder_sex = LabelEncoder()
height_data_new["Sex"] = label_encoder_sex.fit_transform(height_data_new["Sex"])


# Encode the target variable 'Sport'
label_encoder_sport = LabelEncoder()
height_data_new["Sport"] = label_encoder_sport.fit_transform(height_data_new["Sport"])

X_data = height_data_new[["Sex", "Age","Height","Weight"]]
Y_data = height_data_new["Sport"]

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)

#Set CLF 
clf = tree.DecisionTreeClassifier()

#One out Model Accuracy
clf_one = clf.fit(X_train, Y_train)
Y_pred = clf_one.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"One Out Model Accuracy: {accuracy:.3f}%")

#Bootstrap Method
bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=42)
bagging_clf.fit(X_train, Y_train)
score = bagging_clf.score(X_test, Y_test)
print(f"Bootstrap Test accuracy: {score:.3f}%")

##K_fold Cross Validation
kf = KFold(n_splits=7, shuffle=True)  # 7 folds
scores = cross_val_score(clf, X_data, Y_data, cv=kf)
print(f"Cross-validation scores: {scores}")
print(f"CV Mean accuracy: {scores.mean():.3f}")

#Input for Sex 
while True:
    try:
        sex = int(input("Enter sex (0 for male and 1 for female): "))
        if 0 == sex or 1 == sex:
            break
        else:
            print("Please enter a number within the range.")
    except ValueError:
        print("That's not a valid number!")

#input for Age
while True:
    try:
        age = int(input("Enter age (10-50): "))
        if 0 <= age <= 50:
            break
        else:
            print("Please enter a number within the range.")
    except ValueError:
        print("That's not a valid number!")

#Input for Height 
while True:
    try:
        height = int(input("Enter Height (100 - 250cm): "))
        if 100 <= height <= 250:
            break
        else:
            print("Please enter a number within the range.")
    except ValueError:
        print("That's not a valid number!")
        
    
#Input for Weigth
while True:
    try:
        weight = int(input("Enter Weight (30-150 (kg)): "))
        if 30 <= weight <= 150:
            break
        else:
            print("Please enter a number within the range.")
    except ValueError:
        print("That's not a valid number!")
        
# Create an input array with the user's values
user_input = np.array([[sex, age, height, weight]])

# Make predictions with each model
dt_prediction = clf_one.predict(user_input)
#bagging_prediction = bagging_clf.predict(user_input)

# Convert the encoded prediction back to the actual sport
sport_prediction_dt = label_encoder_sport.inverse_transform(dt_prediction)
#sport_prediction_bagging = label_encoder_sport.inverse_transform(bagging_prediction)

# Output the predictions
print(f"\nPrediction using Decision Tree: {sport_prediction_dt[0]}")
#print(f"Prediction using Bagging Classifier: {sport_prediction_bagging[0]}")



One Out Model Accuracy: 0.330%
Bootstrap Test accuracy: 0.329%
Cross-validation scores: [0.33368417 0.33468016 0.33277876 0.33145457 0.32989271 0.33956947
 0.33675132]
CV Mean accuracy: 0.334
Enter sex (0 for male and 1 for female): 1
Enter age (10-50): 19
Enter Height (100 - 250cm): 167
Enter Weight (30-150 (kg)): 68

Prediction using Decision Tree: Diving


In [79]:

bagging_prediction = bagging_clf.predict(user_input)
sport_prediction_bagging = label_encoder_sport.inverse_transform(bagging_prediction)
print(f"Prediction using Bagging Classifier: {sport_prediction_bagging[0]}")

Prediction using Bagging Classifier: Table Tennis


In [78]:
from scipy.stats import mode
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")
user_input = np.array([[sex, age, height, weight]])

# Store predictions from each fold
predictions = []

# K-Fold cross-validation
for train_index, test_index in kf.split(X_data):
    X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
    Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]
    
    # Initialize and train the Decision Tree classifier
    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, Y_train)
    
    # Make a prediction for the user input
    pred = clf.predict(user_input)
    predictions.append(pred[0])

# Aggregate the predictions using the majority vote (mode)
final_prediction = mode(predictions).mode[0]

# Convert the prediction back to the actual sport
sport_prediction = label_encoder_sport.inverse_transform([final_prediction])

# Output the final prediction
print(f"\nPrediction using K-Fold Cross-Validation: {sport_prediction[0]}")


Prediction using K-Fold Cross-Validation: Table Tennis


In [73]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd


height_data = pd.read_csv('athlete_events.csv')
height_data_new = height_data[["Sex", "Age","Height","Weight", "Sport"]].dropna()


# Encode categorical data (Sex, and possibly Sport)
label_encoder_sex = LabelEncoder()
height_data_new["Sex"] = label_encoder_sex.fit_transform(height_data_new["Sex"])


# Encode the target variable 'Sport' (if necessary)
label_encoder_sport = LabelEncoder()
height_data_new["Sport"] = label_encoder_sport.fit_transform(height_data_new["Sport"])

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.3, random_state=42)

# Initialize the BaggingClassifier with DecisionTree as the base model
bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=42)

# Fit the model
bagging_clf.fit(X_train, Y_train)

# Evaluate the model
score = bagging_clf.score(X_test, Y_test)
print(f"Test accuracy: {score}")

Test accuracy: 0.3217299919159256


In [83]:
height_data_new

Unnamed: 0,Sex,Age,Height,Weight,Sport
0,1,24.0,180.0,80.0,6
1,1,23.0,170.0,60.0,26
4,0,21.0,185.0,82.0,43
5,0,21.0,185.0,82.0,43
6,0,25.0,185.0,82.0,43
...,...,...,...,...,...
271111,1,29.0,179.0,89.0,28
271112,1,27.0,176.0,59.0,40
271113,1,27.0,176.0,59.0,40
271114,1,30.0,185.0,96.0,9
