#### Importing

In [4]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable

# uncomment once you paste your myclassifiers.py into mysklearn package
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MyDummyClassifier, MyNaiveBayesClassifier, MyDecisionTreeClassifier
import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Create one instance of each classifier being compared (k-NN, dummy,
# naive Bayes, decision tree). All four are passed to
# myutils.perform_analysis in the next cell.
knn_classifier = MyKNeighborsClassifier()
dummy_classifier = MyDummyClassifier()
naive_class = MyNaiveBayesClassifier()
tree_classifier = MyDecisionTreeClassifier()

In [5]:
from random import sample, seed

# Seed the stdlib RNG so the downsampling below selects the same rows on every run.
seed(42)

# Load the table from a CSV file.
# NOTE(review): the recorded output of this cell is
#   UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d ...
# which is what Python raises when text is decoded with a Windows code page.
# Most likely the CSV is opened inside MyPyTable without an explicit encoding;
# confirm, and pass encoding="utf-8" to open() there. It cannot be fixed from
# this cell without knowing the MyPyTable API.
table = MyPyTable().csv_to_mypytable("input_file/fifa_players.csv")

# Filter out rows where the overall rating is less than 70.
rating_column = table.get_column("overall_rating")
indexes_to_drop = [index for index, rating in enumerate(rating_column) if int(rating) < 70]
# Report how many rows are dropped rather than dumping every index into the output.
print(f"Rows to drop (rating < 70): {len(indexes_to_drop)}")
print("Size before filtering: ", len(table.data))
table.drop_rows(indexes_to_drop)
print("Size after filtering: ", len(table.data))

# Remove columns that are not used as features.
columns_to_remove = [
    "name", "full_name", "birth_date", "age", "weight_kgs", "nationality",
    "overall_rating", "potential", "value_euro", "wage_euro", "preferred_foot",
    "international_reputation(1-5)", "weak_foot(1-5)", "body_type", "release_clause_euro",
    "national_team", "national_rating", "national_team_position", "national_jersey_number",
    "heading_accuracy", "volleys", "curve", "sprint_speed", "reactions", "balance",
    "jumping", "strength", "aggression", "penalties", "composure", "sliding_tackle"
]
for column in columns_to_remove:
    if column in table.column_names:  # Skip names that are not present in the table
        table.remove_column(column)

# Discretize positions into the class label and store it as a new column.
positions = table.get_column("positions")
discretized_positions = [myutils.classify_position(position) for position in positions]
table.add_column("discretized_position", discretized_positions)
print("Discretized positions added to the table.")

# Separate data by position. row[-1] is the label here because
# discretized_position was the column added most recently
# (assumes add_column appends at the end of each row -- TODO confirm).
goalkeepers = [row for row in table.data if row[-1] == "Goalkeeper"]
defenders = [row for row in table.data if row[-1] == "Defender"]
forwards = [row for row in table.data if row[-1] == "Forward"]
midfielders = [row for row in table.data if row[-1] == "Midfielder"]

# Downsample defenders, forwards, and midfielders to the size of goalkeepers.
gk_size = len(goalkeepers)
if gk_size > 0:  # Ensure goalkeepers are not empty
    defenders_downsampled = sample(defenders, min(gk_size, len(defenders)))
    forwards_downsampled = sample(forwards, min(gk_size, len(forwards)))
    midfielders_downsampled = sample(midfielders, min(gk_size, len(midfielders)))
else:
    print("No goalkeepers found, cannot balance data.")
    defenders_downsampled, forwards_downsampled, midfielders_downsampled = [], [], []

# Combine the downsampled data.
balanced_data = goalkeepers + defenders_downsampled + forwards_downsampled + midfielders_downsampled
print("Balanced data size: ", len(balanced_data))

# Extract the target now, while discretized_position is still the last column.
# table.data is set to this same list below and custom_metric is added after
# that; if add_column appends to the row lists in place, row[-1] read later
# would be the custom_metric value instead of the label.
target = [row[-1] for row in balanced_data]

# Update the table with the balanced data.
table.data = balanced_data

# Print final table size and data.
print("Final table size: ", len(table.data))
table.pretty_print()

# Print dummy classifier baseline (25%): four classes balanced to equal size.
dummy_baseline = 0.25
print(f"Dummy classifier baseline (25%): {dummy_baseline}")

# Add a derived column before extracting column indices.
# Example transformation: doubles whatever column is first after the removals
# above. The values are built from table.data itself, so their length always
# matches and no size check is needed.
new_column_name = "custom_metric"
new_column_values = [row[0] * 2 for row in table.data]
table.add_column(new_column_name, new_column_values)

# Feature names, also stored on the naive Bayes classifier as its header.
# NOTE(review): "positions" is the exact input from which discretized_position
# (the target) was computed above, so keeping it as a feature leaks the label
# into the training data. Consider removing it from this list.
naive_class.header = [
    "height_cm",
    "positions",
    "skill_moves(1-5)",
    "crossing",
    "finishing",
    "short_passing",
    "dribbling",
    "freekick_accuracy",
    "long_passing",
    "ball_control",
    "acceleration",
    "agility",
    "shot_power",
    "stamina",
    "long_shots",
    "interceptions",
    "positioning",
    "vision",
    "marking",
    "standing_tackle",
    "custom_metric"  # Include the new column
]

# Columns to include in the combined list.
columns_to_include = naive_class.header

# Names absent from the table are still skipped (as before), but say so:
# otherwise the header silently disagrees with the feature matrix.
missing_columns = [col for col in columns_to_include if col not in table.column_names]
if missing_columns:
    print(f"Warning: feature columns not found in table and skipped: {missing_columns}")

# Extract column indices and construct the combined list (feature matrix).
column_indices = [table.column_names.index(col) for col in columns_to_include if col in table.column_names]
combined_list = [[row[idx] for idx in column_indices] for row in table.data]

# Perform analysis with all four classifiers on the same features and target.
myutils.perform_analysis(combined_list, target, knn_classifier, dummy_classifier, naive_class, tree_classifier)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 3407: character maps to <undefined>

# Introduction:


This section must briefly describe the dataset you used and the classification task you implemented (e.g., what were you trying to classify in the dataset). You should also briefly describe your findings (e.g., what classifier approach performed the best).

# Data Analysis:

1. Information about the dataset itself, e.g., the attributes and attribute types, the number of instances, and the attribute being used as the label.

2. Relevant summary statistics about the dataset.

3. Data visualizations highlighting important/interesting aspects of your dataset. Visualizations may include frequency distributions, comparisons of attributes (scatterplot, multiple frequency diagrams), box and whisker plots, etc. The goal is not to include all possible diagrams, but instead to select and highlight diagrams that provide insight about the dataset itself.

4. Note that this section must describe the above (in paragraph form) and not just provide diagrams and statistics. Also, each figure included must have a figure caption (Figure number and textual description) that is referenced from the text (e.g., “Figure 2 shows a frequency diagram for ...”).

# Classification Results: 

This section should describe the classification approach you developed and its performance. Explain what techniques you used, briefly how you designed and implemented the classifiers, how you evaluated your classifiers’ predictive ability, and how well the classifiers performed. Thoroughly describe how you evaluated performance, the comparison results, and which classifier is “best”.


# Classification Web App 

Create a Flask web app that deploys this “best” classifier behind an API. For the base project (i.e., not the bonus), your web app only has to run locally. Part of the bonus is to deploy your web app:

1. BONUS (5 pts): Deploy your Flask web app to a free hosting service such as Render (you do not have to use Render, you may use a different service if you wish). In your repo README.md and your project report, include a link to a deployed web app hosting your Flask app.
2. BONUS (3 pts): Add a user interface to your Flask web app on the index/homepage. The interface should allow the user to enter in attribute values for an unseen instance via a form, press a “Predict” button, and see the prediction for the instance. See the completed Flask-App-Demo repo on Github for a template of how to do this with Flask and a POST request (I will post this after we cover it in class).

# Conclusion: 
Provide a brief conclusion of your project, including a short summary of the dataset you used (and any of its inherent challenges for classification), the classification approach you developed, your classifiers’ performance, and any ideas you have on ways to improve its performance. 

# Acknowledgments: 
This is where you should cite your sources, including any data, code, or materials that are outside of the scope of CPSC 322 (including previous course projects) that you used. As per the course syllabus, you also need to acknowledge any use of AI.