# Data Cleaning

In [None]:
!pip install numpy==1.23.5 scikit-learn==1.2.2

In [None]:
# Import
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import csv
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the raw table and discard the 'Unnamed: 0' column
# (presumably a row index saved by an earlier export -- confirm).
data = pd.read_csv('complete_nan.csv').drop(columns=['Unnamed: 0'])

# Quick look at the frame and its column labels.
print(data)
print(data.columns)

In [None]:
combine_tests = ['40yd','Vertical', 'Bench', 'Broad Jump', '3Cone', 'Shuttle']
for col in combine_tests:
    data[f'{col}_done'] = data[col].notna().astype(int)
print(data)

In [None]:
def _fill_with_group_mean(values):
    """Return *values* with NaNs replaced by the mean of its non-missing entries.

    When every entry is missing the mean is NaN, so the values stay NaN.
    """
    return values.fillna(values.mean())


# Fill each combine test with the mean of the rows sharing the same pos_abbr.
for test_name in combine_tests:
    data[test_name] = data.groupby('pos_abbr')[test_name].transform(_fill_with_group_mean)
print(data)

In [None]:
# Positions of the rows whose 'Broad Jump' is still NaN at this point.
broad_jump_missing = data['Broad Jump'].isna()
print(data.loc[broad_jump_missing, 'pos_abbr'].unique())

In [None]:
# For every combine test, the positions that still have rows with a missing
# value. The comprehension is evaluated in full before the loop below starts
# filling values in.
stat_missing_group = {
    stat_name: data.loc[data[stat_name].isna(), 'pos_abbr'].unique()
    for stat_name in ('40yd', 'Vertical', 'Bench', 'Broad Jump', '3Cone', 'Shuttle')
}

# Predictors used by the regression for every test.
feature_cols = ['weight', 'height']

for stat, missing in stat_missing_group.items():
    # Fit one linear model per test on all rows that have a value for it.
    observed = data[data[stat].notna()]
    model = LinearRegression()
    model.fit(observed[feature_cols], observed[stat])

    # Fill the remaining gaps position by position with the model's prediction.
    for pos in missing:
        needs_value = (data['pos_abbr'] == pos) & data[stat].isna()
        to_impute = data[needs_value]
        if to_impute.empty:
            continue
        data.loc[needs_value, stat] = model.predict(to_impute[feature_cols])

In [None]:
# List the column labels currently present in the frame.
column_labels = data.columns
print(column_labels)

In [None]:
# Stat columns whose missing values are replaced with 0 by the fillna(0) call
# that follows. Presumably NaN here means the player never recorded that
# stat -- confirm against the data source.
columns_to_zero = ['Assist Tackles', 'Completion Percentage', 'Completions',
       'Extra Points Made', 'FGM 1-19 yards', 'FGM 20-29 yards',
       'FGM 30-39 yards', 'FGM 40-49 yards', 'FGM 50+ yards',
       'Field Goal Attempts', 'Field Goal Made', 'Field Goal Percentage',
       'Field Goals', 'Forced Fumbles', 'Gross Average Punt Yards',
       'Interception Touchdowns', 'Interception Yards', 'Interceptions',
       'Kick Extra Points', 'Kick Return Touchdowns', 'Kick Return Yards',
       'Kick Returns', 'Long Field Goal Made', 'Long Kick Return', 'Long Punt',
       'Long Punt Return', 'Long Reception', 'Long Rushing', 'Longest Pass',
       'Passer Rating', 'Passes Defended', 'Passing Attempts',
       'Passing Touchdowns', 'Passing Yards', 'Punt Return Fair Catches',
       'Punt Return Touchdowns', 'Punt Return Yards', 'Punt Returns',
       'Punt Yards', 'Punts', 'Receiving Touchdowns', 'Receiving Yards',
       'Receptions', 'Return Touchdowns', 'Rushing Attempts',
       'Rushing Touchdowns', 'Rushing Yards', 'Sacks', 'Solo Tackles',
       'Total Kicking Points', 'Total Sacks', 'Total Tackles',
       'Total Touchdowns', 'Total Two Point Conversions',
       'Yards Per Pass Attempt', 'Yards Per Reception',
       'Yards Per Rush Attempt', 'career_avg_total_qbr', 'career_avg_qb_plays',
       'career_avg_total_epa', 'career_avg_pass', 'career_avg_run',
       'career_avg_exp_sack', 'career_avg_penalty', 'career_avg_raw_qbr',
       'career_avg_sack', 'career_sum_points_added']

In [None]:
# Replace the remaining NaNs in the listed stat columns with 0.
zero_filled = data[columns_to_zero].fillna(0)
data[columns_to_zero] = zero_filled

In [None]:
# Rows that still contain at least one missing value.
rows_with_nan = data[data.isna().any(axis=1)]

# Boolean frame: True where the corresponding cell of rows_with_nan is NaN.
columns_with_nan_per_row = rows_with_nan.isna()

# Summarise which columns are responsible. The earlier per-row iterrows()
# loop built this list for each row and then discarded it, so the inspection
# never showed anything; a vectorised count reports it in one pass.
nan_counts = columns_with_nan_per_row.sum()
nan_columns = nan_counts[nan_counts > 0].index.tolist()
print(f'{len(rows_with_nan)} rows still contain NaN; affected columns: {nan_columns}')

# These rows are missing scouting reports. Scouting reports are the whole
# premise of this work, so unless better scouting data becomes available
# these rows are not included.

In [None]:
# Keep only the rows that have a value in every column.
cleaned_data = data.dropna(axis=0)

# Sanity flag: True if any NaN survived the drop (expected to be False).
column_has_nan = cleaned_data.isna().any()
has_nan = column_has_nan.any()


In [None]:
# Spot check: rows with pos_abbr 'LS' and draft_year 2015 that survived cleaning.
is_ls = cleaned_data['pos_abbr'] == 'LS'
drafted_2015 = cleaned_data['draft_year'] == 2015
print(cleaned_data[is_ls & drafted_2015])

In [None]:
# Snapshot of the identifying columns, taken before positions are merged and
# draft_year is removed, so it keeps the original ILB/OLB labels.
# NOTE(review): confirm that keeping the unmerged labels here is intended.
player_id_mapping = cleaned_data[['player_id', 'player_name', 'pos_abbr', 'draft_year']]

# Fold inside and outside linebackers into a single 'LB' position.
# assign() returns a new frame rather than writing a column into the result
# of dropna(), which can raise SettingWithCopyWarning on pandas releases
# without copy-on-write.
cleaned_data = cleaned_data.assign(
    pos_abbr=cleaned_data['pos_abbr'].replace({'ILB': 'LB', 'OLB': 'LB'})
)

# Number of rows per (merged) position.
pos_abbr_counts = cleaned_data['pos_abbr'].value_counts()

# draft_year is kept only in player_id_mapping from here on.
cleaned_data = cleaned_data.drop(columns=['draft_year'])

In [None]:
# Show the distinct position labels present in the cleaned frame.
remaining_positions = cleaned_data['pos_abbr'].unique()
print(remaining_positions)

In [None]:
# Persist the cleaned table; the row index is not written to the file.
cleaned_data.to_csv('cleaned_data.csv', index=False)

In [None]:
unique_positions = cleaned_data['pos_abbr'].unique()

# Position label -> min-max scaled frame for that position.
# quant_map frames end with 'player_id' (when present); qual_map frames are the
# same plus a trailing 'scouting' column (when present).
quant_map = {}
qual_map = {}

for p in unique_positions:
    # Work on the rows of one position. The frame is deliberately not called
    # `filter`, which would shadow the builtin for the rest of the notebook.
    # drop() returns a new frame, so the edits below never touch cleaned_data.
    position_df = cleaned_data[cleaned_data['pos_abbr'] == p]
    position_df = position_df.drop(columns=['player_name', 'pos_abbr'])

    # Set aside the columns that must not be scaled; they are re-attached
    # afterwards, which also places them last in the column order.
    scouting_col = position_df.pop('scouting') if 'scouting' in position_df.columns else None
    player_ids = position_df.pop('player_id') if 'player_id' in position_df.columns else None

    # Columns whose non-null values are all 0 or 1 are treated as indicator
    # flags and left unscaled; every other numeric column is scaled.
    indicator_cols = [
        name for name in position_df.columns
        if position_df[name].dropna().isin([0, 1]).all()
    ]
    numeric_cols = position_df.select_dtypes(include=['number']).columns.difference(indicator_cols)

    # Scaling is fitted per position, so values are relative to position peers.
    scaler = MinMaxScaler()
    if len(numeric_cols) > 0:
        # Guard: fitting the scaler on a frame with no columns raises.
        position_df[numeric_cols] = scaler.fit_transform(position_df[numeric_cols])

    if player_ids is not None:
        position_df['player_id'] = player_ids

    quant_map[p] = position_df.copy()

    if scouting_col is not None:
        position_df['scouting'] = scouting_col

    qual_map[p] = position_df.copy()


In [None]:
# Stack the per-position quantitative frames into one table, tagging every row
# with its position in a leading 'key' column.
quant_list = []
for position, scaled_frame in quant_map.items():
    labelled = scaled_frame.copy()  # leave the frames stored in quant_map untouched
    labelled.insert(loc=0, column='key', value=position)
    quant_list.append(labelled)

final_quant_df = pd.concat(quant_list)
final_quant_df.to_csv('quant_map.csv', index=False)

In [None]:
# Stack the per-position qualitative frames into one table, tagging every row
# with its position in a leading 'key' column.
qual_list = []
for position, scaled_frame in qual_map.items():
    labelled = scaled_frame.copy()  # leave the frames stored in qual_map untouched
    labelled.insert(loc=0, column='key', value=position)
    qual_list.append(labelled)

final_qual_df = pd.concat(qual_list)
final_qual_df.to_csv('qual_map.csv', index=False)

In [None]:
# Persist the id/name/position/draft-year lookup table.
# NOTE(review): unlike the other exports in this notebook, the row index is
# written here and will appear as an unnamed first column in the CSV --
# confirm that downstream readers rely on it before switching to index=False.
player_id_mapping.to_csv('player_id_mapping.csv', index=True)