In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
#Constants shared by every cell below.
#Seasons covered by the analysis (2011 through 2022 inclusive).
years = list(range(2011, 2023))

#Schools grouped into three size tiers. The names are interpolated verbatim
#into the data file paths, so the spelling here must match the folder names.
large_schools = ['Alabama', 'Auburn', 'Florida', 'Georgia', 'LSU', 'Michigan', 'Notre Dame', 'Ohio State', 'Oklahoma', 'Oregon', 'Penn State', 'Texas', 'USC', 'Wisconsin']
medium_schools = ['Arizona', 'Arizona State', 'Arkansas', 'Boston College', 'BYU', 'California', 'Clemson', 'Colorado', 'Florida State', 'Georgia Tech', 'Iowa', 'Iowa State', 'Kansas', 'Kansas State', 'Kentucky', 'Louisville', 'Maryland', 'Miami', 'Michigan State', 'Minnesota', 'Mississippi State', 'Missouri', 'Nebraska', 'North Carolina', 'North Carolina State', 'Northwestern', 'Oklahoma State', 'Oregon State', 'Pittsburgh', 'Purdue', 'Rutgers', 'South Carolina', 'Stanford', 'Syracuse', 'TCU', 'Tennessee', 'Texas A&M', 'Texas Tech', 'UCLA', 'Utah', 'Virginia', 'Virginia Tech', 'Washington', 'Washington State', 'West Virginia']
#NOTE(review): 'UTah State' looks like a typo for 'Utah State'. The casing is
#kept as-is because it feeds the file paths - confirm against the folder on disk.
#The entry used to be listed twice, which loaded the same files twice.
small_schools = ['Air Force', 'Akron', 'Appalachian State', 'Arkansas State', 'Army', 'Ball State', 'Boise State', 'Bowling Green', 'Buffalo', 'Central Michigan', 'Charlotte', 'Cincinnati', 'Coastal Carolina', 'Colorado State', 'Connecticut', 'Duke', 'Eastern Michigan', 'East Carolina', 'Florida Atlantic', 'Florida International', 'Fresno State', 'Georgia Southern', 'Georgia State', 'Hawaii', 'Houston', 'Illinois', 'Indiana', 'Kent State', 'Liberty', 'Louisiana', 'Louisiana Tech', 'Louisiana-Monroe', 'Marshall', 'Memphis', 'Miami (OH)', 'Middle Tennessee', 'Navy', 'Nevada', 'New Mexico', 'New Mexico State', 'North Texas', 'Northern Illinois', 'Ohio', 'Old Dominion', 'Rice', 'San Diego State', 'San Jose State', 'SMU', 'South Alabama', 'South Florida', 'Southern Miss', 'Temple', 'Texas State', 'Toledo', 'Troy', 'Tulane', 'Tulsa', 'UAB', 'UCF', 'UNLV', 'UTEP', 'UTSA', 'UTah State', 'Western Kentucky', 'Western Michigan', 'Wyoming']

#Every school exactly once, in tier order; dict.fromkeys drops any repeated
#name while preserving the original ordering.
all_schools = list(dict.fromkeys(large_schools + medium_schools + small_schools))


colleges = all_schools

In [None]:
#Load all college receiving data and collapse it to one career row per player.
#Files are read from data/college_reciever_data/{college}/cfbd_recieving_data_{year}.csv
#(the "reciever"/"recieving" spelling is the one the read below actually uses).
season_frames = []
#dict.fromkeys drops repeated school names so no file is loaded twice
for college in dict.fromkeys(colleges):
    for year in years:
        csv_path = f'data/college_reciever_data/{college}/cfbd_recieving_data_{year}.csv'
        try:
            season = pd.read_csv(csv_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            #Missing or empty file for this school/season: skip it. Any other
            #error (bad permissions, malformed CSV) is left to surface.
            continue
        #Tag every row with the season and school it was loaded for
        season_frames.append(season.assign(year=year, college=college))

if not season_frames:
    raise FileNotFoundError('No college receiving CSVs found under data/college_reciever_data/')

#Single concat instead of growing a DataFrame inside the loop
receiving_long = pd.concat(season_frames)

#One row per player-season, one column per statType (LONG, REC, TD, YDS, YPR, ...)
receiving_by_season = receiving_long.pivot_table(
    index=['playerId', 'player', 'college', 'conference', 'year'],
    columns='statType',
    values='stat',
).reset_index()

#Missing season stats count as zero; these four are whole-number stats
counting_stats = ['LONG', 'REC', 'TD', 'YDS']
receiving_by_season[counting_stats] = receiving_by_season[counting_stats].fillna(0).astype(int)

#Order seasons chronologically so 'last' below means "most recent season"
receiving_by_season = receiving_by_season.sort_values(['playerId', 'year'])

#Career totals per player:
# - REC/TD/YDS are summed across seasons
# - LONG is the career-long reception, so it is the max (summing it is meaningless)
# - player/college/conference come from the most recent season, so a transfer is
#   attributed to the school they finished at
# - years_in_college counts the distinct seasons with receiving stats
career = receiving_by_season.groupby('playerId', as_index=False).agg(
    player=('player', 'last'),
    college=('college', 'last'),
    conference=('conference', 'last'),
    LONG=('LONG', 'max'),
    REC=('REC', 'sum'),
    TD=('TD', 'sum'),
    YDS=('YDS', 'sum'),
    years_in_college=('year', 'nunique'),
)

#Career yards-per-reception is recomputed from the totals (per-season YPR values
#cannot be added together); players with no receptions get 0
career['YPR'] = (career['YDS'] / career['REC'].where(career['REC'] > 0)).fillna(0).round(1)

#Final column order, player renamed to player_name, sorted by highest YDS
all_college_receiving = (
    career[['playerId', 'player', 'college', 'conference', 'LONG', 'REC', 'TD', 'YDS', 'YPR', 'years_in_college']]
    .rename(columns={'player': 'player_name'})
    .sort_values(by='YDS', ascending=False)
    .reset_index(drop=True)
)

all_college_receiving.head()
    

In [None]:
#Load all draft data from data/draft_data/{college}/cfbd_draft_data_{year}.csv
draft_frames = []
#dict.fromkeys drops repeated school names so no draft file is loaded twice
#(a repeated file would duplicate every drafted player after the merge)
for college in dict.fromkeys(colleges):
    for year in years:
        csv_path = f'data/draft_data/{college}/cfbd_draft_data_{year}.csv'
        try:
            draft_class = pd.read_csv(csv_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            #Missing or empty file for this school/year: skip it. Any other
            #error is left to surface instead of being silently swallowed.
            continue
        #Tag every row with the draft year and school it was loaded for
        draft_frames.append(draft_class.assign(draftYear=year, college=college))

if not draft_frames:
    raise FileNotFoundError('No draft CSVs found under data/draft_data/')

#Single concat instead of growing a DataFrame inside the loop
all_college_draft = pd.concat(draft_frames)

#Join career receiving stats to draft records: playerId must equal the draft
#file's collegeAthleteId and the school must match. Inner join keeps only
#players present in both tables.
#NOTE(review): no filter on 'position' is applied, so every drafted player with
#receiving stats is kept - confirm that matches the "Wide Receiver" plot titles.
all_college_data = pd.merge(all_college_receiving, all_college_draft, left_on=['playerId','college'], right_on=['collegeAthleteId','college'], how='inner')

#School-size tier per school; large is applied last so it wins on any overlap,
#and schools in neither list default to 'small'
size_by_school = {**dict.fromkeys(medium_schools, 'medium'), **dict.fromkeys(large_schools, 'large')}

#Final column order: 'name' first, school_size right after college.
#'draftYear' appears once (it used to be selected twice, duplicating the column).
column_order = ['name', 'playerId', 'nflAthleteId', 'college', 'school_size', 'nflTeam', 'conference', 'LONG', 'REC', 'TD', 'YDS', 'YPR', 'years_in_college', 'draftYear', 'position', 'height', 'weight', 'round', 'pick', 'overall', 'preDraftRanking','preDraftPositionRanking','preDraftGrade','hometownInfo']

#assign() builds a new frame, so nothing is written onto a column-sliced view
all_college_data = all_college_data.assign(
    school_size=all_college_data['college'].map(size_by_school).fillna('small')
)[column_order]


all_college_data[all_college_data['name'] == 'Justin Jefferson']



In [None]:
#Horizontal count plot: drafted players per school, bars coloured by school size.
#Schools are ordered from most to fewest drafted players.
schools_by_draft_count = all_college_data['college'].value_counts().index
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=all_college_data, y='college', hue='school_size', order=schools_by_draft_count, ax=ax)
ax.set_title('Number of Drafted Wide Receiver Players from Each School by School Size')
ax.set_xlabel('Number of Players Drafted')
ax.set_ylabel('School')
plt.show()

#Pie chart: share of drafted players coming from each school size
size_counts = all_college_data['school_size'].value_counts()
fig, ax = plt.subplots(figsize=(15, 10))
size_counts.plot.pie(autopct='%1.1f%%', ax=ax)
ax.set_title('Percentage of Drafted Wide Receiver Players by School Size')
#Blank out the y-label that pandas adds to pie charts
ax.set_ylabel('')
plt.show()


#Raw counts behind the pie chart
print(size_counts)



In [None]:
#Load one NFL receiving CSV per season and tag each row with its season.
#The frames are collected in a list and concatenated once, rather than growing
#a DataFrame inside the loop.
nfl_season_frames = [
    pd.read_csv(f'data/nfl_recieving_data/cfbd_recieving_data_{year}.csv').assign(year=year)
    for year in years
]

#Rename the raw columns to nfl_-prefixed names so they cannot collide with
#the college stat columns after merging
nfl_seasons = pd.concat(nfl_season_frames).rename(columns={'Player':'name', 'Rec':'nfl_REC', 'Yds':'nfl_YDS', 'TD':'nfl_TD', 'LNG':'nfl_LNG', 'Rec FUM':'nfl_Rec_FUM','Rec YAC/R':'nfl_Rec_YAC/R','Tgts':'nfl_TGTS', 'year':'nfl_Year'})

#Collapse to one career row per player name: counting stats are summed, the
#longest reception is the max, YAC/R is averaged over seasons, and the number
#of season rows becomes years_in_nfl.
#NOTE(review): grouping is by name only, so two different players who share a
#name are merged into one row.
all_nfl_data = (
    nfl_seasons
    .groupby('name', as_index=False)
    .agg({'nfl_REC':'sum','nfl_YDS':'sum','nfl_TD':'sum','nfl_LNG':'max','nfl_Rec_FUM':'sum','nfl_Rec_YAC/R':'mean','nfl_TGTS':'sum','nfl_Year':'count'})
    .rename(columns={'nfl_Year':'years_in_nfl'})
)

In [None]:
#Inner-join the college/draft table to the NFL career table on the shared
#'name' column; only players found in both tables are kept
all_data = all_college_data.merge(all_nfl_data, how='inner', on='name')

#Column listing of the merged table
all_data.columns

#Rows for Justin Jefferson
all_data.loc[all_data['name'].eq('Justin Jefferson')]

#Non-null count per column (last expression, so this is what the cell shows)
all_data.count()


In [None]:
#Scatter plot: x is years in the NFL, y is career NFL receiving yards.
#Colours are pinned explicitly (large=green, medium=blue, small=red) so they
#match this description and do not depend on the row order of all_data.
size_palette = {'large': 'green', 'medium': 'blue', 'small': 'red'}
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    data=all_data,
    x='years_in_nfl',
    y='nfl_YDS',
    hue='school_size',
    hue_order=list(size_palette),
    palette=size_palette,
    ax=ax,
)
ax.set_title('Years in NFL vs NFL Yards by School Size')
ax.set_xlabel('Years in NFL')
ax.set_ylabel('NFL Yards')
plt.show()



#Top 5 players by career NFL receiving yards
all_data_print = all_data.sort_values(by='nfl_YDS', ascending=False).reset_index(drop=True)
all_data_print.head()



In [None]:

#Count plot of draft round, with one bar per school size inside each round
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=all_data, x='round', hue='school_size', ax=ax)
ax.set_title('Number of Players Drafted in Each Round by School Size')
ax.set_xlabel('Round Drafted')
ax.set_ylabel('Number of Players Drafted')
plt.show()




In [None]:
#Bar plot of career NFL yards per school size; sns.barplot aggregates the
#rows of each category with its default estimator (the mean)
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=all_data, x='school_size', y='nfl_YDS', ax=ax)
ax.set_title('Average NFL Yards by School Size')
ax.set_xlabel('School Size')
ax.set_ylabel('Average NFL Yards')
plt.show()


In [None]:
#Scatter plot: x is years in the NFL, y is career NFL receiving touchdowns.
#Colours are pinned explicitly (large=green, medium=blue, small=red) so they
#match this description and do not depend on the row order of all_data.
size_palette = {'large': 'green', 'medium': 'blue', 'small': 'red'}
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    data=all_data,
    x='years_in_nfl',
    y='nfl_TD',
    hue='school_size',
    hue_order=list(size_palette),
    palette=size_palette,
    ax=ax,
)
ax.set_title('Years in NFL vs NFL TD by School Size')
ax.set_xlabel('Years in NFL')
ax.set_ylabel('NFL TD')
plt.show()



#Top 5 players by career NFL touchdowns. A sorted copy is displayed; all_data
#itself is left untouched so re-running this cell changes nothing downstream.
all_data.sort_values(by='nfl_TD', ascending=False).reset_index(drop=True).head()


In [None]:
#Scatter plot: x is years in the NFL, y is career NFL receptions.
#Colours are pinned explicitly (large=green, medium=blue, small=red) so they
#match this description and do not depend on the row order of all_data.
size_palette = {'large': 'green', 'medium': 'blue', 'small': 'red'}
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    data=all_data,
    x='years_in_nfl',
    y='nfl_REC',
    hue='school_size',
    hue_order=list(size_palette),
    palette=size_palette,
    ax=ax,
)
ax.set_title('Years in NFL vs NFL REC by School Size')
ax.set_xlabel('Years in NFL')
ax.set_ylabel('NFL REC')
plt.show()




#Top 5 players by career NFL receptions. A sorted copy is displayed; all_data
#itself is left untouched so re-running this cell changes nothing downstream.
all_data.sort_values(by='nfl_REC', ascending=False).reset_index(drop=True).head()

In [None]:
#Player value = (YDS*W1 + YAC/R*W2 + TD*W3 + YEARS_PLAYED*W4 + REC*W5 + FUM*W6) / 5
#W6 is negative, so fumbles lower the score.
#NOTE(review): the divisor is 5 although six terms are summed; it only rescales
#every score by the same constant, so comparisons between players are unaffected.
W1 = .3
W2 = 0.25
W3 = 0.2
W4 = 0.15
W5 = 0.1
W6 =-0.1

all_data['player_value'] = (all_data['nfl_YDS']*W1 + all_data['nfl_Rec_YAC/R']*W2 + all_data['nfl_TD']*W3 + all_data['years_in_nfl']*W4 + all_data['nfl_REC']*W5 + all_data['nfl_Rec_FUM']*W6)/5


#MAKE NUMBER OF LARGE, MEDIUM, SMALL CATEGORIES DATA ENTRIES THE SAME
#Fixed seed so the resampled comparison is identical on every run
BALANCE_SEED = 42

large_rows = all_data[all_data['school_size'] == 'large']
medium_rows = all_data[all_data['school_size'] == 'medium']
small_data = all_data[all_data['school_size'] == 'small']

#Number of players in each school size
large_count = len(large_rows)
medium_count = len(medium_rows)
small_count = len(small_data)

#Resample large and medium to the size of the small group. Rows are only
#drawn with replacement when a group has fewer rows than the target, so
#downsampling never duplicates players.
large_data = large_rows.sample(n=small_count, replace=large_count < small_count, random_state=BALANCE_SEED)
medium_data = medium_rows.sample(n=small_count, replace=medium_count < small_count, random_state=BALANCE_SEED)

#Combine the balanced groups
all_data_test = pd.concat([large_data, medium_data, small_data])


#Mean player value per school size (index sorted alphabetically)
mean_value_by_size = all_data_test.groupby('school_size')['player_value'].mean()
#reindex gives NaN, not a KeyError, if a size has no rows
small_value_avg, medium_value_avg, large_value_avg = mean_value_by_size.reindex(['small', 'medium', 'large'])

#Bar chart of the three averages
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=['small', 'medium', 'large'], y=[small_value_avg, medium_value_avg, large_value_avg], ax=ax)
ax.set_title('Average Player Value by School Size')
ax.set_xlabel('School Size')
ax.set_ylabel('Average Player Value')
plt.show()

#Print average player value by school size
print(mean_value_by_size)


#Scatterplot of player value vs school they went to
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x='player_value', y='college', data=all_data_test, hue='school_size', ax=ax)
ax.set_title('Player Value vs College')
ax.set_xlabel('Player Value')
ax.set_ylabel('College')
plt.show()

#Top 20 rows by player value. Kept as the last expression so the notebook
#actually displays the table (a bare expression mid-cell is not shown).
all_data_print = all_data_test.sort_values(by='player_value', ascending=False).reset_index(drop=True)
all_data_print.head(20)



In [None]:
#Relative difference in mean player value between school sizes, as a percentage
value_by_size = {
    size: all_data_test.loc[all_data_test['school_size'] == size, 'player_value'].mean()
    for size in ('small', 'large', 'medium')
}
small_value_avg = value_by_size['small']
large_value_avg = value_by_size['large']
medium_value_avg = value_by_size['medium']

#Each figure is (a - b) / b * 100: how far group a sits above (or below) group b
large_vs_small = ((large_value_avg - small_value_avg)/small_value_avg)*100
medium_vs_small = ((medium_value_avg - small_value_avg)/small_value_avg)*100
large_vs_medium = ((large_value_avg - medium_value_avg)/medium_value_avg)*100

print(f'Large schools are {large_vs_small}% better than small schools')
print(f'Medium schools are {medium_vs_small}% better than small schools')
print(f'Large schools are {large_vs_medium}% better than medium schools')


In [None]:
#Show the column names of the merged college + NFL table
all_data.columns

In [None]:
#Train a model to predict school size from NFL production stats

#Fixed seed so the train/test split (and therefore the accuracy) is reproducible
MODEL_SEED = 42

#PREPROCESSING: encode school size as an integer class label.
#The labels are built as a separate Series; all_data_test keeps its string
#school_size column, so earlier cells that filter on 'small'/'medium'/'large'
#still work when re-run after this one.
SIZE_CODES = {'small': 0, 'medium': 1, 'large': 2}

#Set X and y
features = ['player_value', 'nfl_YDS', 'nfl_Rec_YAC/R', 'nfl_TD', 'years_in_nfl', 'nfl_REC', 'nfl_Rec_FUM']
X = all_data_test[features]
#astype(int) fails loudly if any school_size value is not one of the three labels
y = all_data_test['school_size'].map(SIZE_CODES).astype(int)

#Split the data: 80% train / 20% test, stratified so each school size keeps
#the same share in both parts.
#NOTE(review): if all_data_test contains rows repeated by sampling with
#replacement, copies of one player can land in both train and test and
#inflate the accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=MODEL_SEED, stratify=y)

#Train the model. The features are unscaled (yards run into the thousands), so
#the solver gets a larger iteration budget than the default to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

#Predict the test data
y_pred = model.predict(X_test)

#Print the accuracy
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')



