In [1]:
# Import libraries 
from sqlalchemy import create_engine 
from dateutil.relativedelta import relativedelta
from datetime import datetime 
import sqlalchemy as sql 
import pandas as pd 
import numpy as np
import itertools
import psycopg2 
import pymysql
import re 

Connection to BP data base established

In [2]:
# Create PostgreSQL engine
engine = create_engine('postgresql+psycopg2://stager:after!ngest@172.104.15.153:5432/postgres')

# Connect to engine 
conn = engine.connect()

In [3]:
# Obtain sample of ten players 
result = conn.execute("select * from euston.player")

In [4]:
# Convert sample to data frame, use keys as column names
df = pd.DataFrame(result)
df.columns = result.keys()
df

Unnamed: 0,bpid,ml_srv,agent,cots_lengthval,cots_details
0,31317,7.076,Matt Sosnick,1 year (2013),1 year (2013). Signed by Chicago Cubs as a fre...
1,16636,8.158,Jonathan Maurer,1 year/$3M (2011),1 year/$3M (2011). Signed by Texas as a free a...
2,31384,10.154,Greg Landry,"1 year/$10M (2014), 2015 option","1 year/$10M (2014), plus 2015 vesting option. ..."
3,43032,8.116,Ryan Ware,1 year (2014),1 year (2014). Signed by Washington as a free ...
4,37412,12.040,ESQ Agency,1 year (2019),1 year (2019). Re-signed by Oakland as a free ...
...,...,...,...,...,...
5880,108053,0.000,,1 year (2019),1 year (2019). Contract selected by NY Yankees...
5881,102207,0.000,,2020,2020. Contract selected by LA Dodgers 10/31/19...
5882,101184,0.000,,2020,2020. Contract selected by Milwaukee 11/2/19.;...
5883,102612,0.000,,2020,2020. Contract selected by Oakland 11/2/19.; A...


In [5]:
# Close connection
conn.close()

Dictionary with team names and abbreviations for data cleaning 

In [16]:
# Creates dictionary with team names for easy identification 
teams = {'Arizona':'ARI', 'ARZ':'ARI', 'Atlanta':'ATL', 'Baltimore':'BAL', 'Boston':'BOS', 'Chicago Cubs':'CHN', 'CHC':'CHN',
        'Chicago White Sox':'CHA', 'CHW':'CHA', 'CWS':'CHA', 'Cincinnati':'CIN', 'Cleveland':'CLE', 'Colorado':'COL', 'Detroit':'DET', 
        'DT':'DET', 'Florida':'MIA', 'FLA':'MIA', 'Houston':'HOU', 'Kansas City':'KCA', 'KC':'KCA', 
        'LA Angels':'LAA', 'LA Dodgers':'LAN', 'LAD':'LAN', 'Miami':'MIA', 'Milwaukee':'MIL', 'Minnesota':'MIN', 
        'Montreal':'MON', 'NY Mets':'NYN', 'NYM':'NYN', 'NY Yankees':'NYA', 'NYY':'NYA', 
        'Oakland':'OAK', 'Philadelphia':'PHI', 'Pittsburgh':'PIT', 'St. Louis':'SLN', 'STL':'SLN', 'San Diego':'SDN', 
        'SD':'SDN', 'San Francisco':'SFN', 'Seattle':'SEA', 'Tampa Bay':'TBA', 'TB':'TBA', 
        'Texas':'TEX', 'TX':'TEX', 'Toronto':'TOR', 'Washington':'WAS'}

Functions created that will clean the data 

In [17]:
def replace_team(txt):
    """ 
    Function that replaces team name with team abbreviation using key, value pairs in teams dictionary. 
    """
    rep = dict((re.escape(k), v) for k, v in teams.items())
    pattern = re.compile("|".join(rep.keys()))
    text = pattern.sub(lambda x: rep[re.escape(x.group(0))], txt)
    
    return text

In [18]:
def team_date_id(ls): 
    """
    Function that extracts team_ids and contract dates, returning them as a data frame  
    """
    team_id_pattern = '[A-Z]{3}'
    team_df = pd.DataFrame([re.findall(team_id_pattern, l) for l in ls])

    date_pattern = '(\d{1,}\/\d{1,}.?\d+)'
    date_df = pd.DataFrame([re.findall(date_pattern, l) for l in ls])
    
    base_years = '^\d{1,}'
    base_years_df = pd.DataFrame([re.findall(base_years, l) for l in ls])
    
    max_years = 'option'
    max_years_df = pd.DataFrame([len(re.findall('(plus \d{4} club|mutual|vesting|player option)', i)) for i in ls]) 
    
    first_season = '\(\d{2,}\-?\d{2,}?\)?'
    first_season_df = pd.DataFrame([re.findall(first_season, l) for l in ls])
    
    df = pd.concat([date_df, base_years_df, max_years_df, team_df, first_season_df], axis=1)
    df.columns = ['signed_date', 'duration_years_base', 'duration_years_max', 'signing_org', 'first_season']
      
    df.first_season.replace(regex=True, inplace=True, to_replace='\(|\-\d{2,}|\)', value=r'')
    for i in range(len(df)):
        try: 
            if len(df.loc[i, 'first_season'])==2: 
                df.loc[i, 'first_season'] = '20'+df.loc[i, 'first_season']
        except:
            pass 
    
    df['duration_years_max'] = df.duration_years_max.astype(int) + df.duration_years_base.astype(int)
    
    date_pattern1 = r'^\d{1,}\/\d{2}$'
    date_pattern2 = r'^\d{1,}\/\d{4}$'
    date_pattern3 = r'^\d{1,}\/\d{1,}\/\d{4}$'
    date_pattern4 = r'^\d{1,}\/\d{1,}\/\d{2}$'
    #date_pattern4 = r'^\d{1,}\/\d{1,}\s\d{2,}$'
    
    # Clean dates 
    for i in range(len(df)): 
        if re.match(date_pattern1, df.loc[i, 'signed_date']) != None: 
            df.loc[i, 'signed_date'] = datetime.strptime(df.loc[i, 'signed_date'].strip(), "%m/%y").strftime("%Y-%m-%d")
        elif re.match(date_pattern2, df.loc[i, 'signed_date']) != None:
            df.loc[i, 'signed_date'] = datetime.strptime(df.loc[i, 'signed_date'].strip(), "%m/%Y").strftime("%Y-%m-%d")
        elif re.match(date_pattern3, df.loc[i, 'signed_date']) != None:
            df.loc[i, 'signed_date'] = datetime.strptime(df.loc[i, 'signed_date'], "%m/%d/%Y").strftime("%Y-%m-%d")
        elif re.match(date_pattern4, df.loc[i, 'signed_date']) != None: 
            df.loc[i, 'signed_date'] = datetime.strptime(df.loc[i, 'signed_date'], "%m/%d/%y").strftime("%Y-%m-%d")
        else: 
            df.loc[i, 'signed_date'] = datetime.strptime(df.loc[i, 'signed_date'], "%m/%d %y").strftime("%Y-%m-%d")
            
    df['signed_date'] = pd.to_datetime(df.signed_date)
    df = df.drop_duplicates(subset='signed_date', keep='first')
    df = df.sort_values('signed_date', ascending=False).reset_index()
    
    for i in range(len(df)):
        if type(df.loc[i, 'first_season']==None):
            df.loc[i, 'first_season'] = df.loc[i, 'signed_date'].year
    
    return df

In [19]:
def terms(df, txt, pattern1, pattern2):
    """
    Function that takes as input a data frame, series, and three patterns and returns a data frame with the following information:
    * contract_id
    * player_id 
    * date contract was signed 
    * years contract covers
    * salary to be paid over contract
    * original organization
    * current organization 
    """ 
    # Store Baseball Prospectus player id 
    bpid = df['bpid']
    
    # Separate cots_details into different lines; splits must be by ';' to identify all team signings within a season. 
    contracts = df[txt].split(';')
    
    # Use team dictionary to clean team names 
    teamContracts = [replace_team(c) for c in contracts]
    
    # Identify terms and signing organization 
    contractTerms = [re.findall(pattern1, t, re.IGNORECASE) for t in teamContracts]
    signingOrg = [re.findall(pattern2, t, re.IGNORECASE) for t in teamContracts]

    # Use join to convert words to sentences 
    contractTerms = [[' '.join(i) for i in j] for j in contractTerms]
    signingOrg = [[' '.join(i) for i in j] for j in signingOrg]
    
    # Convert to data frames 
    contractTermsDf = pd.DataFrame(contractTerms)
    contractTermsDf.rename(columns={0:'contract'}, inplace=True)
    signingOrgDf = pd.DataFrame(signingOrg)
    signingOrgDf.rename(columns={0:'org1'}, inplace=True)

    # Concatenate data frames 
    contractsDf = pd.concat([contractTermsDf, signingOrgDf], axis=1)
    
    # If the length of contractsDf is greater or equal to 3, then the player has played for multiple orgs and these need to be moved within the data frame  
    if len(contractsDf.columns) >= 3:
        # Keep main contract and org
        contractsDfMain = contractsDf.loc[:, ['contract', 'org1']]
        # Initiate empty DataFrame to store every other org
        contractsDfOther = pd.DataFrame(columns=['contract', 'org1'])
    
        # Iterate over the remaining extra columns to get all other clubs within the data set 
        for i in range(2, len(contractsDf.columns)):
            cdo = pd.DataFrame(contractsDf.iloc[:, i])
            cdo.insert(0, 'contract', '1 year. ')
            cdo.columns = ['contract', 'org1']
            cdo.dropna(inplace=True)
            contractsDfOther = pd.concat([contractsDfOther, cdo], axis=0, ignore_index=True, sort=False)
        
        # Concatenate Main data with Complementary data back into a new data set
        contractsDfNew = pd.concat([contractsDfMain, contractsDfOther], axis=0, ignore_index=True, sort=False)
        contractsDfNew.dropna(inplace=True)
    else: 
        contractsDf['contract'] = contractsDf.fillna('1 year. ')
        contractsDfNew = contractsDf.dropna()
    
    
    # Create a new column with all contract and org information and turn into a list to run in teams_date_id()
    contractsDfNew['joined'] = contractsDfNew.contract + contractsDfNew.org1
    contractsList = contractsDfNew.joined.tolist()
    
    # Run contractsList through team_date_id function to get date contract was signed, base and max duration in years, and signing org. Then add player bpid. 
    contractData = team_date_id(contractsList)
    contractData.insert(0, "bpid", bpid)
    contractData['bpid'] = contractData.bpid.ffill()
    
    # Check to see if player retired. If player retired, then the date is used to calculate the duration_years_contract variable. Every other row uses the difference between
    # signing dates. 
    retirementPattern = 'Retired\s\d{1,}\/\d{1,}\/\d{2,}'
    retirementList = [re.findall(retirementPattern, i) for i in teamContracts]
    retirement = str([s for sublist in retirementList for s in sublist])
    retirementDate = re.findall(r'\d{1,}\/\d{1,}\/\d{2,}', retirement)
    
    # Get retirement date and convert to datetime
    if len(retirementDate) == 1:
        retirementDateStr = retirementDate[0]
        lastDate = datetime.strptime(retirementDateStr, "%m/%d/%y").strftime("%Y-%m-%d")
        lastDate = datetime.strptime(lastDate, "%Y-%m-%d")
    else: 
        lastDate = datetime.now()
    
    yearsAfterLastContract = lastDate.year - contractData.loc[0, 'signed_date'].year
    
    # Calculate duration_years_actual
    for i in range(len(contractData)): 
        if i == 0 & yearsAfterLastContract <= int(contractData.loc[0, 'duration_years_max']): 
            contractData.loc[i, 'duration_years_actual'] = contractData.loc[i, 'duration_years_max']
        elif i == 0 & yearsAfterLastContract > int(contractData.loc[0, 'duration_years_max']):
            contractData.loc[i, 'duration_years_actual'] = lastDate.year - contractData.loc[i, 'signed_date'].year
        else:  
            contractData.loc[i, 'duration_years_actual'] = contractData.loc[i-1, 'signed_date'].year - contractData.loc[i, 'signed_date'].year

        # Actual years cannot be less than 0
        if int(contractData.loc[i, 'duration_years_actual']) <= 0: 
            contractData.loc[i, 'duration_years_actual'] = 1
    
    contractData['duration_years_actual'] = contractData.duration_years_actual.astype(int)
    
    # Rearrange columns 
    contractData = contractData[['bpid', 'signed_date', 'duration_years_base', 'duration_years_max', 'duration_years_actual', 'signing_org', 'first_season']]
    
    return contractData

Following blocks break down the different part of the three previous functions to test them separately and debug them if necessary 

In [20]:
# Patterns for contract signings and contract base duration
contract_terms = '(\d{1,}\s[a-z]+)(\/?\$[0-9]+.?[0-9]+M|.?\s?M|\/?\$.M)?\s(\([0-9-]+\).?\s?)([A-Za-z0-9\s]+option\.\s)?'
contract_org = '(Signed|Re-signed|Renewed)\s(by|extension with)?\s?([A-Z]{3})?\s?(as a free agent)?\s?(from)?\s?([A-Z]{3})?\s?(\d{1,}\/\d{1,}.\d{2,}|\d{1,}\/\d{2,})'

In [21]:
txt = 'cots_details'
pattern1 = contract_terms
pattern2 = contract_org

In [22]:
df[df['bpid']=='31351']

Unnamed: 0,bpid,ml_srv,agent,cots_lengthval,cots_details
5,31351,12.0,Mike Milchin,2 years/$25M (2015-16),2 years/$25M (2015-16). Signed by Chicago Whit...


In [23]:
# Initiate a test data frame to write code over
test = df.iloc[5]
test

bpid                                                          31351
ml_srv                                                       12.000
agent                                                  Mike Milchin
cots_lengthval                               2 years/$25M (2015-16)
cots_details      2 years/$25M (2015-16). Signed by Chicago Whit...
Name: 5, dtype: object

In [24]:
# Separate cots_details into different lines; splits must be by ';' and '.' to identify all team signings within a season. 
contracts = test[txt].split(';')
#subContracts = [i.split('.') for i in contracts]
contracts

['2 years/$25M (2015-16). Signed by Chicago White Sox as a free agent 11/25/14. 15:$12M, 16:$13M. Retired 3/15/16.',
 ' 2 years/$24M (2013-14), plus 2015 mutual option. Re-signed by Washington as a free agent 1/8/13. 13:$10M, 14:$12M, 15:$15M mutual option, $2M buyout. Washington declined 2015 option 10/30/14.',
 ' 2 years/$16M (2011-12), plus 2013 option. Signed by Washington as a free agent 1/4/11. 11:$7M, 12:$8M, 13:$10M mutual option, $1M buyout. LaRoche declined 2013 option 11/1/12. Washington made qualifying offer for 2013 (1 year/$13.3M) 11/2/12.',
 ' 1 year/$6M (2010), plus 2011 mutual option. Signed by Arizona as a free agent 1/15/10. 10:$4.5M, 11:$7.5M mutual option, $1.5M buyout. 2011 option increases to $9.5M if traded. Arizona declined 2011 option 11/2/10.',
 ' 1 year/$7.05M (2009). Re-signed 1/20/09 (avoided arbitration). Acquired by Boston in trade from Pittsburgh 7/22/09. (Boston to pay all $3M in remaining 2009 salary.) Acquired by Atlanta in trade from Boston 7/31/09.

In [25]:
# Use team dictionary to clean team names 
teamContracts = [replace_team(c) for c in contracts]
teamContracts

['2 years/$25M (2015-16). Signed by CHA as a free agent 11/25/14. 15:$12M, 16:$13M. Retired 3/15/16.',
 ' 2 years/$24M (2013-14), plus 2015 mutual option. Re-signed by WAS as a free agent 1/8/13. 13:$10M, 14:$12M, 15:$15M mutual option, $2M buyout. WAS declined 2015 option 10/30/14.',
 ' 2 years/$16M (2011-12), plus 2013 option. Signed by WAS as a free agent 1/4/11. 11:$7M, 12:$8M, 13:$10M mutual option, $1M buyout. LaRoche declined 2013 option 11/1/12. WAS made qualifying offer for 2013 (1 year/$13.3M) 11/2/12.',
 ' 1 year/$6M (2010), plus 2011 mutual option. Signed by ARI as a free agent 1/15/10. 10:$4.5M, 11:$7.5M mutual option, $1.5M buyout. 2011 option increases to $9.5M if traded. ARI declined 2011 option 11/2/10.',
 ' 1 year/$7.05M (2009). Re-signed 1/20/09 (avoided arbitration). Acquired by BOS in trade from PIT 7/22/09. (BOS to pay all $3M in remaining 2009 salary.) Acquired by ATL in trade from BOS 7/31/09. Performance bonuses: $50,000 for 555, 590 PAs.',
 ' 1 year/$5M (2008)

In [26]:
# Identify terms and signing organization 
contractTerms = [re.findall(pattern1, t, re.IGNORECASE) for t in teamContracts]
signingOrg = [re.findall(pattern2, t, re.IGNORECASE) for t in teamContracts]

# Use join to convert words to sentences 
contractTerms = [[' '.join(i) for i in j] for j in contractTerms]
signingOrg = [[' '.join(i) for i in j] for j in signingOrg]

In [27]:
print(contractTerms)
print(signingOrg)

[['2 years /$25M (2015-16).  '], ['2 years /$24M (2013-14),  plus 2015 mutual option. '], ['2 years /$16M (2011-12),  plus 2013 option. '], ['1 year /$6M (2010),  plus 2011 mutual option. '], ['1 year /$7.05M (2009).  '], ['1 year /$5M (2008).  '], ['1 year /$3.2M (2007).  '], ['1 year /$0.42M (2006).  '], ['1 year /$0.3375M (2005).  '], ['1 year /$0.3M (2004).  '], []]
[['Signed by CHA as a free agent   11/25/14'], ['Re-signed by WAS as a free agent   1/8/13'], ['Signed by WAS as a free agent   1/4/11'], ['Signed by ARI as a free agent   1/15/10'], ['Re-signed      1/20/09'], ['Re-signed by PIT    1/15/08'], ['Signed by PIT    2/07'], ['Re-signed by ATL    3/06'], ['Re-signed by ATL    2/05'], [], []]


In [28]:
# Convert to data frames 
contractTermsDf = pd.DataFrame(contractTerms)
contractTermsDf.rename(columns={0:'contract'}, inplace=True)
signingOrgDf = pd.DataFrame(signingOrg)
signingOrgDf.rename(columns={0:'org1'}, inplace=True)

# Concatenate data frames 
contractsDf = pd.concat([contractTermsDf, signingOrgDf], axis=1)

In [29]:
contractsDf

Unnamed: 0,contract,org1
0,2 years /$25M (2015-16).,Signed by CHA as a free agent 11/25/14
1,"2 years /$24M (2013-14), plus 2015 mutual opt...",Re-signed by WAS as a free agent 1/8/13
2,"2 years /$16M (2011-12), plus 2013 option.",Signed by WAS as a free agent 1/4/11
3,"1 year /$6M (2010), plus 2011 mutual option.",Signed by ARI as a free agent 1/15/10
4,1 year /$7.05M (2009).,Re-signed 1/20/09
5,1 year /$5M (2008).,Re-signed by PIT 1/15/08
6,1 year /$3.2M (2007).,Signed by PIT 2/07
7,1 year /$0.42M (2006).,Re-signed by ATL 3/06
8,1 year /$0.3375M (2005).,Re-signed by ATL 2/05
9,1 year /$0.3M (2004).,


In [30]:
if len(contractsDf.columns) >= 3:
    # Keep main contract and org
    contractsDfMain = contractsDf.loc[:, ['contract', 'org1']]
    # Initiate empty DataFrame to store every other org
    contractsDfOther = pd.DataFrame(columns=['contract', 'org1'])

    for i in range(2, len(contractsDf.columns)):
        cdo = pd.DataFrame(contractsDf.iloc[:, i])
        cdo.insert(0, 'contract', '1 year. ')
        cdo.columns = ['contract', 'org1']
        cdo.dropna(inplace=True)
        contractsDfOther = pd.concat([contractsDfOther, cdo], axis=0, ignore_index=True, sort=False)

    contractsDfNew = pd.concat([contractsDfMain, contractsDfOther], axis=0, ignore_index=True, sort=False)
    contractsDfNew.dropna(inplace=True)
else: 
    contractsDf['contract'] = contractsDf.fillna('1 year. ')
    contractsDfNew = contractsDf.dropna()

In [31]:
contractsDfNew

Unnamed: 0,contract,org1
0,2 years /$25M (2015-16).,Signed by CHA as a free agent 11/25/14
1,"2 years /$24M (2013-14), plus 2015 mutual opt...",Re-signed by WAS as a free agent 1/8/13
2,"2 years /$16M (2011-12), plus 2013 option.",Signed by WAS as a free agent 1/4/11
3,"1 year /$6M (2010), plus 2011 mutual option.",Signed by ARI as a free agent 1/15/10
4,1 year /$7.05M (2009).,Re-signed 1/20/09
5,1 year /$5M (2008).,Re-signed by PIT 1/15/08
6,1 year /$3.2M (2007).,Signed by PIT 2/07
7,1 year /$0.42M (2006).,Re-signed by ATL 3/06
8,1 year /$0.3375M (2005).,Re-signed by ATL 2/05


In [32]:
contractsDfNew['joined'] = contractsDfNew.contract + contractsDfNew.org1
contractsList = contractsDfNew.joined.tolist()
contractsList

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


['2 years /$25M (2015-16).  Signed by CHA as a free agent   11/25/14',
 '2 years /$24M (2013-14),  plus 2015 mutual option. Re-signed by WAS as a free agent   1/8/13',
 '2 years /$16M (2011-12),  plus 2013 option. Signed by WAS as a free agent   1/4/11',
 '1 year /$6M (2010),  plus 2011 mutual option. Signed by ARI as a free agent   1/15/10',
 '1 year /$7.05M (2009).  Re-signed      1/20/09',
 '1 year /$5M (2008).  Re-signed by PIT    1/15/08',
 '1 year /$3.2M (2007).  Signed by PIT    2/07',
 '1 year /$0.42M (2006).  Re-signed by ATL    3/06',
 '1 year /$0.3375M (2005).  Re-signed by ATL    2/05']

In [33]:
tf = team_date_id(contractsList)
tf

Unnamed: 0,index,signed_date,duration_years_base,duration_years_max,signing_org,first_season
0,0,2014-11-25,2,2,CHA,2014
1,1,2013-01-08,2,3,WAS,2013
2,2,2011-01-04,2,2,WAS,2011
3,3,2010-01-15,1,2,ARI,2010
4,4,2009-01-20,1,1,,2009
5,5,2008-01-15,1,1,PIT,2008
6,6,2007-02-01,1,1,PIT,2007
7,7,2006-03-01,1,1,ATL,2006
8,8,2005-02-01,1,1,ATL,2005


In [34]:
for i in range(len(tf)):
    if type(tf.loc[i, 'first_season']==None):
        tf.loc[i, 'first_season'] = tf.loc[i, 'signed_date'].year
        
tf

Unnamed: 0,index,signed_date,duration_years_base,duration_years_max,signing_org,first_season
0,0,2014-11-25,2,2,CHA,2014
1,1,2013-01-08,2,3,WAS,2013
2,2,2011-01-04,2,2,WAS,2011
3,3,2010-01-15,1,2,ARI,2010
4,4,2009-01-20,1,1,,2009
5,5,2008-01-15,1,1,PIT,2008
6,6,2007-02-01,1,1,PIT,2007
7,7,2006-03-01,1,1,ATL,2006
8,8,2005-02-01,1,1,ATL,2005


In [35]:
tf.drop_duplicates(subset='signed_date', keep='first')

Unnamed: 0,index,signed_date,duration_years_base,duration_years_max,signing_org,first_season
0,0,2014-11-25,2,2,CHA,2014
1,1,2013-01-08,2,3,WAS,2013
2,2,2011-01-04,2,2,WAS,2011
3,3,2010-01-15,1,2,ARI,2010
4,4,2009-01-20,1,1,,2009
5,5,2008-01-15,1,1,PIT,2008
6,6,2007-02-01,1,1,PIT,2007
7,7,2006-03-01,1,1,ATL,2006
8,8,2005-02-01,1,1,ATL,2005


In [36]:
retirement_pattern = 'Retired\s\d{1,}?\/?\d{1,}\/\d{2,}'
retirement_list = [re.findall(retirement_pattern, i) for i in teamContracts]
retirement = str([s for sublist in retirement_list for s in sublist])
retirement_date = re.findall(r'\d{1,}?\/?\d{1,}\/\d{2,}', retirement)
len(retirement_date)

#retirement_date_str = retirement_date[0]

#clean_retirement_date = datetime.strptime(retirement_date_str, "%m/%d/%y").strftime("%Y-%m-%d")
#clean_retirement_date

1

In [37]:
all_transactions = pd.DataFrame([re.findall(r'[A-Z]{3}', i) for i in teamContracts])
all_transactions

Unnamed: 0,0,1,2,3,4
0,CHA,,,,
1,WAS,WAS,,,
2,WAS,WAS,,,
3,ARI,ARI,,,
4,BOS,PIT,BOS,ATL,BOS
5,PIT,,,,
6,PIT,ATL,PIT,,
7,ATL,,,,
8,ATL,,,,
9,,,,,


In [38]:
if len(all_transactions.columns) >= 2:
    # Keep main contract and org
    all_transactionsMain = all_transactions.iloc[:, 0]
    # Initiate empty DataFrame to store every other org
    all_transactionsOther = pd.DataFrame()

    for i in range(1, len(all_transactions.columns)):
        ato = pd.DataFrame(all_transactions.iloc[:, i])
        ato.insert(0, 'x', 'x')
        ato.columns = ['x', 0]
        ato.dropna(inplace=True)
        all_transactionsOther = pd.concat([all_transactionsOther, ato], axis=0, ignore_index=True, sort=False)

    all_transactionsNew = pd.DataFrame(pd.concat([all_transactionsMain, all_transactionsOther]).iloc[:, 0])
    all_transactionsNew.dropna(inplace=True)
else: 
    all_transactionsNew = all_transactions.dropna()
    
all_transactionsNew.rename(columns={0:'transaction'}, inplace=True)

In [39]:
all_transactionsNew

Unnamed: 0,transaction
0,CHA
1,WAS
2,WAS
3,ARI
4,BOS
5,PIT
6,PIT
7,ATL
8,ATL
10,ATL


In [40]:
transactions_clean = all_transactionsNew.transaction.str.split(' ', expand=True)
transactions_clean.rename(columns={0: 'team_id', 1:'date'}, inplace=True)
transactions_clean.reset_index(inplace=True)
transactions_clean.sort_values('date', ascending=False, inplace=True)

KeyError: 'date'

In [41]:
for i in range(len(transactions_clean)): 
    if len(transactions_clean.loc[i, 'date']) > 5: 
        transactions_clean.loc[i, 'first_season'] = datetime.strptime(transactions_clean.loc[i, 'date'], "%m/%d/%y").year
    else: 
        transactions_clean.loc[i, 'first_season'] = datetime.strptime(transactions_clean.loc[i, 'date'], "%m/%y").year

KeyError: 'date'

In [283]:
transactions_clean

Unnamed: 0,index,team_id,date,first_season
5,1,ARI,7/31/10,2010.0
6,2,BAL,4/30/13,2013.0
4,0,LAA,4/28/13,2013.0
0,0,WAS,3/24/14,2014.0
1,1,WAS,3/18/13,2013.0
2,3,ARI,12/30/08,2008.0
3,4,ARI,1/18/08,2008.0


In [284]:
tf2 = tf.merge(transactions_clean, on='first_season', how='left')

In [285]:
for i in range(len(tf2)): 
    if tf2.loc[i, 'signing_org'] == None: 
        tf2.loc[i, 'signing_org'] = tf2.loc[i, 'team_id']

In [287]:
tf3 = tf2.iloc[:, 1:5]
tf3.drop_duplicates()

Unnamed: 0,signed_date,duration_years_base,duration_years_max,signing_org
0,2014-03-25,1,1,TEX
1,2013-12-20,1,1,WAS
4,2013-06-09,1,1,BAL
7,2013-03-18,1,1,LAA
10,2013-02-05,1,1,WAS
13,2012-01-20,1,2,HOU
14,2008-12-30,3,4,ARI
16,2008-01-18,1,1,ARI
18,2006-03-01,1,1,
