In [6]:
import numpy as np
import pandas as pd

## Intro to pandas DataFrame iteration

In [2]:
baseball_df = pd.read_csv('baseball_stats.csv')
baseball_df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424


In [7]:
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2) 
    
win_perc = calc_win_perc(50, 100)
print(win_perc)

0.5


In [8]:
win_perc_list = []

for i in range(len(baseball_df)):
    row = baseball_df.iloc[i] 
    wins = row['W']   
    games_played = row['G'] 
    win_perc = calc_win_perc(wins, games_played)  
    win_perc_list.append(win_perc)
baseball_df['WP'] = win_perc_list

In [9]:
baseball_df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,0.58
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,0.57
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,0.43
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,0.38


### Iterating with .iterrows()
In the video, we discussed that .iterrows() returns each DataFrame row as a tuple of (index, pandas Series) pairs. But, what does this mean? Let's explore with a few coding exercises.

A pandas DataFrame has been loaded into your session called pit_df. This DataFrame contains the stats for the Major League Baseball team named the Pittsburgh Pirates (abbreviated as 'PIT') from the year 2008 to the year 2012. It has been printed into your console for convenience.

In [13]:
pit_df = baseball_df[baseball_df['Team'] == 'PIT']

In [14]:
# Iterate over pit_df and print each row
for row, i in pit_df.iterrows():
    print(row)

21
51
81
111
141
171
201
231
262
292
322
352
382
412
442
471
499
527
553
579
605
631
657
683
709
735
761
787
813
839
865
891
917
943
967
991
1015
1039
1063
1086
1108
1128
1148
1168
1188
1208
1228


In [16]:
# Iterate over pit_df and print each index variable, row, and row type
for i,row in pit_df.iterrows():
    print(i)
    print(row)
    print(type(row))

21
Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
WP               0.49
Name: 21, dtype: object
<class 'pandas.core.series.Series'>
51
Team              PIT
League             NL
Year             2011
RS                610
RA                712
W                  72
OBP             0.309
SLG             0.368
BA              0.244
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.338
OSLG            0.409
WP               0.44
Name: 51, dtype: object
<class 'pandas.core.series.Series'>
81
Team              PIT
League             NL
Year             2010
RS                587
RA                866
W                  57
OBP             0.304
SLG          

In [17]:
# Use one variable instead of two to store the result of .iterrows()
for row_tuple in pit_df.iterrows():
    print(row_tuple)

(21, Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
WP               0.49
Name: 21, dtype: object)
(51, Team              PIT
League             NL
Year             2011
RS                610
RA                712
W                  72
OBP             0.309
SLG             0.368
BA              0.244
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.338
OSLG            0.409
WP               0.44
Name: 51, dtype: object)
(81, Team              PIT
League             NL
Year             2010
RS                587
RA                866
W                  57
OBP             0.304
SLG             0.373
BA              0.242
Playoffs            0
RankSeason 

In [18]:
# Print the row and type of each row
for row_tuple in pit_df.iterrows():
    print(row_tuple)
    print(type(row_tuple))

(21, Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
WP               0.49
Name: 21, dtype: object)
<class 'tuple'>
(51, Team              PIT
League             NL
Year             2011
RS                610
RA                712
W                  72
OBP             0.309
SLG             0.368
BA              0.244
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.338
OSLG            0.409
WP               0.44
Name: 51, dtype: object)
<class 'tuple'>
(81, Team              PIT
League             NL
Year             2010
RS                587
RA                866
W                  57
OBP             0.304
SLG             0.373
BA              0.242
P

### Run differentials with .iterrows()
You've been hired by the San Francisco Giants as an analyst—congrats! The team's owner wants you to calculate a metric called the run differential for each season from the year 2008 to 2012. This metric is calculated by subtracting the total number of runs a team allowed in a season from the team's total number of runs scored in a season. 'RS' means runs scored and 'RA' means runs allowed.

In [20]:
giants_df = baseball_df[baseball_df['Team'] == 'SFG']

In [22]:
def calc_run_diff(runs_scored, runs_allowed):

    run_diff = runs_scored - runs_allowed

    return run_diff

In [24]:
# Create an empty list to store run differentials
run_diffs = []

# Write a for loop and collect runs allowed and runs scored for each row
for i,row in giants_df.iterrows():
    runs_scored = row['RS']
    runs_allowed = row['RA']
    
    # Use the provided function to calculate run_diff for each row
    run_diff = calc_run_diff(runs_scored, runs_allowed)
    
    # Append each run differential to the output list
    run_diffs.append(run_diff)

giants_df.loc[:,'RD'] = run_diffs
print(giants_df)

     Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  \
24    SFG     NL  2012  718  649   94  0.327  0.397  0.269         1   
54    SFG     NL  2011  570  578   86  0.303  0.368  0.242         0   
84    SFG     NL  2010  697  583   92  0.321  0.408  0.257         1   
114   SFG     NL  2009  657  611   88  0.309  0.389  0.257         0   
144   SFG     NL  2008  640  759   72  0.321  0.382  0.262         0   
174   SFG     NL  2007  683  720   71  0.322  0.387  0.254         0   
204   SFG     NL  2006  746  790   76  0.324  0.422  0.259         0   
234   SFG     NL  2005  649  745   75  0.319  0.396  0.261         0   
265   SFG     NL  2004  850  770   91  0.357  0.438  0.270         0   
295   SFG     NL  2003  755  638  100  0.338  0.425  0.264         1   
325   SFG     NL  2002  783  616   95  0.344  0.442  0.267         1   
355   SFG     NL  2001  799  748   90  0.342  0.460  0.266         0   
385   SFG     NL  2000  925  747   97  0.362  0.472  0.278      

## Another iterator method: .itertuples()

In [28]:
team_wins_df = baseball_df[['Team','Year','W']]
rangers_df = baseball_df[baseball_df['Team'] == 'TEX']

### Iterating with .itertuples()
Remember, .itertuples() returns each DataFrame row as a special data type called a namedtuple. You can look up an attribute within a namedtuple with a special syntax. Let's practice working with namedtuples.

A pandas DataFrame has been loaded into your session called rangers_df. This DataFrame contains the stats ('Team', 'League', 'Year', 'RS', 'RA', 'W', 'G', and 'Playoffs') for the Major League baseball team named the Texas Rangers (abbreviated as 'TEX').

In [29]:
# Loop over the DataFrame and print each row
for row in rangers_df.itertuples():
  print(row)

Pandas(Index=27, Team='TEX', League='AL', Year=2012, RS=808, RA=707, W=93, OBP=0.334, SLG=0.446, BA=0.273, Playoffs=1, RankSeason=5.0, RankPlayoffs=5.0, G=162, OOBP=0.309, OSLG=0.408, WP=0.57)
Pandas(Index=57, Team='TEX', League='AL', Year=2011, RS=855, RA=677, W=96, OBP=0.34, SLG=0.46, BA=0.283, Playoffs=1, RankSeason=3.0, RankPlayoffs=2.0, G=162, OOBP=0.307, OSLG=0.392, WP=0.59)
Pandas(Index=87, Team='TEX', League='AL', Year=2010, RS=787, RA=687, W=90, OBP=0.338, SLG=0.419, BA=0.276, Playoffs=1, RankSeason=7.0, RankPlayoffs=2.0, G=162, OOBP=0.319, OSLG=0.39, WP=0.56)
Pandas(Index=117, Team='TEX', League='AL', Year=2009, RS=784, RA=740, W=87, OBP=0.32, SLG=0.445, BA=0.26, Playoffs=0, RankSeason=nan, RankPlayoffs=nan, G=162, OOBP=0.331, OSLG=0.416, WP=0.54)
Pandas(Index=147, Team='TEX', League='AL', Year=2008, RS=901, RA=967, W=79, OBP=0.354, SLG=0.462, BA=0.283, Playoffs=0, RankSeason=nan, RankPlayoffs=nan, G=162, OOBP=0.362, OSLG=0.455, WP=0.49)
Pandas(Index=177, Team='TEX', League='

In [30]:
# Loop over the DataFrame and print each row's Index, Year and Wins (W)
for row in rangers_df.itertuples():
  i = row.Index
  year = row.Year
  wins = row.W
  print(i, year, wins)

27 2012 93
57 2011 96
87 2010 90
117 2009 87
147 2008 79
177 2007 75
207 2006 80
237 2005 79
268 2004 89
298 2003 71
328 2002 72
358 2001 73
388 2000 71
418 1999 95
448 1998 88
476 1997 77
504 1996 90
532 1993 86
558 1992 77
584 1991 85
610 1990 83
636 1989 83
662 1988 70
688 1987 75
714 1986 87
740 1985 62
766 1984 69
792 1983 77
818 1982 64
844 1980 76
870 1979 83
896 1978 87
922 1977 94
947 1976 76
971 1975 79
995 1974 83
1019 1973 57


In [31]:
# Loop over the DataFrame and print each row's Index, Year and Wins (W)
for row in rangers_df.itertuples():
  i = row.Index
  year = row.Year
  wins = row.W
  
  # Check if rangers made Playoffs (1 means yes; 0 means no)
  if row.Playoffs == 1:
    print(i, year, wins)

27 2012 93
57 2011 96
87 2010 90
418 1999 95
448 1998 88
504 1996 90


### Run differentials with .itertuples()
The New York Yankees have made a trade with the San Francisco Giants for your analyst contract— you're a hot commodity! Your new boss has seen your work with the Giants and now wants you to do something similar with the Yankees data. He'd like you to calculate run differentials for the Yankees from the year 1962 to the year 2012 and find which season they had the best run differential.

You've remembered the function you used when working with the Giants and quickly write it down:



In [32]:
def calc_run_diff(runs_scored, runs_allowed):

    run_diff = runs_scored - runs_allowed

    return run_diff

In [35]:
yankees_df = baseball_df[baseball_df['Team'] == 'NYY']

In [40]:
run_diffs = []

# Loop over the DataFrame and calculate each row's run differential
for row in yankees_df.itertuples():
    
    runs_scored = row.RS
    runs_allowed = row.RA

    run_diff = calc_run_diff(runs_scored, runs_allowed)
    
    run_diffs.append(run_diff)

# Append new column
yankees_df.loc[:,'RD'] = run_diffs
yankees_df

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP,RD
18,NYY,AL,2012,804,668,95,0.337,0.453,0.265,1,3.0,3.0,162,0.311,0.419,0.59,136
48,NYY,AL,2011,867,657,97,0.343,0.444,0.263,1,2.0,4.0,162,0.322,0.399,0.6,210
78,NYY,AL,2010,859,693,95,0.35,0.436,0.267,1,3.0,3.0,162,0.322,0.399,0.59,166
108,NYY,AL,2009,915,753,103,0.362,0.478,0.283,1,1.0,1.0,162,0.327,0.408,0.64,162
138,NYY,AL,2008,789,727,89,0.342,0.427,0.271,0,,,162,0.329,0.405,0.55,62
168,NYY,AL,2007,968,777,94,0.366,0.463,0.29,1,2.0,4.0,162,0.34,0.417,0.58,191
198,NYY,AL,2006,930,767,97,0.363,0.461,0.285,1,1.0,4.0,162,0.326,0.413,0.6,163
228,NYY,AL,2005,886,789,95,0.355,0.45,0.276,1,3.0,4.0,162,0.332,0.422,0.59,97
259,NYY,AL,2004,897,808,101,0.353,0.458,0.268,1,2.0,3.0,162,0.328,0.432,0.62,89
289,NYY,AL,2003,877,716,101,0.356,0.453,0.271,1,1.0,2.0,163,0.314,0.407,0.62,161


## pandas alternative to looping

### Analyzing baseball stats with .apply()
The Tampa Bay Rays want you to analyze their data.

They'd like the following metrics:

- The sum of each column in the data
- The total amount of runs scored in a year ('RS' + 'RA' for each year)
- The 'Playoffs' column in text format rather than using 1's and 0's
- The below function can be used to convert the 'Playoffs' column to text:

In [41]:
def text_playoffs(num_playoffs): 
    if num_playoffs == 1:
        return 'Yes'
    else:
        return 'No' 

In [45]:
rays_df = baseball_df[baseball_df['Team'] == 'TBR']

In [None]:
# Gather sum of all columns
stat_totals = rays_df.apply(sum, axis=1)
print(stat_totals)

In [47]:
# Gather total runs scored in all games per year
total_runs_scored = rays_df[['RS', 'RA']].apply(sum, axis=1)
print(total_runs_scored)

26     1274
56     1321
86     1451
116    1557
146    1445
dtype: int64


In [49]:
# Convert numeric playoffs to text by applying text_playoffs()
textual_playoffs = rays_df.apply(lambda row: text_playoffs(row['Playoffs']), axis=1)
print(textual_playoffs)

26      No
56     Yes
86     Yes
116     No
146    Yes
dtype: object


### Settle a debate with .apply()
Word has gotten to the Arizona Diamondbacks about your awesome analytics skills. They'd like for you to help settle a debate amongst the managers. One manager claims that the team has made the playoffs every year they have had a win percentage of 0.50 or greater. Another manager says this is not true.

Let's use the below function and the .apply() method to see which manager is correct.

In [50]:
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

In [52]:
dbacks_df = baseball_df[baseball_df['Team'] ==  'ARI']

In [55]:
# Display the first five rows of the DataFrame
display(dbacks_df.head())

# Create a win percentage Series 
win_percs = dbacks_df.apply(lambda row: calc_win_perc(row['W'], row['G']), axis=1)
print(win_percs, '\n')

# Append a new column to dbacks_df
dbacks_df.loc[:,'WP'] = win_percs
display(dbacks_df, '\n')

# Display dbacks_df where WP is greater than 0.50
display(dbacks_df[dbacks_df['WP'] >= 0.50])

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5
30,ARI,NL,2011,731,662,94,0.322,0.413,0.25,1,5.0,4.0,162,0.316,0.409,0.58
60,ARI,NL,2010,713,836,65,0.325,0.416,0.25,0,,,162,0.34,0.448,0.4
90,ARI,NL,2009,720,782,70,0.324,0.418,0.253,0,,,162,0.33,0.419,0.43
120,ARI,NL,2008,720,706,82,0.327,0.415,0.251,0,,,162,0.318,0.398,0.51


0      0.50
30     0.58
60     0.40
90     0.43
120    0.51
150    0.56
180    0.47
210    0.48
241    0.31
271    0.52
301    0.60
331    0.57
361    0.52
391    0.62
421    0.40
dtype: float64 



Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5
30,ARI,NL,2011,731,662,94,0.322,0.413,0.25,1,5.0,4.0,162,0.316,0.409,0.58
60,ARI,NL,2010,713,836,65,0.325,0.416,0.25,0,,,162,0.34,0.448,0.4
90,ARI,NL,2009,720,782,70,0.324,0.418,0.253,0,,,162,0.33,0.419,0.43
120,ARI,NL,2008,720,706,82,0.327,0.415,0.251,0,,,162,0.318,0.398,0.51
150,ARI,NL,2007,712,732,90,0.321,0.413,0.25,1,3.0,3.0,162,0.334,0.42,0.56
180,ARI,NL,2006,773,788,76,0.331,0.424,0.267,0,,,162,0.335,0.431,0.47
210,ARI,NL,2005,696,856,77,0.332,0.421,0.256,0,,,162,0.345,0.455,0.48
241,ARI,NL,2004,615,899,51,0.31,0.393,0.253,0,,,162,0.35,0.439,0.31
271,ARI,NL,2003,717,685,84,0.33,0.417,0.263,0,,,162,0.322,0.388,0.52


'\n'

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5
30,ARI,NL,2011,731,662,94,0.322,0.413,0.25,1,5.0,4.0,162,0.316,0.409,0.58
120,ARI,NL,2008,720,706,82,0.327,0.415,0.251,0,,,162,0.318,0.398,0.51
150,ARI,NL,2007,712,732,90,0.321,0.413,0.25,1,3.0,3.0,162,0.334,0.42,0.56
271,ARI,NL,2003,717,685,84,0.33,0.417,0.263,0,,,162,0.322,0.388,0.52
301,ARI,NL,2002,819,674,98,0.346,0.423,0.267,1,4.0,4.0,162,0.305,0.397,0.6
331,ARI,NL,2001,818,677,92,0.341,0.442,0.267,1,5.0,1.0,162,0.311,0.404,0.57
361,ARI,NL,2000,792,754,85,0.333,0.429,0.265,0,,,162,0.326,0.424,0.52
391,ARI,NL,1999,908,676,100,0.347,0.459,0.277,1,2.0,4.0,162,0.32,0.402,0.62


## Optimal pandas iterating

### Replacing .iloc with underlying arrays
Now that you have a better grasp on a DataFrame's internals let's update one of your previous analyses to leverage a DataFrame's underlying arrays. You'll revisit the win percentage calculations you performed row by row with the .iloc method:

In [56]:
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

win_percs_list = []

for i in range(len(baseball_df)):
    row = baseball_df.iloc[i]

    wins = row['W']
    games_played = row['G']

    win_perc = calc_win_perc(wins, games_played)

    win_percs_list.append(win_perc)

baseball_df['WP'] = win_percs_list

In [57]:
# Use the W array and G array to calculate win percentages
win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values) 

# Append a new column to baseball_df that stores all win percentages
baseball_df['WP'] = win_percs_np

display(baseball_df.head())

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,0.58
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,0.57
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,0.43
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,0.38


### Bringing it all together: Predict win percentage
A pandas DataFrame (baseball_df) has been loaded into your session. For convenience, a dictionary describing each column within baseball_df has been printed into your console. You can reference these descriptions throughout the exercise.

You'd like to attempt to predict a team's win percentage for a given season by using the team's total runs scored in a season ('RS') and total runs allowed in a season ('RA') with the following function:

In [58]:
def predict_win_perc(RS, RA):
    prediction = RS ** 2 / (RS ** 2 + RA ** 2)
    return np.round(prediction, 2)

In [62]:
win_perc_preds_loop = []

# Use a loop and .itertuples() to collect each row's predicted win percentage
for row in baseball_df.itertuples():
    runs_scored = row.RS
    runs_allowed = row.RA
    win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
    win_perc_preds_loop.append(win_perc_pred)

# Apply predict_win_perc to each row of the DataFrame
win_perc_preds_apply = baseball_df.apply(lambda row: predict_win_perc(row['RS'], row['RA']), axis=1)

# Calculate the win percentage predictions using NumPy arrays
win_perc_preds_np = predict_win_perc(baseball_df['RS'].values, baseball_df['RA'].values)
baseball_df.loc[:,'WP_Preds'] = win_perc_preds_np
baseball_df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP,WP_preds,WP_Preds
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5,0.53,0.53
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,0.58,0.58,0.58
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,0.57,0.5,0.5
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,0.43,0.45,0.45
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,0.38,0.39,0.39
