# <center>Тема 1. Exploratory data analysis with Pandas</center>

In [1]:
import os
import numpy as np
import pandas as pd

### Работа с Series

In [3]:
apples = pd.Series([10, 30, 20, 25, 4, 0], 
              index = ['Alise', 'Andrew', 'Bob', 'Matt', 'Charles', 'Ann']) 
apples                                                                 

Alise      10
Andrew     30
Bob        20
Matt       25
Charles     4
Ann         0
dtype: int64

In [4]:
apples[apples > 15]

Andrew    30
Bob       20
Matt      25
dtype: int64

In [7]:
(apples > 15)

Alise      False
Andrew      True
Bob         True
Matt        True
Charles    False
Ann        False
dtype: bool

In [4]:
apples.Andrew

30

In [5]:
apples['Carl'] = np.nan
apples

Alise      10.0
Andrew     30.0
Bob        20.0
Matt       25.0
Charles     4.0
Ann         0.0
Carl        NaN
dtype: float64

In [6]:
apples.fillna(apples.median(), inplace=True)
apples

Alise      10.0
Andrew     30.0
Bob        20.0
Matt       25.0
Charles     4.0
Ann         0.0
Carl       15.0
dtype: float64

In [7]:
apples = apples.fillna(apples.median())

In [8]:
apples.astype('Int64')

Alise      10
Andrew     30
Bob        20
Matt       25
Charles     4
Ann         0
Carl       15
dtype: Int64

In [9]:
apples[1]

30.0

In [10]:
apples[:3]

Alise     10.0
Andrew    30.0
Bob       20.0
dtype: float64

In [11]:
len(apples[apples > 15])

3

In [12]:
apples[apples > 15].shape

(3,)

### Работа с DataFrame

In [13]:
df1 = pd.DataFrame(np.random.randn(5, 3), 
                   index=['o1', 'o2', 'o3', 'o4', 'o5'], 
                   columns=['f1', 'f2', 'f3'])
df1

Unnamed: 0,f1,f2,f3
o1,-1.264638,1.117795,-0.954762
o2,-0.779466,1.614751,-1.996082
o3,0.742429,-2.338454,0.473055
o4,0.914521,0.714787,0.867325
o5,-1.746915,-1.350054,-1.717125


In [14]:
df2 = pd.DataFrame({'A': np.random.random(5), 
                    'B': ['a', 'b', 'c', 'd', 'e'], 
                    'C': np.arange(5) > 2})
df2

Unnamed: 0,A,B,C
0,0.818459,a,False
1,0.081877,b,False
2,0.281193,c,False
3,0.667704,d,True
4,0.581049,e,True


In [15]:
df2.loc[3, 'B']

'd'

In [16]:
df2.loc[:, :]

Unnamed: 0,A,B,C
0,0.818459,a,False
1,0.081877,b,False
2,0.281193,c,False
3,0.667704,d,True
4,0.581049,e,True


In [17]:
df2.at[2, 'B'] = 'F'
df2

Unnamed: 0,A,B,C
0,0.818459,a,False
1,0.081877,b,False
2,0.281193,F,False
3,0.667704,d,True
4,0.581049,e,True


In [18]:
df2.loc[[3, 4], 'B'] = 'Q'
df2

Unnamed: 0,A,B,C
0,0.818459,a,False
1,0.081877,b,False
2,0.281193,F,False
3,0.667704,Q,True
4,0.581049,Q,True


In [19]:
df2.loc[5] = 3.1415
df2

Unnamed: 0,A,B,C
0,0.818459,a,0.0
1,0.081877,b,0.0
2,0.281193,F,0.0
3,0.667704,Q,1.0
4,0.581049,Q,1.0
5,3.1415,3.1415,3.1415


In [20]:
df2.iloc[2, 0] = 14.31
df2

Unnamed: 0,A,B,C
0,0.818459,a,0.0
1,0.081877,b,0.0
2,14.31,F,0.0
3,0.667704,Q,1.0
4,0.581049,Q,1.0
5,3.1415,3.1415,3.1415


In [21]:
df2 = df2.rename({'B': 'BBBBB', 'C': 'CCCCC'}, axis=1)
df2

Unnamed: 0,A,BBBBB,CCCCC
0,0.818459,a,0.0
1,0.081877,b,0.0
2,14.31,F,0.0
3,0.667704,Q,1.0
4,0.581049,Q,1.0
5,3.1415,3.1415,3.1415


In [22]:
df2.columns = ['AAAAA', 'B', 'C']
df2

Unnamed: 0,AAAAA,B,C
0,0.818459,a,0.0
1,0.081877,b,0.0
2,14.31,F,0.0
3,0.667704,Q,1.0
4,0.581049,Q,1.0
5,3.1415,3.1415,3.1415


In [23]:
df1.columns = ['A', 'B', 'C']
df3 = df1.append(df2, sort=False)
df3

Unnamed: 0,A,B,C,AAAAA
o1,-1.264638,1.11779,-0.954762,
o2,-0.779466,1.61475,-1.996082,
o3,0.742429,-2.33845,0.473055,
o4,0.914521,0.714787,0.867325,
o5,-1.746915,-1.35005,-1.717125,
0,,a,0.0,0.818459
1,,b,0.0,0.081877
2,,F,0.0,14.31
3,,Q,1.0,0.667704
4,,Q,1.0,0.581049


In [24]:
df1.at['o2', 'A'] = np.nan
df1.at['o4', 'C'] = np.nan
df1

Unnamed: 0,A,B,C
o1,-1.264638,1.117795,-0.954762
o2,,1.614751,-1.996082
o3,0.742429,-2.338454,0.473055
o4,0.914521,0.714787,
o5,-1.746915,-1.350054,-1.717125


In [25]:
pd.isnull(df1)

Unnamed: 0,A,B,C
o1,False,False,False
o2,True,False,False
o3,False,False,False
o4,False,False,True
o5,False,False,False


In [26]:
df1.dropna(how='any')

Unnamed: 0,A,B,C
o1,-1.264638,1.117795,-0.954762
o3,0.742429,-2.338454,0.473055
o5,-1.746915,-1.350054,-1.717125


In [27]:
df1.fillna(0)

Unnamed: 0,A,B,C
o1,-1.264638,1.117795,-0.954762
o2,0.0,1.614751,-1.996082
o3,0.742429,-2.338454,0.473055
o4,0.914521,0.714787,0.0
o5,-1.746915,-1.350054,-1.717125


## Примеры анализа

In [28]:
filename = 'nba.csv'

if not os.path.exists(filename):
    tables = pd.read_html("http://www.basketball-reference.com/leagues/NBA_2016_games.html")
    games = tables[0]
    games.to_csv(filename)
else:
    games = pd.read_csv(filename)
games.head()

Unnamed: 0.1,Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,0,"Tue, Oct 27, 2015",8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,1,"Tue, Oct 27, 2015",8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,2,"Tue, Oct 27, 2015",10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,3,"Wed, Oct 28, 2015",7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,
4,4,"Wed, Oct 28, 2015",7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,


In [29]:
column_names = {'Date': 'date', 'Start (ET)': 'start',
                'Unamed: 2': 'box', 'Visitor/Neutral': 'away_team', 
                'PTS': 'away_points', 'Home/Neutral': 'home_team',
                'PTS.1': 'home_points', 'Unamed: 7': 'n_ot'}

games = games.rename(columns=column_names)
games.head()

Unnamed: 0.1,Unnamed: 0,date,start,away_team,away_points,home_team,home_points,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,0,"Tue, Oct 27, 2015",8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,1,"Tue, Oct 27, 2015",8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,2,"Tue, Oct 27, 2015",10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,3,"Wed, Oct 28, 2015",7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,
4,4,"Wed, Oct 28, 2015",7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,


In [30]:
games = games.dropna(thresh=4)[['date', 'away_team', 'away_points', 'home_team', 'home_points', 'Attend.']]
games.head()

Unnamed: 0,date,away_team,away_points,home_team,home_points,Attend.
0,"Tue, Oct 27, 2015",Detroit Pistons,106,Atlanta Hawks,94,19187
1,"Tue, Oct 27, 2015",Cleveland Cavaliers,95,Chicago Bulls,97,21957
2,"Tue, Oct 27, 2015",New Orleans Pelicans,95,Golden State Warriors,111,19596
3,"Wed, Oct 28, 2015",Washington Wizards,88,Orlando Magic,87,18846
4,"Wed, Oct 28, 2015",Indiana Pacers,99,Toronto Raptors,106,19800


In [31]:
games.date = pd.to_datetime(games.date, format='%a, %b %d, %Y')
games.head()

Unnamed: 0,date,away_team,away_points,home_team,home_points,Attend.
0,2015-10-27,Detroit Pistons,106,Atlanta Hawks,94,19187
1,2015-10-27,Cleveland Cavaliers,95,Chicago Bulls,97,21957
2,2015-10-27,New Orleans Pelicans,95,Golden State Warriors,111,19596
3,2015-10-28,Washington Wizards,88,Orlando Magic,87,18846
4,2015-10-28,Indiana Pacers,99,Toronto Raptors,106,19800


In [32]:
games.tail()

Unnamed: 0,date,away_team,away_points,home_team,home_points,Attend.
33,2015-10-31,New York Knicks,117,Washington Wizards,110,20356
34,2015-10-31,Golden State Warriors,134,New Orleans Pelicans,120,18406
35,2015-10-31,Brooklyn Nets,91,Memphis Grizzlies,101,16013
36,2015-10-31,Phoenix Suns,101,Portland Trail Blazers,90,17906
37,2015-10-31,Sacramento Kings,109,Los Angeles Clippers,114,19060


In [33]:
print(games.shape)
print(games.columns.values)

(38, 6)
['date' 'away_team' 'away_points' 'home_team' 'home_points' 'Attend.']


In [34]:
games.dtypes

date           datetime64[ns]
away_team              object
away_points             int64
home_team              object
home_points             int64
Attend.                 int64
dtype: object

In [35]:
games.describe()

Unnamed: 0,away_points,home_points,Attend.
count,38.0,38.0,38.0
mean,103.657895,101.263158,18274.763158
std,13.187374,13.951042,1547.135163
min,75.0,71.0,13858.0
25%,94.0,94.25,17678.0
50%,103.0,102.0,18323.0
75%,112.0,110.75,19155.25
max,139.0,136.0,21957.0


In [36]:
games.describe(include=['object', 'datetime64[ns]'])

Unnamed: 0,date,away_team,home_team
count,38,38,38
unique,5,26,26
top,2015-10-28 00:00:00,Utah Jazz,Memphis Grizzlies
freq,14,3,2
first,2015-10-27 00:00:00,,
last,2015-10-31 00:00:00,,


In [37]:
games.describe(percentiles=[0.1, 0.9, 0.9995])

Unnamed: 0,away_points,home_points,Attend.
count,38.0,38.0,38.0
mean,103.657895,101.263158,18274.763158
std,13.187374,13.951042,1547.135163
min,75.0,71.0,13858.0
10%,90.1,82.9,16639.1
50%,103.0,102.0,18323.0
90%,117.3,113.3,19803.6
99.95%,138.9075,135.926,21931.1925
max,139.0,136.0,21957.0


In [38]:
games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38 entries, 0 to 37
Data columns (total 6 columns):
date           38 non-null datetime64[ns]
away_team      38 non-null object
away_points    38 non-null int64
home_team      38 non-null object
home_points    38 non-null int64
Attend.        38 non-null int64
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 2.1+ KB


In [39]:
games.sort_values(by='away_points', ascending=False).head()

Unnamed: 0,date,away_team,away_points,home_team,home_points,Attend.
21,2015-10-30,Oklahoma City Thunder,139,Orlando Magic,136,18846
34,2015-10-31,Golden State Warriors,134,New Orleans Pelicans,120,18406
11,2015-10-28,New York Knicks,122,Milwaukee Bucks,97,18717
25,2015-10-30,Washington Wizards,118,Milwaukee Bucks,113,13858
33,2015-10-31,New York Knicks,117,Washington Wizards,110,20356


In [40]:
games.sort_values(by=['away_points', 'home_points'], ascending=[True, False]).head()

Unnamed: 0,date,away_team,away_points,home_team,home_points,Attend.
27,2015-10-30,Brooklyn Nets,75,San Antonio Spurs,102,18418
8,2015-10-28,Utah Jazz,87,Detroit Pistons,92,18434
19,2015-10-29,Dallas Mavericks,88,Los Angeles Clippers,104,19218
3,2015-10-28,Washington Wizards,88,Orlando Magic,87,18846
35,2015-10-31,Brooklyn Nets,91,Memphis Grizzlies,101,16013


**Какой средний балл у команд, которые играли дома?**

In [41]:
games['home_points'].mean()

101.26315789473684

**Какой максимальный балл у команд, которые играли на выезде 28 октября 2015?**

In [42]:
games[games['date'] == '2015-10-28'].head()

Unnamed: 0,date,away_team,away_points,home_team,home_points,Attend.
3,2015-10-28,Washington Wizards,88,Orlando Magic,87,18846
4,2015-10-28,Indiana Pacers,99,Toronto Raptors,106,19800
5,2015-10-28,Charlotte Hornets,94,Miami Heat,104,19724
6,2015-10-28,Chicago Bulls,115,Brooklyn Nets,100,17732
7,2015-10-28,Philadelphia 76ers,95,Boston Celtics,112,18624


In [43]:
games.loc[games['date'] == '2015-10-28', 'away_points'].max()

122

In [44]:
games.loc[games['date'] == '2015-10-28', 'away_points'].max() - \
games.loc[games['date'] == '2015-10-28', 'home_points'].max()

10

**Какой минимальный балл у команд, которые играли дома 30 октября 2015 года, и на матчах которых было более 18500 зрителей?**

In [45]:
games.loc[(games['date'] == '2015-10-30') & (games['Attend.'] > 18500), 'home_points'].min()

102

In [46]:
games[['away_points', 'home_points', 'Attend.']].apply(np.mean)

away_points      103.657895
home_points      101.263158
Attend.        18274.763158
dtype: float64

In [47]:
col = games['away_team']
col.str.split().head()

0          [Detroit, Pistons]
1      [Cleveland, Cavaliers]
2    [New, Orleans, Pelicans]
3       [Washington, Wizards]
4           [Indiana, Pacers]
Name: away_team, dtype: object

In [48]:
'FDSJFHDSJK DSFJKDSH KJGHJFDG'.split()[2]

'KJGHJFDG'

In [49]:
games.groupby('date', as_index=False).agg({'home_points': 'mean', 'away_points': 'mean',
                                           'home_team': 'nunique', 'away_team': 'nunique'})

Unnamed: 0,date,home_points,away_points,home_team,away_team
0,2015-10-27,100.666667,98.666667,3,3
1,2015-10-28,99.5,103.214286,14,14
2,2015-10-29,102.666667,104.0,3,3
3,2015-10-30,102.833333,103.083333,12,12
4,2015-10-31,101.833333,108.166667,6,6


In [50]:
g = games.groupby('date')
for dt, sub_df in g:
    print(f"{dt}: {sub_df['home_points'].apply(lambda x: x % 10).mean()}")

2015-10-27 00:00:00: 4.0
2015-10-28 00:00:00: 3.7857142857142856
2015-10-29 00:00:00: 2.6666666666666665
2015-10-30 00:00:00: 3.6666666666666665
2015-10-31 00:00:00: 1.8333333333333333


In [51]:
d1 = g.get_group('2015-10-29')
d1

Unnamed: 0,date,away_team,away_points,home_team,home_points,Attend.
17,2015-10-29,Memphis Grizzlies,112,Indiana Pacers,103,18165
18,2015-10-29,Atlanta Hawks,112,New York Knicks,101,19812
19,2015-10-29,Dallas Mavericks,88,Los Angeles Clippers,104,19218


In [52]:
games['date'].value_counts(sort=False)

2015-10-27     3
2015-10-28    14
2015-10-29     3
2015-10-30    12
2015-10-31     6
Name: date, dtype: int64

In [53]:
games[['home_points', 'away_points', 'Attend.']].corr()

Unnamed: 0,home_points,away_points,Attend.
home_points,1.0,0.416534,0.191471
away_points,0.416534,1.0,-0.065669
Attend.,0.191471,-0.065669,1.0


In [54]:
games[['home_points', 'away_points', 'Attend.']].cov()

Unnamed: 0,home_points,away_points,Attend.
home_points,194.631579,76.633001,4132.74
away_points,76.633001,173.906828,-1339.813
Attend.,4132.739687,-1339.812945,2393627.0


In [55]:
games['away_remainder'] = games['away_points'] % 5
games['home_remainder'] = games['home_points'] % 5
pd.crosstab(games['date'], games['away_remainder'])

away_remainder,0,1,2,3,4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-10-27,2,1,0,0,0
2015-10-28,3,4,3,1,3
2015-10-29,0,0,2,1,0
2015-10-30,2,0,3,2,5
2015-10-31,0,2,2,0,2


In [56]:
games.pivot_table(values='Attend.', index=['away_remainder', 'home_remainder'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Attend.
away_remainder,home_remainder,Unnamed: 2_level_1
0,0,17986.0
0,1,19596.0
0,2,19666.333333
0,3,17660.0
1,0,17980.5
1,1,17066.0
1,2,18203.0
1,4,18322.5
2,0,19205.5
2,1,17740.333333
