# Stata Functions in Python

In [25]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas import Series, DataFrame
# `pandas.datetime` was only a re-export of `datetime.datetime` and no longer
# exists in pandas 2.0+, so take the class from the standard library instead.
from datetime import datetime, timedelta
# `Panel` no longer exists in pandas 1.0+; importing it unconditionally makes
# this whole cell fail there. Keep the name bound so the namespace is unchanged
# on old pandas and the notebook still starts on current pandas.
try:
    from pandas import Panel
except ImportError:
    Panel = None
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
import io
import os
import random
import statsmodels.stats.api as sms
from warnings import warn
from pylab import rcParams
# Default size applied to figures created after this cell: 10 wide by 5 tall
# (matplotlib expresses figsize in inches).
rcParams['figure.figsize'] = 10, 5  # Change default plot size

## by groups:

In pandas, groups are created by calling `DataFrame.groupby`, which returns a *GroupBy* object.

In [48]:
# Three toy columns. The third has seven values while the first two have six,
# so after index alignment row 6 is NaN in v1/v2 and those columns become float.
s1, s2, s3 = (
    pd.Series(values)
    for values in (
        [1, 1, 2, 2, 3, 3],
        [10, 11, 12, 20, 21, 22],
        [0, 1, 2, 3, 4, 5, 6],
    )
)
# Column name -> Series; pandas aligns the Series on their integer index.
df = pd.DataFrame(dict(v1=s1, v2=s2, v3=s3))
df

Unnamed: 0,v1,v2,v3
0,1.0,10.0,0
1,1.0,11.0,1
2,2.0,12.0,2
3,2.0,20.0,3
4,3.0,21.0,4
5,3.0,22.0,5
6,,,6


In [50]:
# Build a GroupBy object: rows are bucketed by their value in column 'v1'
# (not by the row index). as_index=False keeps 'v1' as an ordinary column
# in aggregated results instead of turning it into the index.
grouped = df.groupby(by='v1', as_index=False)
# Mapping from each group key to the row labels that belong to that group
grouped.groups

{1.0: [0, 1], 2.0: [2, 3], 3.0: [4, 5]}

In [51]:
# Descriptive stats per group: count, mean, std, min, quartiles and max
grouped.describe()

Unnamed: 0,Unnamed: 1,v1,v2,v3
0,count,2,2.0,2.0
0,mean,1,10.5,0.5
0,std,0,0.707107,0.707107
0,min,1,10.0,0.0
0,25%,1,10.25,0.25
0,50%,1,10.5,0.5
0,75%,1,10.75,0.75
0,max,1,11.0,1.0
1,count,2,2.0,2.0
1,mean,2,16.0,2.5


In [53]:
# Per-group minimum of each value column. The value columns are relabelled by
# name so the result is self-describing; 'v1' stays as the group key column.
df_smallest = grouped.min().rename(
    columns={'v2': 'v2_smallest', 'v3': 'v3_smallest'}
)
df_smallest

Unnamed: 0,v1,v2_smallest,v3_smallest
0,1,10,0
1,2,12,2
2,3,21,4


In [54]:
# Iterating a GroupBy yields one (key, sub-frame) pair per group;
# print the key on its own line followed by the rows of that group.
for name, group in grouped:
    print(name, group, sep='\n')

1.0
   v1  v2  v3
0   1  10   0
1   1  11   1
2.0
   v1  v2  v3
2   2  12   2
3   2  20   3
3.0
   v1  v2  v3
4   3  21   4
5   3  22   5


In [56]:
# Select a particular group: the rows of df whose 'v1' value equals 3
grouped.get_group(3)

Unnamed: 0,v1,v2,v3
4,3,21,4
5,3,22,5


### Aggregation

Aggregation can be performed with the `.aggregate()` method or its shorter alias, `.agg()`.

In [58]:
# Sum, mean and standard deviation of every value column within each 'v1' group.
# The reducers are given by name rather than as np.sum / np.mean / np.std:
# recent pandas emits a FutureWarning for NumPy callables here and will stop
# translating them to the GroupBy methods, which would change std from the
# sample estimate (ddof=1, as shown in the output) to NumPy's default ddof=0.
grouped.agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,v2,v2,v2,v3,v3,v3
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,21,10.5,0.707107,1,0.5,0.707107
2,32,16.0,5.656854,5,2.5,0.707107
3,43,21.5,0.707107,9,4.5,0.707107


In [60]:
# Named aggregation on a single column: each keyword is the output column name
# and its value is the reducer to apply. This replaces the {'name': func} dict
# form, which pandas deprecated for renaming and now rejects with a
# SpecificationError ("nested renamer is not supported").
grouped['v3'].agg(total='sum', mean='mean', stddev='std')

Unnamed: 0,v1,total,stddev,mean
0,1,1,0.707107,0.5
1,2,5,0.707107,2.5
2,3,9,0.707107,4.5
