# Stata Functions in Python

In [60]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas import Series, DataFrame, Panel, datetime
from datetime import timedelta
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
import io
import os
import random
import statsmodels.stats.api as sms
from warnings import warn
from pylab import rcParams
rcParams['figure.figsize'] = (10, 5)  # default figure size: width 10, height 5

# Simple Functions

In [83]:
# 5x5 demo frame: the integers 0..24 filled row by row under columns A-E.
df = pd.DataFrame(data=np.arange(25).reshape(5, 5), columns=list('ABCDE'))
df

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


## rename var1 var2

To rename a variable in a DataFrame, call .rename(columns={'var1': 'var2'})

In [85]:
# Rebind `df` to the renamed copy rather than using inplace=True, which gives
# no speed-up and prevents method chaining.
df = df.rename(columns={'E': 'F'})
df

Unnamed: 0,A,B,C,D,F
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


## drop if var==cond

To drop rows that satisfy a condition,
e.g., `drop if A < 6 | B > 20`:

In [61]:
# Rebuild the 5x5 demo frame (integers 0..24, columns A-E) so this section
# starts from the original column names.
df = pd.DataFrame(data=np.arange(25).reshape(5, 5), columns=list('ABCDE'))
df

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [62]:
# Keep every row that does NOT match the drop condition (A below 6 or B above 20).
df.loc[~((df.A < 6) | (df.B > 20))]

Unnamed: 0,A,B,C,D,E
2,10,11,12,13,14
3,15,16,17,18,19


## replace var = value if cond

To change the value of some rows if a condition is true:

df.loc[selection criteria, columns I want] = value

In [63]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` accepts the
# same boolean mask + column label and is the supported replacement.
df.loc[df.A > 10, 'A'] = 999
df

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,999,16,17,18,19
4,999,21,22,23,24


## sort by var1 var2

To sort according to multiple criteria:

df.sort_values(by=['var1', 'var2'], inplace=True)

In [64]:
# Order rows by A, then by B, largest first in both; returns a sorted copy.
df.sort_values(by=['A', 'B'], ascending=[False, False])

Unnamed: 0,A,B,C,D,E
4,999,21,22,23,24
3,999,16,17,18,19
2,10,11,12,13,14
1,5,6,7,8,9
0,0,1,2,3,4


## Find the number of missing observations by columns

In [81]:
# Count the missing (NaN) entries in each column.
# NOTE(review): the output below lists v1/v2/v3, so this cell appears to have
# been run after `df` was rebuilt in the "by groups" section — confirm cell order.
df.isnull().sum()

v1    1
v2    1
v3    0
dtype: int64

## by groups:

In Pandas, groups can be created using a *groupby* object.

In [65]:
# v3 has one more element than v1 and v2, so after index alignment the last
# row of the frame holds NaN in v1 and v2.
s1, s2, s3 = (
    pd.Series(values)
    for values in ([1, 1, 2, 2, 3, 3],
                   [10, 11, 12, 20, 21, 22],
                   [0, 1, 2, 3, 4, 5, 6])
)
df = pd.DataFrame({'v1': s1, 'v2': s2, 'v3': s3})
df

Unnamed: 0,v1,v2,v3
0,1.0,10.0,0
1,1.0,11.0,1
2,2.0,12.0,2
3,2.0,20.0,3
4,3.0,21.0,4
5,3.0,22.0,5
6,,,6


In [66]:
# Build the GroupBy object: rows are bucketed by their value in column v1.
# as_index=False keeps v1 as an ordinary column in aggregated results.
grouped = df.groupby(by='v1', as_index=False)
grouped.groups  # maps each v1 value to the row labels in that group

{1.0: [0, 1], 2.0: [2, 3], 3.0: [4, 5]}

In [67]:
# Per-group summary statistics (count, mean, std, min, quartiles, max)
grouped.describe()

Unnamed: 0,Unnamed: 1,v1,v2,v3
0,count,2,2.0,2.0
0,mean,1,10.5,0.5
0,std,0,0.707107,0.707107
0,min,1,10.0,0.0
0,25%,1,10.25,0.25
0,50%,1,10.5,0.5
0,75%,1,10.75,0.75
0,max,1,11.0,1.0
1,count,2,2.0,2.0
1,mean,2,16.0,2.5


In [68]:
# Smallest value of every column within each v1 group.
# Rename by column label instead of assigning a positional list to `.columns`,
# so a change in column order or count cannot silently mislabel the result.
df_smallest = grouped.min().rename(
    columns={'v2': 'v2_smallest', 'v3': 'v3_smallest'})
df_smallest

Unnamed: 0,v1,v2_smallest,v3_smallest
0,1,10,0
1,2,12,2
2,3,21,4


In [69]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for name, group in grouped:
    print(name, group, sep='\n')

1.0
   v1  v2  v3
0   1  10   0
1   1  11   1
2.0
   v1  v2  v3
2   2  12   2
3   2  20   3
3.0
   v1  v2  v3
4   3  21   4
5   3  22   5


In [70]:
# Pull out the rows belonging to a single group (here the group where v1 == 3)
grouped.get_group(3)

Unnamed: 0,v1,v2,v3
4,3,21,4
5,3,22,5


### Aggregation

Aggregation can be performed via .aggregate() or .agg() method.

In [71]:
# Name the reducers by string. Passing np.sum / np.mean / np.std makes recent
# pandas emit a FutureWarning: today they are swapped for the pandas group
# methods, but a future release will call the NumPy functions directly
# (np.std would then use ddof=0 instead of the sample standard deviation).
grouped.agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,v2,v2,v2,v3,v3,v3
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,21,10.5,0.707107,1,0.5,0.707107
2,32,16.0,5.656854,5,2.5,0.707107
3,43,21.5,0.707107,9,4.5,0.707107


In [72]:
# Dict-based renaming ({'new_name': func}) on a single column was deprecated in
# pandas 0.20 and raises SpecificationError since 1.0. Named aggregation
# (keyword=reducer, pandas >= 0.25) is the supported way to label the outputs;
# the columns come out in the order the keywords are written.
grouped['v3'].agg(total='sum',
                  mean='mean',
                  stddev='std')

Unnamed: 0,v1,mean,stddev,total
0,1,0.5,0.707107,1
1,2,2.5,0.707107,5
2,3,4.5,0.707107,9


### Play around with data within each group 

Group-specific calculations come up a lot. In the simple case where the computation only uses the values from a single column, grouped.transform(lambda x: (x - x.mean()) / x.std()) can be a useful method.

For group-specific calculations that require input from multiple columns, grouped.apply() is the one that saves the day.

In [82]:
def add_v4(grp):
    """Return a copy of ``grp`` with a new column ``v4``.

    ``v4`` is the sum of the group's ``v2`` values minus each row's own ``v1``.

    Parameters
    ----------
    grp : pandas.DataFrame
        One group's rows; must contain the columns ``v1`` and ``v2``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``v4`` appended. ``grp`` itself is left untouched,
        because pandas does not support functions that mutate the object
        ``groupby().apply`` passes to them.
    """
    return grp.assign(v4=grp['v2'].sum() - grp['v1'])

# Split the rows on v1, run add_v4 on each group's sub-frame, and combine the results
df.groupby(by='v1', as_index=False).apply(add_v4)

   v1  v2  v3  v4
0   1  10   0  20
1   1  11   1  20
   v1  v2  v3  v4
0   1  10   0  20
1   1  11   1  20
   v1  v2  v3  v4
2   2  12   2  30
3   2  20   3  30
   v1  v2  v3  v4
4   3  21   4  40
5   3  22   5  40


Unnamed: 0,v1,v2,v3,v4
0,1.0,10.0,0.0,20.0
1,1.0,11.0,1.0,20.0
2,2.0,12.0,2.0,30.0
3,2.0,20.0,3.0,30.0
4,3.0,21.0,4.0,40.0
5,3.0,22.0,5.0,40.0
6,,,,


## Looping over rows: forvalues x = 1/_N { ... }

To iterate over rows, there is a .iterrows() method in DataFrame:

for index, row in df.iterrows():

    print(row['c1'], row['c2'])


In [33]:
# iterrows() yields an (index label, row) pair for every row of the frame;
# individual cells are then read from the row by column name.
for index, row in df.iterrows():
    print(index, row['v1'], row['v3'])

0 1.0 0.0
1 1.0 1.0
2 2.0 2.0
3 2.0 3.0
4 3.0 4.0
5 3.0 5.0
6 nan 6.0


# Other issues

## Comparing with missing: if var == .

In Python, missing values are represented by np.nan, and an equality comparison with a missing value always returns False (even np.nan == np.nan). Call np.isnan() (or pd.isnull()) when you need to test for a missing value.

In [76]:
# Row 6 of v1 is the missing value. `.loc` replaces `.ix`, which was removed
# in pandas 1.0.
print(df.v1.loc[6] is np.nan)    # identity check against np.nan
print(df.v1.loc[6] == np.nan)    # NaN never compares equal, even to itself
print(np.isnan(df.v1.loc[6]))    # the reliable way to detect a missing value

False
False
True


## Show percentage completed...

A simple progress-bar widget available in Jupyter notebooks does the trick.

In [79]:
from ipywidgets import FloatProgress
from IPython.display import display
from time import sleep

# Progress bar whose value runs from 0 (empty) to 1 (full).
f = FloatProgress(min=0, max=1)
display(f)
# Count 1..100 so the bar reaches its max (1.0) after the final step; counting
# 0..99 left it stuck at 0.99.
for i in range(1, 101):
    sleep(1.1)  # placeholder delay for one step
    f.value = i / 100