In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import datetime

In [3]:
# Three integer columns; 'one' contains ties so secondary sort keys matter.
df1 = pd.DataFrame(
    {
        'one': [2, 1, 1, 1],
        'two': [1, 3, 2, 4],
        'three': [5, 4, 3, 2],
    }
)
df1

Unnamed: 0,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [5]:
# Order by 'one' first, breaking ties with 'two' and then 'three'.
df1.sort_values(by=['one', 'two', 'three'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [7]:
# String-dtype Series that already holds one missing entry (position 5).
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    dtype='string',
)
# Blank out label 2 as well, so two entries are missing before sorting.
s.loc[2] = np.nan
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2    <NA>
5    <NA>
dtype: string

In [36]:
# Column 'a' mixes upper- and lower-case labels; the plain sort below is
# case-sensitive, so the upper-case labels come before 'a'.
df = pd.DataFrame(
    [['D', 1, 2], ['a', 5, 6], ['B', 8, 9]],
    columns=['a', 'B', 'c'],
)
df.sort_values(by='a')

Unnamed: 0,a,B,c
2,B,8,9
0,D,1,2
1,a,5,6


In [35]:
# Case-insensitive ordering: compare the upper-cased labels, not the raw ones.
df.sort_values(by='a', key=lambda labels: labels.str.upper())

Unnamed: 0,a,B,c
1,a,5,6
2,B,8,9
0,D,1,2


In [44]:
# Ten integers drawn from {0, 1, 2}. No seed is set in this cell, so the
# values differ from run to run.
s = np.random.randint(low=0, high=3, size=10)
s

array([2, 1, 2, 1, 0, 1, 2, 0, 0, 1])

In [50]:
# Two-level row index; the level names can be used as sort keys below.
idx = pd.MultiIndex.from_tuples(
    [('a', 1), ('a', 2), ('a', 2), ('b', 2), ('b', 1), ('b', 1)],
    names=['first', 'second'],  # level name was misspelled 'fist'
)
# Single column A counting down from 6 to 1.
df = pd.DataFrame({'A': np.arange(6, 0, -1)}, index=idx)
# sort_values accepts an index level name ('second') next to a column ('A').
df.sort_values(['second', 'A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
fist,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5


In [53]:
# 'a' has a tie at -1 (rows 1 and 6); 'c' is the tie-breaker and holds a NaN.
df = pd.DataFrame(
    {
        'a': [-2, -1, 1, 10, 8, 11, -1],
        'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g'],
        'c': [1.0, 2.0, 4.0, 3.0, np.nan, 3.0, 4.0],
    }
)
# Three smallest rows by 'a', falling back to 'c' where 'a' ties.
df.nsmallest(n=3, columns=['a', 'c'])

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,g,4.0


In [2]:
# Download the pandas "basics" user-guide page (needs network access) and keep
# only the HTML tables that contain the text 'String Aliases'.
# NOTE: despite the name, `df` is indexed like a list of tables (the next cell
# takes `df[0]`).
df = pd.read_html('https://pandas.pydata.org/docs/user_guide/basics.html', match='String Aliases')

In [8]:
# Take the first table from the read_html result and display it.
df1 = df[0]
df1

Unnamed: 0,Kind of Data,Data Type,Scalar,Array,String Aliases,Documentation
0,tz-aware datetime,DatetimeTZDtype,Timestamp,arrays.DatetimeArray,"'datetime64[ns, <tz>]'",Time zone handling
1,Categorical,CategoricalDtype,(none),Categorical,'category',Categorical data
2,period (time spans),PeriodDtype,Period,arrays.PeriodArray,"'period[<freq>]', 'Period[<freq>]'",Time span representation
3,sparse,SparseDtype,(none),arrays.SparseArray,"'Sparse', 'Sparse[int]', 'Sparse[float]'",Sparse data structures
4,intervals,IntervalDtype,Interval,arrays.IntervalArray,"'interval', 'Interval', 'Interval[<numpy_dtype...",IntervalIndex
5,nullable integer,"Int64Dtype, …",(none),arrays.IntegerArray,"'Int8', 'Int16', 'Int32', 'Int64', 'UInt8', 'U...",Nullable integer data type
6,nullable integer,"Int64Dtype, …",(none),arrays.IntegerArray,"'Int8', 'Int16', 'Int32', 'Int64', 'UInt8', 'U...",Nullable integer data type
7,Strings,StringDtype,str,arrays.StringArray,'string',Working with text data
8,Boolean (with NA),BooleanDtype,bool,arrays.BooleanArray,'boolean',Boolean data with missing values


In [25]:
# `datetime` is already imported in the notebook's top import cell, so it is
# not re-imported here.
m = ['apple', datetime.datetime(2016, 3, 2)]
# errors='coerce' turns the unparseable 'apple' into NaT instead of raising.
pd.to_datetime(m, errors='coerce')

DatetimeIndex(['NaT', '2016-03-02'], dtype='datetime64[ns]', freq=None)

In [26]:
# Same list as above, converted to numbers instead. Neither entry is numeric;
# in the recorded output errors='coerce' maps both to NaN.
pd.to_numeric(m, errors='coerce')

array([nan, nan])

In [37]:
# One column per dtype; each column is named after the dtype it holds.
df = pd.DataFrame(
    {
        "string": ["a", "b", "c"],
        "int64": [1, 2, 3],
        "uint8": np.arange(3, 6).astype(np.uint8),
        "float64": np.arange(4.0, 7.0),
        "bool1": [True, False, True],
        "bool2": [False, True, False],
        "dates": pd.date_range(start="now", periods=3),
        "category": pd.Series(["A", "B", "C"]).astype("category"),
    }
)
# Columns added afterwards, in this order, so they land at the end of the frame.
df["tdelta"] = df["dates"].diff()  # first entry is NaT, the rest are 1 day
df["uint64"] = np.arange(3, 6).astype(np.uint64)
df["other_dates"] = pd.date_range(start="20130101", periods=3)
df["tz_aware_dates"] = pd.date_range(start="20130101", periods=3, tz="US/Eastern")
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,tdelta,uint64,other_dates,tz_aware_dates
0,a,1,3,4.0,True,False,2021-10-25 10:28:19.726682,A,NaT,3,2013-01-01,2013-01-01 00:00:00-05:00
1,b,2,4,5.0,False,True,2021-10-26 10:28:19.726682,B,1 days,4,2013-01-02,2013-01-02 00:00:00-05:00
2,c,3,5,6.0,True,False,2021-10-27 10:28:19.726682,C,1 days,5,2013-01-03,2013-01-03 00:00:00-05:00


In [40]:
# Ten consecutive daily timestamps starting from the current time.
df2 = pd.DataFrame({"dates": pd.date_range(start="now", periods=10)})
# Row-to-row difference: NaT for the first row, one day for every later row.
df2["diff"] = df2["dates"].diff()
df2

Unnamed: 0,dates,diff
0,2021-10-25 10:30:04.906537,NaT
1,2021-10-26 10:30:04.906537,1 days
2,2021-10-27 10:30:04.906537,1 days
3,2021-10-28 10:30:04.906537,1 days
4,2021-10-29 10:30:04.906537,1 days
5,2021-10-30 10:30:04.906537,1 days
6,2021-10-31 10:30:04.906537,1 days
7,2021-11-01 10:30:04.906537,1 days
8,2021-11-02 10:30:04.906537,1 days
9,2021-11-03 10:30:04.906537,1 days


In [3]:
# Download the pandas IO user-guide page (needs network access) and parse its
# HTML tables. The result is indexed by position in the next cell (`df[0]`).
df = pd.read_html('https://pandas.pydata.org/docs/user_guide/io.html')

In [5]:
# Save the first scraped table to 'pd_methods.csv' in the working directory.
df[0].to_csv('pd_methods.csv')

In [6]:
# StringIO is already imported in the notebook's top import cell, so it is
# not re-imported here.
# Three-column CSV held in memory: a header row plus three data rows.
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
df = pd.read_csv(StringIO(data))
df

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [9]:
# skiprows is given a callable: every odd-numbered line is dropped.
# In the recorded output only the header and the 'a,b,2' line remain.
df1 = pd.read_csv(
    StringIO(data),
    skiprows=lambda line_no: line_no % 2 != 0,
)
df1

Unnamed: 0,col1,col2,col3
0,a,b,2


In [13]:
# One column holding 500,000 ints, then the strings 'a' and 'b', then another
# 500,000 ints.
col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000))
df = pd.DataFrame({'col_1': col_1})
# Written with the default index; it appears as 'Unnamed: 0' when the file is
# read back in the following cells.
df.to_csv('foo.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# Read the mixed int/str column back with default settings. The following
# cells inspect which dtypes and Python types ended up in 'col_1'.
mixed_df = pd.read_csv('foo.csv')

In [15]:
# Per-column dtypes; 'col_1' is reported as object in the recorded output.
mixed_df.dtypes

Unnamed: 0     int64
col_1         object
dtype: object

In [21]:
# Count how many values of 'col_1' came back as each Python type.
mixed_df['col_1'].map(type).value_counts()

<class 'int'>    737858
<class 'str'>    262144
Name: col_1, dtype: int64

In [21]:
# StringIO is already imported in the notebook's top import cell, so it is
# not re-imported here.
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
# dtype='category' is applied to every column of the parsed frame.
df = pd.read_csv(StringIO(data), dtype='category')
df

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [33]:
# Only 'col1' is forced to categorical; the other columns keep their inferred
# dtypes (object and int64 in the recorded output).
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [34]:
from pandas.api.types import CategoricalDtype

# Ordered categorical with an explicit category order (d < c < b < a),
# applied to 'col1' only.
dtype = CategoricalDtype(categories=['d', 'c', 'b', 'a'], ordered=True)
pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [37]:
# Two junk lines precede the actual records and there is no header row.
data = 'skip this skip this\n continue skipping\n1,2,3\n4,5,6'
# Skip the first two lines and supply the column names explicitly.
pd.read_csv(
    StringIO(data),
    skiprows=2,
    names=['foo', 'bar', 'baz'],
)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6


In [5]:
# The header repeats the name 'a'; pandas renames the second one to 'a.1'.
data = 'a,b,a\n0,1,2\n3,4,5'
# Parse, then discard the renamed duplicate so only 'a' and 'b' remain.
df = pd.read_csv(StringIO(data)).drop(columns=['a.1'])
df

Unnamed: 0,a,b
0,0,1
1,3,4


In [6]:
# CSV text with one '#'-prefixed line between the header and the records.
data = '\n'.join(['na,b,c', '#commented line', '1,2,3', '4,5,6'])
# comment='#' makes the parser ignore the commented line.
df = pd.read_csv(StringIO(data), comment='#')
df

Unnamed: 0,na,b,c
0,1,2,3
1,4,5,6


In [12]:
# Three leading comment lines, then a header and four records.
# The empty last item supplies the trailing newline.
data = "\n".join(
    [
        "# empty",
        "# second empty line",
        "# third emptyline",
        "X,Y,Z",
        "1,2,3",
        "A,B,C",
        "1,2.,4.",
        "5.,NaN,10.0",
        "",
    ]
)
# With comment='#' the leading comment lines are dropped before the header.
pd.read_csv(StringIO(data), comment='#')

Unnamed: 0,X,Y,Z
0,1,2,3
1,A,B,C
2,1,2.,4.
3,5.,,10.0


In [16]:
# Every data row ends with a trailing delimiter, so each row has one more
# field than the three-name header.
data = 'a,b,c\n4, apple, bat, \n8, orange, cow, '
# Alternative kept for comparison: the same call with index_col=False
# instead of index_col=0.
# With index_col=0 the recorded output uses the leading 4 / 8 values as the
# row index.
pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0)

Unnamed: 0,b,c
4,bat,
8,cow,


In [35]:
# Reads the headerless file 'tmp.csv' from the working directory.
# parse_dates is a dict: each key names a new datetime column built by
# combining the listed source columns (1+2 and 1+3).
# NOTE(review): 'norminal' looks like a typo for 'nominal'; the key becomes the
# output column name, so confirm before renaming.
pd.read_csv('tmp.csv', header=None, parse_dates={'norminal': [1,2], 'actual': [1,3]})

Unnamed: 0,norminal,actual,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [53]:
# Single-column CSV whose two timestamps carry different UTC offsets
# (+05:00 and +06:00). The text starts with a blank line.
content = """
a
2000-01-01T00:00:00+05:00
2000-01-01T00:00:00+06:00"""
# Ask read_csv to parse column 'a' as dates; the recorded output keeps each
# value's own offset.
pd.read_csv(StringIO(content), parse_dates=['a'])

Unnamed: 0,a
0,2000-01-01 00:00:00+05:00
1,2000-01-01 00:00:00+06:00


In [58]:
# read_csv's `date_parser` keyword is deprecated (pandas 2.0). For
# mixed-offset input, read the column as text and convert it afterwards with
# utc=True; this yields the same tz-aware UTC column.
pd.read_csv(StringIO(content)).assign(
    a=lambda frame: pd.to_datetime(frame['a'], utc=True)
)

Unnamed: 0,a
0,1999-12-31 19:00:00+00:00
1,1999-12-31 18:00:00+00:00


In [61]:
# CSV text whose first column holds ISO dates; the text starts with a blank line.
s = """
date,A,B,C   
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5
"""
# parse_dates=True is what makes the index column a DatetimeIndex. The former
# infer_datetime_format=True had no effect without it and is deprecated in
# pandas 2.0, so it is dropped.
pd.read_csv(StringIO(s), index_col=0, parse_dates=True)

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [79]:
# Dates written as N/6/2000 are ambiguous between day-first and month-first.
s = """
date,value,cat
1/6/2000,5,a
2/6/2000,10,b
3/6/2000,15,c
"""
# Default parsing of the 'date' column; the recorded output reads the first
# number as the month.
pd.read_csv(StringIO(s), parse_dates=['date'])

Unnamed: 0,date,value,cat
0,2000-01-06,5,a
1,2000-02-06,10,b
2,2000-03-06,15,c


In [80]:
# Same text as the previous cell, but with dayfirst=True; the recorded output
# reads the first number as the day (2000-06-01, 2000-06-02, ...).
pd.read_csv(StringIO(s), parse_dates=[0], dayfirst=True)

Unnamed: 0,date,value,cat
0,2000-06-01,5,a
1,2000-06-02,10,b
2,2000-06-03,15,c


In [86]:
# Decimal literal with far more digits than a float64 can represent.
val = "0.3066101993807095471566981359501369297504425048828125"
# One-row CSV with the long literal placed in column 'c'.
data = f'a,b,c\n1,2,{val}'
# Python's own conversion, shown as the reference value.
float(val)

0.30661019938070955

In [85]:
# C engine with the default float converter (float_precision=None). In the
# recorded output the result differs from float(val) in the last digit.
abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0])

0.3066101993807095

In [88]:
# Same read with float_precision='high'. The recorded result equals the one
# obtained with the default converter.
abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0])

0.3066101993807095

In [87]:
# Same read with float_precision='round_trip'. The recorded result matches
# float(val) exactly.
abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0])

0.30661019938070955

In [89]:
# Pipe-delimited sample whose 'level' values use ',' as a thousands separator.
# The header and the continuation lines keep their leading whitespace inside
# the string literal.
data = """  ID|level|category
                        Patient1|123,000|x
                        Patient2|23,000|y
                        Patient3|1,234,018|z"""
# First read: only the delimiter is specified, no thousands handling.
pd.read_csv(StringIO(data), sep='|')

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [90]:
# Second read: thousands=',' tells the parser that commas inside a field are
# digit-group separators.
pd.read_csv(StringIO(data), sep='|', thousands=',')

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [94]:
# The header names one column ('level') while every data row has two fields.
# In the recorded output of the next cell the first field becomes the index.
s = """level
Patient1,123000
Patient2,23000
Patient3,1234018"""
print(s)

level
Patient1,123000
Patient2,23000
Patient3,1234018


In [98]:
# read_csv's `squeeze=True` keyword was deprecated in pandas 1.4 and removed in
# 2.0. DataFrame.squeeze("columns") gives the same result: a one-column frame
# collapses to a Series.
out = pd.read_csv(StringIO(s)).squeeze("columns")
out

Patient1     123000
Patient2      23000
Patient3    1234018
Name: level, dtype: int64

In [101]:
# The second data row ('4,5,6,7') has four fields against a three-name header.
data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10'
# on_bad_lines='warn' skips the malformed row and emits a warning instead of
# raising (see the "Skipping line 3" message in the recorded output).
pd.read_csv(StringIO(data), on_bad_lines='warn')

b'Skipping line 3: expected 3 fields, saw 4\n'


Unnamed: 0,a,b,c
0,1,2,3
1,8,9,10


In [102]:
# Restricting the read to the first three fields keeps the over-long row: in
# the recorded output it is parsed as 4,5,6 and the extra field is ignored.
pd.read_csv(StringIO(data), usecols=[0,1,2])

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,8,9,10


In [148]:
# Comma-separated sample with inconsistent spacing after the commas.
data = '\n'.join(['a, b, c', '1, 2, 3', '4, 5, 6', '7,8,9', '10,11,12'])
# Fixed-width column boundaries as (start, stop) character offsets.
colspecs = [
    (0, 6),
    (8, 20),
    (21, 33),
    (34, 43),
]
# Fixed-width layout expressed as a list of field widths.
widths = [6, 14, 13, 10]

In [2]:
# The header names three columns while each data row has four fields; in the
# recorded output the leading date field is used as the row index.
foo = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5"""
# parse_dates=True asks read_csv to parse the index as dates (shown as
# 2009-01-01 etc. in the recorded output).
pd.read_csv(StringIO(foo), parse_dates=True)

Unnamed: 0,A,B,C
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [153]:
# Fixed-width read of 'bar.csv' using the `colspecs` offsets defined in an
# earlier cell; no header row, first extracted field used as the index.
# NOTE(review): the recorded output is mostly empty/garbled, which suggests
# these colspecs do not match the layout of bar.csv. Confirm the file format.
pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0)

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
",a, b,",,,
012,,,
145,,,
278,,,
3101,12.0,,


In [4]:
# Show the raw file. The context manager closes the handle right after
# reading instead of leaving it open.
with open('mindex_ex.csv') as fh:
    print(fh.read())

year,indiv,zit,xit
1977,"A",1.2,.6
1977,"B",1.5,.5
1977,"C",1.7,.8
1978,"A",.2,.06
1978,"B",.7,.2
1978,"C",.8,.3
1978,"D",.9,.5
1978,"E",1.4,.9
1979,"C",.2,.15
1979,"D",.14,.05
1979,"E",.5,.15
1979,"F",1.2,.5
1979,"G",3.4,1.9
1979,"H",5.4,2.7
1979,"I",6.4,1.2


In [7]:
# Use the first two columns of the file (year, indiv) as a two-level row index.
df = pd.read_csv('mindex_ex.csv', index_col=[0,1])

In [9]:
# Select every row whose outer index level (year) is 1978. The recorded
# result is indexed by the remaining 'indiv' level.
df.loc[1978]

Unnamed: 0_level_0,zit,xit
indiv,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.2,0.06
B,0.7,0.2
C,0.8,0.3
D,0.9,0.5
E,1.4,0.9


In [10]:
# Show the raw file. The context manager closes the handle right after
# reading instead of leaving it open.
with open('mi2.csv') as fh:
    print(fh.read())

,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [13]:
# The first two lines of the file form a two-level column header; the first
# column ('one', 'two') becomes the row index.
pd.read_csv('mi2.csv', header=[0,1], index_col=0)

Unnamed: 0_level_0,a,a,a,b,c,c
Unnamed: 0_level_1,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [17]:
# sep=None together with the python engine lets pandas detect the delimiter
# of 'tmp2.csv' automatically; the first column is used as the index.
pd.read_csv('tmp2.csv', engine='python', sep=None, index_col=0)

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.27186
4,-0.424972,0.56702,0.276232,-1.087401
5,-0.67369,0.113648,-1.478427,0.524988
6,0.404705,0.577046,-1.715002,-1.039268
7,-0.370647,-1.157892,-1.344312,0.844885
8,1.07577,-0.10905,1.643563,-1.469388
9,0.357021,-0.6746,-1.776904,-0.968914


In [20]:
# chunksize=4 makes read_csv return an iterable reader. Each iteration yields
# a DataFrame of up to four rows (4 + 4 + 2 in the recorded output).
# A stray no-op `reader` expression inside the block was removed.
with pd.read_csv('tmp.sv', sep='|', chunksize=4) as reader:
    for chunk in reader:
        print(chunk)

   Unnamed: 0         0         1         2         3
0           0  0.469112 -0.282863 -1.509059 -1.135632
1           1  1.212112 -0.173215  0.119209 -1.044236
2           2 -0.861849 -2.104569 -0.494929  1.071804
3           3  0.721555 -0.706771 -1.039575  0.271860
   Unnamed: 0         0         1         2         3
4           4 -0.424972  0.567020  0.276232 -1.087401
5           5 -0.673690  0.113648 -1.478427  0.524988
6           6  0.404705  0.577046 -1.715002 -1.039268
7           7 -0.370647 -1.157892 -1.344312  0.844885
   Unnamed: 0         0        1         2         3
8           8  1.075770 -0.10905  1.643563 -1.469388
9           9  0.357021 -0.67460 -1.776904 -0.968914


In [26]:
# iterator=True returns a reader object; get_chunk(5) pulls the first five rows.
# The chunk gets its own name, so `reader` is not rebound to a DataFrame
# inside its own `with` block.
with pd.read_csv('tmp.sv', sep='|', iterator=True) as reader:
    first_chunk = reader.get_chunk(5)
print(first_chunk)

   Unnamed: 0         0         1         2         3
0           0  0.469112 -0.282863 -1.509059 -1.135632
1           1  1.212112 -0.173215  0.119209 -1.044236
2           2 -0.861849 -2.104569 -0.494929  1.071804
3           3  0.721555 -0.706771 -1.039575  0.271860
4           4 -0.424972  0.567020  0.276232 -1.087401


In [29]:
# Download the tab-separated 'cu.item' table from the BLS site (needs network
# access).
df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', sep='\t')

In [30]:
# Display the downloaded table. The recorded output is abbreviated with a
# '...' row in the middle.
df

Unnamed: 0,item_code,item_name,display_level,selectable,sort_sequence
0,AA0,All items - old base,0,T,2
1,AA0R,Purchasing power of the consumer dollar - old ...,0,T,399
2,SA0,All items,0,T,1
3,SA0E,Energy,1,T,374
4,SA0L1,All items less food,1,T,358
...,...,...,...,...,...
394,SS68023,Tax return preparation and other accounting fees,4,T,352
395,SSEA011,College textbooks,3,T,314
396,SSFV031A,Food at elementary and secondary schools,3,T,122
397,SSGE013,Infants' equipment,3,T,355
