In [None]:
import pandas as pd

# Data Input

https://pandas.pydata.org/pandas-docs/stable/reference/io.html

### Basic Input Methods

In [None]:
import os

if not os.getcwd().endswith("Pandas 1 - Data Inputs"):
    os.chdir(f"{os.getcwd()}/Pandas 1 - Data Inputs")

## csv
csv = pd.read_csv("test.csv",
                    sep=',',
                    delimiter=None,
                    header='infer',
                    names=None,
                    index_col=None
                )

## Same just using read_table
csv_rt = pd.read_table("test.csv",
                        sep=',',
                        delimiter=None,
                        header='infer',
                        names=None,
                        index_col=None
                                 )

## Tab delimited Text File
tab_txt = pd.read_table("test.txt",
                        sep='\t',
                        delimiter=None,
                        header='infer',
                        names=None,
                        index_col=None
                                 )
# Single Xslx tab
xlsx = pd.read_excel("test.xlsx",
                        sheet_name=0,
                        header=0,
                        names=None,
                        index_col=None,
                    )



### ExcelFile Object

In [144]:
xlsx1 = pd.ExcelFile("test1.xlsx")

sheets = xlsx1.sheet_names

for i in sheets:
    print(xlsx1.parse(sheet_name=i))

  col 1  col 2
0     a      1
  col3  col4
0    z    26


In [104]:
## Others
pd.read_clipboard() ## Last copied item on clipboard

'/Users/niall.whelan/Documents/Python/Python - Recap Training/Data Wrangling & Visualisation'

# Series

https://pandas.pydata.org/pandas-docs/stable/reference/series.html

1 Dimensional vector with axis labels

In [53]:
s = pd.Series(data = ['Niall', 'Anja', 'Raj', 'Ed'],
              index = ['SR' , 'JR', 'JR', 'SR'],
              #index = range(4)
             )


s_num = pd.Series(data = [100, 70, 70, 100],
              index = range(4)
             )

print(f"Full Series: \n\n {s} \n\n")
print(s.iloc[2])

Full Series: 

 SR    Niall
JR     Anja
JR      Raj
SR       Ed
dtype: object 


Raj


In [59]:
## Describe
print(s_num.describe())

count      4.000000
mean      85.000000
std       17.320508
min       70.000000
25%       70.000000
50%       85.000000
75%      100.000000
max      100.000000
dtype: float64


In [71]:
## STD
print ((((s_num - s_num.mean())**2).mean())**0.5)
print(s_num.std(ddof = 0)) ## Makes it N-0 elements (Default = 1)

15.0
15.0


In [83]:
## Apply
def func(x):
    return x**2

s_num.apply(func)

0    10000
1     4900
2     4900
3    10000
dtype: int64

In [145]:
## Filtering

s_num[s_num > 90].values
s_num[s_num > 90].index

Int64Index([0, 3], dtype='int64')

# DataFrame
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html

In [177]:
## Create from dictionary
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [179]:
df.dtypes

col1    int64
col2    int64
dtype: object

In [178]:
df.shape

(2, 2)

In [182]:
df.describe()

Unnamed: 0,col1,col2
count,2.0,2.0
mean,1.5,3.5
std,0.707107,0.707107
min,1.0,3.0
25%,1.25,3.25
50%,1.5,3.5
75%,1.75,3.75
max,2.0,4.0


In [183]:
df.corr()

Unnamed: 0,col1,col2
col1,1.0,1.0
col2,1.0,1.0


### .loc &  .iloc (Accessing rows and collumns)

In [236]:
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c', 3: 'c'},
                   'B': {0: 1, 1: 3, 2: 5, 3: 5},
                   'C': {0: 2, 1: 4, 2: 6, 3: 8}})
print(df)

print('\n-------\n')
## Using Standard Selection []
print("DataFrame[] Selection:\n")
print(df['A'])
print(type(df['A']))

print('\n-------\n')
## Using .loc for selection of rows based on label/index
print(".loc for a ROW(s): \n")
print(df.loc[:1])

print('\n-------\n')
print(".iloc for a COLUMN(s): \n")
print(df.iloc[:,:2])


   A  B  C
0  a  1  2
1  b  3  4
2  c  5  6
3  c  5  8

-------

DataFrame[] Selection:

0    a
1    b
2    c
3    c
Name: A, dtype: object
<class 'pandas.core.series.Series'>

-------

.loc for a ROW(s): 

   A  B  C
0  a  1  2
1  b  3  4

-------

.iloc for a COLUMN(s): 

   A  B
0  a  1
1  b  3
2  c  5
3  c  5


In [202]:
## Percentage Change to previous row
csv.iloc[:,1].pct_change()

0         NaN
1   -0.222222
2   -0.142857
Name: age, dtype: float64

# General Functions

### Melt

In [152]:
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c', 3: 'c'},
                   'B': {0: 1, 1: 3, 2: 5, 3: 5},
                   'C': {0: 2, 1: 4, 2: 6, 3: 8}})
print(df)

pd.melt(df, id_vars=['A'], value_vars=['B'])

   A  B  C
0  a  1  2
1  b  3  4
2  c  5  6
3  c  5  8


Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,c,B,5


### Pivot

In [157]:
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
                           'two'],
                   'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'baz': [1, 2, 3, 4, 5, 6],
                   'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
print(df)

df.pivot(index='foo', columns='bar', values='baz')

   foo bar  baz zoo
0  one   A    1   x
1  one   B    2   y
2  one   C    3   z
3  two   A    4   q
4  two   B    5   w
5  two   C    6   t


bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


### Pivot Table

In [203]:
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                         "bar", "bar", "bar", "bar"],
                   "B": ["one", "one", "one", "two", "two",
                         "one", "one", "two", "two"],
                   "C": ["small", "large", "large", "small",
                         "small", "large", "small", "small",
                         "large"],
                   "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                   "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
print(df)

df.pivot_table(values = ['E','D'], columns = 'C', aggfunc = ['sum','mean','max','min'])

     A    B      C  D  E
0  foo  one  small  1  2
1  foo  one  large  2  4
2  foo  one  large  2  5
3  foo  two  small  3  5
4  foo  two  small  3  6
5  bar  one  large  4  6
6  bar  one  small  5  8
7  bar  two  small  6  9
8  bar  two  large  7  9


Unnamed: 0_level_0,sum,sum,mean,mean,max,max,min,min
C,large,small,large,small,large,small,large,small
D,15,18,3.75,3.6,7,6,2,1
E,24,30,6.0,6.0,9,9,4,2


### Cut

In [175]:
s = pd.Series([2, 4, 6, 8, 10])
cut = pd.cut(s, 2)

pd.DataFrame()

0    (1.992, 6.0]
1    (1.992, 6.0]
2    (1.992, 6.0]
3     (6.0, 10.0]
4     (6.0, 10.0]
dtype: category
Categories (2, interval[float64]): [(1.992, 6.0] < (6.0, 10.0]]

# Windows Objects

In [284]:
import random
s_index = range(100)
s_col_1 = [random.randint(0,1000) for i in s_index]
s_col_2 = [round(random.random()*100,2) for i in s_index]


df = pd.DataFrame( list(zip(s_col_1, s_col_2)), 
                   columns =['col_1', 'col_2'],
                  index = s_index
                 ) 

## Rolling Windows
print("Rolling Windows\n")
print(df.iloc[:,1].head(20))
print(df.iloc[:,1].rolling(7).mean().head(20))

## Expanding Windows
print('\n\nExpanding Windows\n')
print(df.iloc[:,1].expanding().sum().tail(20))
print(f"Sum in col = {df.iloc[:,1].sum()}")

Rolling Windows

0     91.70
1     53.03
2     24.66
3     27.87
4     82.68
5     12.51
6     89.94
7      5.77
8     51.57
9     92.17
10    52.72
11    50.54
12    65.46
13    49.12
14    65.53
15    49.66
16    94.93
17    25.00
18    57.60
19    88.39
Name: col_2, dtype: float64
0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6     54.627143
7     42.351429
8     42.142857
9     51.787143
10    55.337143
11    50.745714
12    58.310000
13    52.478571
14    61.015714
15    60.742857
16    61.137143
17    57.177143
18    58.185714
19    61.461429
Name: col_2, dtype: float64


Expanding Windows

80    4071.13
81    4140.78
82    4141.08
83    4234.42
84    4235.24
85    4329.75
86    4383.30
87    4449.83
88    4549.68
89    4621.99
90    4624.80
91    4706.66
92    4788.13
93    4801.30
94    4821.60
95    4921.13
96    5013.83
97    5049.77
98    5121.85
99    5178.42
Name: col_2, dtype: float64
Sum in col = 5178.419999999998
