In [2]:
import pandas as pd
import numpy as np
## Pandas Data Structures
# Series
s_serie = pd.Series([1, -2, 5, 41], index=['a', 'b', 'c', 'd'])
# Arithmetic Operations
s_serie.add(s_serie)
s_serie.sub(s_serie)
s_serie.div(s_serie)
s_serie.mul(s_serie)

a       1
b       4
c      25
d    1681
dtype: int64

In [3]:
s_serie = pd.Series([2, 8, -1, 5], index=['one', 'two', 'three', 'four'])

In [4]:
s_serie

one      2
two      8
three   -1
four     5
dtype: int64

In [5]:
# DataFrame
data = {'Country': ['Iran', 'Turkey', 'Italy'],
  'Capital': ['Tehran', 'Ankara', 'Rome'],
  'Area': [1648000, 783562, 301230]}
country_df = pd.DataFrame(data,columns=['Country', 'Capital', 'Area'])

In [6]:
country_df

Unnamed: 0,Country,Capital,Area
0,Iran,Tehran,1648000
1,Turkey,Ankara,783562
2,Italy,Rome,301230


In [7]:
## I/O
# Read and Write to CSV
country_df.to_csv('country_df.csv', sep=',', encoding='utf-8-sig', index=False)
country_df.to_csv('country_df.txt', sep='\t', encoding='utf-8-sig', index=False)
pd.read_csv('country_df.csv', encoding='utf-8-sig')
pd.read_csv('country_df.csv', encoding='utf-8-sig', header=None, nrows=2)
pd.read_csv('country_df.txt', encoding='utf-8-sig', sep='\t')

Unnamed: 0,Country,Capital,Area
0,Iran,Tehran,1648000
1,Turkey,Ankara,783562
2,Italy,Rome,301230


In [8]:
# Read and Write to Excel
country_df.to_excel('country_df.xlsx', sheet_name='Sheet1', index=False)
pd.read_excel('country_df.xlsx', 'Sheet1')

Unnamed: 0,Country,Capital,Area
0,Iran,Tehran,1648000
1,Turkey,Ankara,783562
2,Italy,Rome,301230


In [9]:
# Read and Write to SQL Query or Database Table
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')
country_df.to_sql('myTable', engine)
pd.read_sql_table('myTable', engine)
pd.read_sql_query("SELECT * FROM myTable;", engine)
pd.read_sql("SELECT * FROM myTable;", engine) #read_sql()is a convenience wrapper around read_sql_table() and read_sql_query()

Unnamed: 0,index,Country,Capital,Area
0,0,Iran,Tehran,1648000
1,1,Turkey,Ankara,783562
2,2,Italy,Rome,301230


In [10]:
## Asking For Help
#help(pd.Series.loc) #Access a group of rows and columns by label(s) or a boolean array.

In [11]:
## Selection: Also see NumPy Arrays
# Getting
s_serie['two'] # Get one element
country_df[1:]

Unnamed: 0,Country,Capital,Area
1,Turkey,Ankara,783562
2,Italy,Rome,301230


In [12]:
# Selecting, Boolean Indexing & Setting
#By Position
country_df.iloc[:,0:2] # Select single value by row & column
country_df.iat[0,2]=country_df.iat[0,2]+1-1 # Select single value by row & column

In [13]:
#By Label
country_df.loc[0, 'Country'] # Select single value by row & column labels or boolean
country_df.at[0, 'Country']

'Iran'

In [14]:
#Boolean Indexing
s_serie[~(s_serie > 1)] # Series s where value is not >1
s_serie[(s_serie < -1) | (s_serie > 2)] # s where value is <-1 or >2
country_df[country_df['Area']>700000] # Use filter to adjust DataFrame

Unnamed: 0,Country,Capital,Area
0,Iran,Tehran,1648000
1,Turkey,Ankara,783562


In [15]:
s_serie[~(s_serie > 4)]

one      2
three   -1
dtype: int64

In [16]:
#Setting
s_serie['one'] = 6 # Set index a of Series s to 6

In [17]:
s_serie

one      6
two      8
three   -1
four     5
dtype: int64

In [18]:
## Dropping
s_serie.drop(['one', 'three']) # Drop values from rows (axis=0)
country_df.drop('Country', axis=1) # Drop values from columns(axis=1)

Unnamed: 0,Capital,Area
0,Tehran,1648000
1,Ankara,783562
2,Rome,301230


In [35]:
## Sort & Rank
print(country_df.sort_index()) # Sort by labels along an axis
print(country_df.sort_values(by='Country')) # Sort by the values along an axis
print(country_df.rank()) # Assign ranks to entries

  Country Capital     Area
0    Iran  Tehran  1648000
1  Turkey  Ankara   783562
2   Italy    Rome   301230
  Country Capital     Area
0    Iran  Tehran  1648000
2   Italy    Rome   301230
1  Turkey  Ankara   783562
   Country  Capital  Area
0      1.0      3.0   3.0
1      3.0      1.0   2.0
2      2.0      2.0   1.0


In [36]:
country_df

Unnamed: 0,Country,Capital,Area
0,Iran,Tehran,1648000
1,Turkey,Ankara,783562
2,Italy,Rome,301230


In [37]:
## Retrieving Series/DataFrame Information
# Basic Information
country_df.shape #(rows,columns)
country_df.index # Describe index
print(country_df.columns) # Describe DataFrame columns
country_df.info() # Info on DataFrame
country_df.count() # Number of non-NA values

Index(['Country', 'Capital', 'Area'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  3 non-null      object
 1   Capital  3 non-null      object
 2   Area     3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


Country    3
Capital    3
Area       3
dtype: int64

In [38]:
country_df.count() # Number of non-NA values

Country    3
Capital    3
Area       3
dtype: int64

In [23]:
# Summary
country_df.sum() # Sum of values
country_df.cumsum() # Cummulative sum of values
country_df.min() # Minimum/maximum values
country_df.describe() # Summary statistics
country_df['Area'].mean() # Mean of values
country_df['Area'].median() # Median of values

783562.0

In [24]:
country_df.describe()

Unnamed: 0,Area
count,3.0
mean,910930.7
std,682359.5
min,301230.0
25%,542396.0
50%,783562.0
75%,1215781.0
max,1648000.0


In [39]:
## Applying Functions
power = lambda x: x*2
print(country_df.applymap(power)) # Apply function element-wise
city_df = pd.DataFrame(np.random.randn(3, 4), columns=list('1234'), index=['Newyork', 'Tehran', 'Washington'])
print(city_df)
diff = lambda x: x.max() - x.min()
print(city_df.apply(diff))
print(city_df.apply(diff,axis=1))

        Country       Capital     Area
0      IranIran  TehranTehran  3296000
1  TurkeyTurkey  AnkaraAnkara  1567124
2    ItalyItaly      RomeRome   602460
                   1         2         3         4
Newyork     0.148263 -0.416139  0.407488  0.323748
Tehran      0.006799  0.860009  2.272987  0.139109
Washington  0.782918 -1.386636 -0.458309  0.877210
1    0.776119
2    2.246645
3    2.731296
4    0.738101
dtype: float64
Newyork       0.823627
Tehran        2.266188
Washington    2.263846
dtype: float64


In [40]:
city_df.apply(diff)

1    0.776119
2    2.246645
3    2.731296
4    0.738101
dtype: float64

In [42]:
## Missing Values
s_serie.loc['two']=None
print(s_serie.isnull())
s_serie.dropna()
s_serie.fillna(s_serie.dropna().mean())

one      False
two       True
three    False
four     False
dtype: bool


one      6.000000
two      3.333333
three   -1.000000
four     5.000000
dtype: float64

In [43]:
s_serie

one      6.0
two      NaN
three   -1.0
four     5.0
dtype: float64

In [44]:
s_serie.fillna(s_serie.dropna().mean())

one      6.000000
two      3.333333
three   -1.000000
four     5.000000
dtype: float64

In [45]:
s_serie.dropna()

one      6.0
three   -1.0
four     5.0
dtype: float64