## Introduction to Data Analysis with Python

### Python has a very rich data analysis ecosystem, supported by the academic community

### We will focus only on a specific library - Pandas

In [17]:
# The work with Pandas usually begins like this:
import pandas as pd

# Load the contents of the CSV file into a Pandas DataFrame
df = pd.read_csv(filepath_or_buffer='sample.csv')

# But reading a file is not the only way - a DataFrame can also be built from plain Python lists
numbers = [1, 2, 3, 4, 5]
words = ['one', 'two', 'three', 'four', 'five']

# Create Series (one-dimensional, array-like structures) from the lists
number_series = pd.Series(numbers)
word_series = pd.Series(words)

# Map each column name of our DataFrame to the list holding that column's values
number_word_dict = {
    'numbers': numbers,
    'words': words
}

# from_dict is a classmethod, so call it on the DataFrame class itself
# instead of creating a throwaway empty DataFrame instance first
df2 = pd.DataFrame.from_dict(number_word_dict)

# Output the DataFrame just by writing the variable name (one output per cell)
df2

Unnamed: 0,numbers,words
0,1,one
1,2,two
2,3,three
3,4,four
4,5,five


In [21]:
# Counting the rows the classic Python way: len() of the DataFrame's index is the number of rows
len(df2.index)

5

### Explore the DataFrame

In [31]:
# A notebook cell only renders the value of its LAST expression, so each
# intermediate result is passed to display() to make it show up as well.

# Show the first rows
display(df2.head())

# Show the last 2 rows
display(df2.tail(2))

# Show a random sample of 3 rows
display(df2.sample(3))

# Calculate statistical measures of the DataFrame
display(df2.describe())

# Column dtypes, non-null counts and memory footprint (info() prints its report itself)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
numbers    5 non-null int64
words      5 non-null object
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes


### Modifying the DataFrame

In [43]:
# New column? Why not - the scalar value is assigned to every row
df2['column'] = 1

# Why not make it more exciting - a value built from two existing columns.
# Vectorized string concatenation creates the whole column at once instead of
# calling a Python lambda for every row; the numbers are cast to str first so
# they can be joined with the words.
df2['numbers_and_words'] = df2['numbers'].astype(str) + ' - ' + df2['words']

df2

Unnamed: 0,numbers,words,numbers_and_words,column
0,1,one,1 - one,1
1,2,two,2 - two,1
2,3,three,3 - three,1
3,4,four,4 - four,1
4,5,five,5 - five,1


In [47]:
# Take the 'words' column and pick its second entry by position (positions start at 0)
df2['words'].iloc[1]

'two'

### Filtering data in DataFrame