# Data manipulation and visualization using Pandas and Matplotlib

## 1. Data loading 

### 1.1. Pandas library - options & version

In [1]:
import pandas

# Limit how many rows are shown when a DataFrame is displayed
display_options = pandas.options.display
display_options.max_rows = 10

# Show which pandas version is installed
print(pandas.__version__)

0.20.3


### 1.2. Dataframe Structure

In [2]:
import pandas

# Load the tab-separated file; header=0 takes the column names from the first row
df = pandas.read_table("your_file.txt", header=0, sep="\t")

# Confirm that the loaded object is a DataFrame
loaded_type = type(df)
print(loaded_type)

# Show the first rows of the DataFrame
# df.head()

# Number of rows and columns
# print(df.shape)

# Type of each column
# print(df.dtypes)

# Summary statistics for every column
# df.describe(include='all')

FileNotFoundError: [Errno 2] No such file or directory: 'your_file.txt'

## 2. Variables manipulation

### 2.1. Variables access

In [14]:
import pandas

# File loading
df = pandas.read_table("your_file.txt", sep='\t', header=0)

# NOTE: the column is selected with df['var'] rather than df.var, because
# attribute access resolves to the DataFrame.var() method (variance) and
# not to a column named 'var'. Bracket access works for any column name.

# Variable description
print(df['var'].describe())

# Variable mean
# print(df['var'].mean())

# Value count
# print(df['var'].value_counts())

# Sort a variable's values in ascending order
# print(df['var'].sort_values())

count    270.000000
mean      54.433333
std        9.109067
min       29.000000
25%       48.000000
50%       55.000000
75%       61.000000
max       77.000000
Name: age, dtype: float64


### 2.2. Variable iterations

In [23]:
import pandas

# Read the tab-separated file, using its first row as column names
df = pandas.read_table("your_file.txt", header=0, sep="\t")

# Callback applied to each column
def operation(x):
    """Return the mean of ``x`` (delegates to ``x.mean()``)."""
    column_mean = x.mean()
    return column_mean

# Call operation() on every column of the DataFrame that is kept
# axis=0 ==> each column is passed in turn to operation()
# select_dtypes(exclude=['object']) drops the object-typed (e.g. text) columns
kept_columns = df.select_dtypes(exclude=['object'])
per_column_result = kept_columns.apply(operation, axis=0)
print(per_column_result)

age            54.433333
pression      131.344444
cholester     249.659259
taux_max      149.677778
depression     10.500000
pic             1.585185
dtype: float64


## 3. Indexes in Data Frames


In [24]:
import pandas

# Load the tab-separated file (first row = column names)
df = pandas.read_table("your_file.txt", header=0, sep="\t")

# Value at row 0, column 0
df.iloc[0, 0]

# First 5 rows, every column
# rows ==> :5 (positions 0 to 4, 5 excluded)
# columns ==> : (all columns)
df.iloc[:5, :]

# Last 5 rows
df.iloc[-5:, :]

# First 5 rows & first 2 columns
df.iloc[:5, :2]

# First 5 rows & columns 0, 1 and 4
df.iloc[:5, [0, 1, 4]]

70


## 4. Filtering rows using conditions - queries

In [33]:
import pandas

# File loading
df = pandas.read_table("your_file.txt", sep='\t', header=0)

# Rows whose 'value_set' is one of several values: use isin()
df.loc[df['value_set'].isin(['A', 'B']), :]

# Combining conditions with logical operators
# & == AND, | == OR, ~ == NOT
# Each comparison needs its own parentheses because & and | bind more
# tightly than ==.
df.loc[(df['value_set'] == "A") & (df['value_set2'] == "B"), :]


Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
2,57,masculin,B,124,261,A,A,141,non,3,1,A,presence
4,74,feminin,B,120,269,A,C,121,oui,2,1,B,absence
13,61,masculin,A,134,234,A,A,145,non,26,2,C,presence
18,64,masculin,A,110,211,A,C,144,oui,18,2,A,absence
19,40,masculin,A,140,199,A,A,178,oui,14,1,A,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,58,masculin,B,120,284,A,C,160,non,18,2,A,presence
263,49,masculin,B,130,266,A,A,171,non,6,1,A,absence
264,48,masculin,B,110,229,A,A,168,non,10,3,A,presence
266,44,masculin,B,120,263,A,A,173,non,0,1,A,absence


## 5. Crosstab

In [37]:
import pandas

# File loading
df = pandas.read_table("your_file.txt", sep='\t', header=0)

# Cross-tabulation: counts for each (var1, var2) combination
pandas.crosstab(df['var1'], df['var2'])

# Normalized by row: each row of the table sums to 1
pandas.crosstab(df['var1'], df['var2'], normalize='index')

# Group the rows by the values of the 'thing' column
g = df.groupby('thing')

# Dimensions of the sub-DataFrame of one group.
# get_group() expects one of the values found in the grouped column, not
# the column name itself: replace 'thing_value' with an actual value.
print(g.get_group('thing_value').shape)

coeur,absence,presence
sexe,Unnamed: 1_level_1,Unnamed: 2_level_1
feminin,67,20
masculin,83,100


## 6. Computed values construction

In [48]:
import numpy
import pandas

# File loading
df = pandas.read_table("your_file.txt", sep='\t', header=0)

# Computed values: var * ln(var), evaluated element by element
computed = df['var'] * numpy.log(df['var'])
print(computed)

# DataFrame concat: add the computed Series as an extra column.
# concat() only accepts pandas objects (Series / DataFrame), so the Series
# itself is passed; a plain string such as 'var' raises a TypeError.
print(pandas.concat([df, computed], axis=1).shape)

0      463.085981
1      672.750819
2      570.070229
3      436.682724
4      520.791876
          ...    
265    640.101482
266    654.664807
267    615.878809
268    598.371588
269    454.106803
dtype: float64


## 7. Graphs

In [52]:
import matplotlib.pyplot as plt
import pandas

# Load the tab-separated file (first row = column names)
df = pandas.read_table("your_file.txt", header=0, sep="\t")

# Histogram of 'var', split according to the values of 'var2'
df.hist(by="var2", column="var")

# Density plot
# df['var'].plot.kde()

# Comparing distributions with a boxplot
# central line of the box = median
# edges of the box = quartiles
# whisker ends are computed from 1.5 times the interquartile range
# df.boxplot(column='var', by='var2')

# Scatter plot
# df.plot.scatter(x='var', y='var2', c='var3')

# Hexbin (the second variant uses numpy.mean, so it needs `import numpy`,
# which this cell does not import)
# df.plot.hexbin(x='var', y='var2', gridsize=25)
# df.plot.hexbin(x='var', y='var2', C='var3', reduce_C_function=numpy.mean, gridsize=25)

# Pie chart
# df['var'].value_counts().plot.pie()

# Vectors & matrices using Numpy

## 1. Creating a vector

### 1.1. Via manual input

In [None]:
import numpy as np

# Build a vector from values typed in by hand
a = np.array(
    [1.2, 2.5, 3.2, 1.8],
)

# Type of the stored values
a.dtype

# Number of dimensions
a.ndim

# Length along each dimension
a.shape

# Total number of values
a.size

# Forcing the type: integer inputs stored as floats
b = np.array([1, 2, 4], dtype=float)

# Array whose elements are Python dictionaries
d = np.array(
    [
        {"A": (45, 200)},
        {"B": (34, 150)},
    ]
)

### 1.2. Creating value sequences

In [None]:
import numpy as np

# Arithmetic progression: from 0 up to (but excluding) 10, in steps of 1
a = np.arange(0, 10, 1)

# Vector of five 1s
a = np.ones(5)

# Vector of five copies of the same value
a = np.full(5, 3.2)

# Read the values from a text file, stored as floats
a = np.loadtxt(fname="file.txt", dtype=float)

## 2. Extracting values

In [None]:
import numpy as np 

v = np.array([1.2, 7.4, 4.2, 8.5, 6.3])

# Extraction with an explicit boolean mask: True keeps the value
b = np.array([False, True, False, True, False])
v[b]

# Extraction with a condition
v[v < 7]

# The condition itself yields a boolean vector
b = v < 7

# Basics: largest value, its position, sorted copy, sorting order
np.max(v)
np.argmax(v)
np.sort(v)
np.argsort(v)

# Distinct values
np.unique(np.array([1, 2, 2, 2, 1, 1, 2]))

## 3. Vectors & algebra

In [None]:
import numpy as np 

v = np.array(
    [1.2, 7.4, 4.2, 8.5, 6.3],
)

# Descriptive statistics: mean, median, variance, standard deviation, sum
np.mean(v)
np.median(v)
np.var(v)
np.std(v)
np.sum(v)

# Percentile (here the 50th)
np.percentile(v, q=50)

# Cumulative sum
np.cumsum(v)

# Scalar product of v with itself read in reverse order
np.vdot(v, v[::-1])

# Vector norm
np.linalg.norm(v)

# Logical operations on boolean vectors
a = np.array([True, True, False, True], dtype=bool)
b = np.array([True, False, True, False], dtype=bool)

# Element-wise AND, OR, NOT and XOR.
# logical_not() is unary: a second positional argument is taken as the
# `out` array and would be overwritten with the result, so only `a` is
# passed here and `b` stays intact for the XOR below.
np.logical_and(a, b)
np.logical_or(a, b)
np.logical_not(a)
np.logical_xor(a, b)

# Set operations between two vectors
x = np.array([1, 2, 5, 7])
y = np.array([2, 4, 6, 1])

# Intersection: values present in both x and y
np.intersect1d(x, y)

# Union: values present in x or y
np.union1d(x, y)

# Difference: values of x that are not in y
np.setdiff1d(x, y)

## 4. Matrices

In [None]:
import numpy as np

a = np.array(
    [
        [1.2, 2.5],
        [3.2, 1.8],
        [1.1, 4.3],
    ]
)

# Same basic attributes as for vectors
type(a)
a.dtype
a.ndim
a.shape
a.size

# arange() generates a sequence of values
# reshape() re-organises the data into rows & columns
np.arange(0, 10).reshape((2, 5))

# converting a vector into a matrix: reshape() arranges the 6 values
# into 3 lines x 2 columns (reading .shape alone only reports the
# vector's dimensions, it does not convert anything)
m = np.array([2.1, 3.4, 6.7, 8.1, 3.5, 7.2]).reshape(3, 2)

# matrix filled with the same value
np.full(shape=(2, 4), fill_value=0.1)

# Load a matrix from a tab-delimited text file, values stored as floats
np.loadtxt("file.txt", dtype=float, delimiter="\t")

# Append a row: b holds a single row of two values
b = np.array([[4.1, 2.6]])
np.append(a, b, axis=0)

# Append a column: d holds three rows of one value each
d = np.array([[7.8], [6.1], [5.4]])
np.append(a, d, axis=1)

# Resize the matrix to 2 rows x 3 columns
np.resize(a, (2, 3))

## 5. Matrices & algebra

In [None]:
import numpy as np

x = np.array([[1.2, 2.5], [3.2, 1.8], [1.1, 4.3]])
y = np.array([[2.1, 0.8], [1.3, 2.5]])

# Transpose of x
x_t = np.transpose(x)

# Matrix product x.y
np.dot(x, y)

# Determinant of y
np.linalg.det(y)

# Inverse of y
y_inv = np.linalg.inv(y)

# Solve the linear system y.a = z for a
z = np.array([1.7, 1.0])
np.linalg.solve(y, z)

# Cross-check: a = y^(-1).z
np.dot(y_inv, z)

# Symmetric matrix built as x^t.x
s = np.dot(x_t, x)

# Eigenvalues & eigenvectors of the symmetric matrix
np.linalg.eigh(s)