In [4]:
import pandas as pd
import random
import numpy as np

N=1000
random.seed(2022)

data = pd.DataFrame({'group':np.random.choice(['a','b','c'], N), 'ints': np.random.binomial(10,0.5,N), 'reals': np.random.normal(0,1,N)})

#I/O commands
data.to_csv("Desktop/Data_Ex_1_Python.csv", index = False)
data.head()


In [5]:
%reset

In [25]:
import pandas as pd
# Read the comma-separated file back in; its first line holds the column names.
data = pd.read_csv(
    filepath_or_buffer="Desktop/Data_Ex_1_Python.csv",
    sep=",",
    header=0,
)
data.head(5)

Unnamed: 0,group,ints,reals
0,b,6,-0.954936
1,a,5,-0.502914
2,a,5,-2.365702
3,c,4,-0.078547
4,b,4,0.107248


In [26]:
# Select the row at integer position 10. iloc is zero-based, so this is the
# 11th row of the frame (use data.iloc[9] for the 10th).
data.iloc[10]

group           a
ints            3
reals    0.472507
Name: 10, dtype: object

In [27]:
# Pull out every row of the "group" column as a Series.
data.loc[:, "group"]


0      b
1      a
2      a
3      c
4      b
      ..
995    c
996    b
997    a
998    a
999    a
Name: group, Length: 1000, dtype: object

In [28]:
# Per-group row count and mean of `ints`, computed in a single grouped pass
# instead of filling an empty frame from two separate groupby calls.
# 'size' (rather than 'count') keeps the counting rule of the .size() call:
# every row in the group is counted.
stats = data.groupby('group')['ints'].agg(count='size', mean='mean')
stats

Unnamed: 0_level_0,count,mean
group,Unnamed: 1_level_1,Unnamed: 2_level_1
a,329,4.990881
b,334,5.008982
c,337,4.952522


In [31]:
# Store reals / ints element-wise in a new "ratio" column, then display the
# frame ordered by group label (ascending).
data["ratio"] = data["reals"].div(data["ints"])
data.sort_values("group")

Unnamed: 0,group,ints,reals,ratio
999,a,4,0.991452,0.247863
315,a,6,0.526527,0.087754
671,a,7,0.293198,0.041885
673,a,5,-1.102843,-0.220569
311,a,8,0.837291,0.104661
...,...,...,...,...
701,c,2,-0.246822,-0.123411
286,c,7,1.406589,0.200941
287,c,4,1.826719,0.456680
275,c,3,0.261140,0.087047


In [51]:
from sklearn import datasets
import numpy as np
iris = datasets.load_iris()
# One frame holding the feature columns plus the numeric class label.
iris_df = pd.DataFrame(
    data=np.c_[iris['data'], iris['target']],
    columns=iris['feature_names'] + ['target'],
)
# Attach the species name that corresponds to each numeric target code.
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# Draw one flower per species. A fixed random_state makes the draw (and the
# table shown below) repeatable; observed=True states the grouping rule for the
# categorical key explicitly instead of relying on the library default.
mini_iris = iris_df.groupby(['species'], observed=True).sample(1, random_state=2022)
# Reshape from wide to long: one (flower_att, measurement) pair per row.
# NOTE(review): no id_vars are given, so `target` and `species` are melted into
# the measurement column too -- confirm that this is intended.
pd.melt(mini_iris, var_name="flower_att", value_name="measurement")

Unnamed: 0,flower_att,measurement
0,sepal length (cm),5.7
1,sepal length (cm),5.9
2,sepal length (cm),6.3
3,sepal width (cm),4.4
4,sepal width (cm),3.2
5,sepal width (cm),2.8
6,petal length (cm),1.5
7,petal length (cm),4.8
8,petal length (cm),5.1
9,petal width (cm),0.4


### Parallelisation RHadoop

In [52]:
# Load the CSV into a separate frame, `datap`, used by the cells that follow.
datap = pd.read_csv(
    filepath_or_buffer="Desktop/Data_Ex_1_Python.csv",
    sep=",",
    header=0,
)
datap.head(5)

Unnamed: 0,group,ints,reals
0,b,6,-0.954936
1,a,5,-0.502914
2,a,5,-2.365702
3,c,4,-0.078547
4,b,4,0.107248


In [61]:
# Column totals for every column except the label column `group`.
# The Series returned by .sum() is bound directly; no placeholder frame is
# created first.
data_sum = datap.loc[:, ~datap.columns.isin(['group'])].sum()
data_sum

ints     4984.000000
reals      40.951234
dtype: float64

In [72]:
# Square every value of the non-`group` columns. Despite its name, `data_abs`
# holds squares, not absolute values.
keep_cols = ~datap.columns.isin(['group'])
data_abs = np.square(datap.loc[:, keep_cols])
data_abs

Unnamed: 0,ints,reals
0,36,0.911903
1,25,0.252922
2,25,5.596545
3,16,0.006170
4,16,0.011502
...,...,...
995,9,0.006724
996,25,0.005638
997,16,1.589046
998,25,0.676852


In [84]:
import time

#for loop

N=3000   # side length of each random square matrix
num=12   # number of matrices to generate and sum
zeros = [0]*num
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and can jump if that
# clock is adjusted mid-run.
start = time.perf_counter()
for i in range(num):
    A = np.random.rand(N,N)
    zeros[i] = np.sum(A)
stop = time.perf_counter()
print("Runtime: ",stop-start)


Runtime:  0.5580956935882568


In [95]:
# for loop parallelized with joblib

from joblib import Parallel, delayed

def sumElements(i, n=None):
    """Return the sum of a freshly drawn ``n`` x ``n`` uniform random matrix.

    Parameters
    ----------
    i : int
        Task index supplied by the caller; it is not used in the computation.
    n : int, optional
        Side length of the square matrix. When omitted, the notebook-level
        ``N`` is used, so existing ``sumElements(i)`` calls behave as before.

    Returns
    -------
    numpy.float64
        Sum of all entries of the random matrix.
    """
    # Only fall back to the global when no explicit size was passed, so the
    # function can be called without any notebook state.
    side = N if n is None else n
    return np.sum(np.random.rand(side, side))

# Run the same `num` tasks through joblib with two workers and time the whole
# batch. perf_counter() is monotonic, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
results = Parallel(n_jobs=2)(delayed(sumElements)(i) for i in range(num))
stop = time.perf_counter()
print("Runtime: ",stop-start)

results

Runtime:  0.3229191303253174


[4501701.370083759,
 4499664.182185877,
 4500408.267951957,
 4499305.746220831,
 4500971.945754516,
 4499603.31322136,
 4501261.671116629,
 4499508.371180943,
 4499329.659198114,
 4500189.325287676,
 4499286.706097498,
 4501099.158370171]