In [2]:
import numpy as np
import matplotlib as plt
import pandas as pd

# Small demo frame with deliberate gaps: row 'a' has no inazuma value
# and row 'c' is missing in both columns.
df = pd.DataFrame(
    {
        "mondstadt": [1.4, 7.1, np.nan, 0.75],
        "inazuma": [np.nan, -4.5, np.nan, -1.3],
    },
    index=list("abcd"),
)

df


Unnamed: 0,mondstadt,inazuma
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# Column totals (reduce down the index); missing values are skipped by default
df.sum(axis="index")


mondstadt    9.25
inazuma     -5.80
dtype: float64

In [4]:
# Row totals (reduce across the columns); a row with no values at all sums to 0
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [5]:
# skipna=False stops pandas from ignoring missing values: a column's
# total becomes NaN as soon as the column contains a single NaN.

df.sum(axis=0, skipna=False)

mondstadt   NaN
inazuma     NaN
dtype: float64

In [6]:
# Row-wise totals with skipna=False: any row that contains a NaN
# sums to NaN instead of having the gap ignored.

df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [7]:
# Average across the columns of each row; NaN entries are skipped,
# so only a row with no values at all comes out as NaN.
df.mean(axis=1)


a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [8]:
# axis - Axis to reduce over; "index" for the DataFrame's rows
# and "columns" for columns

# skipna - Exclude missing values; True by default

# level - Reduce grouped by level if the axis is
# hierarchically indexed (MultiIndex)

In [9]:
# Index label at which each column reaches its maximum
df.idxmax(axis=0)


mondstadt    b
inazuma      d
dtype: object

In [10]:
# Running total down each column. Missing entries stay NaN in the
# result but do not reset the total, e.g. for mondstadt:
# 1.4 -> 1.4 + 7.1 = 8.5 -> NaN -> 8.5 + 0.75 = 9.25
df.cumsum(axis=0)

Unnamed: 0,mondstadt,inazuma
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [11]:
# describe() produces several summary statistics in one call
# (the quartiles listed here are the defaults, spelled out).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,mondstadt,inazuma
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [12]:
# Cross-check the "count" row of describe(): non-NA values per column
df.count(axis=0)

mondstadt    3
inazuma      2
dtype: int64

In [13]:
# Cross-check the "min" row of describe(): smallest value per column
df.min(axis=0)

mondstadt    0.75
inazuma     -4.50
dtype: float64

In [14]:
# 25th percentile (first quartile) of each column
df.quantile(q=0.25)

mondstadt    1.075
inazuma     -3.700
Name: 0.25, dtype: float64

In [15]:
# Non-numeric Series: the pattern a, a, b, c repeated four times
obj = pd.Series(list("aabc") * 4)

obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [16]:
# On non-numeric data describe() reports count, unique, top and freq
# instead of the numeric summary statistics.
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [17]:
# count    - Number of non-NA values
# describe - Compute set of summary statistics
# min, max - Compute minimum and maximum values

# argmin, argmax - Compute index locations (integers) at which
# the minimum or maximum value is obtained, respectively;
# not available on DataFrame objects

# idxmin, idxmax - Compute index labels at which minimum or
# maximum value is obtained, respectively

# quantile - Compute sample quantile ranging from 0 to 1
# (default: 0.5), e.g. 0.25, 0.5, 1.0 for the 25%, 50%, 100% points

# sum  - Sum of values
# mean - Mean of values

# median - Arithmetic median (50% quantile) of values
# mad    - Mean absolute deviation from mean value (removed in pandas 2.0)
# prod   - Product of values
# var    - Sample variance of values
# std    - Sample standard deviations of values
# skew   - Sample skewness (third moment) of values
# kurt   - Sample kurtosis (fourth  moment) of values
# cumsum - Cumulative sum of values

# cummin, cummax - Cumulative minimum or maximum of values,
# respectively

# cumprod - Cumulative product of values

# diff    - Compute first arithmetic difference 
# (useful for time series)

# pct_change  - Compute percent changes


In [18]:
# Correlation and Covariance
import pandas as pd

# Raw string: the Windows path contains backslash sequences ("\e", "\C",
# "\R", ...) that a normal string literal treats as invalid escapes, which
# makes Python emit an invalid-escape-sequence warning. The resulting path
# text is unchanged.
# NOTE(review): this is an absolute, machine-specific path, and read_pickle
# can execute arbitrary code while unpickling -- only load trusted files.
price = pd.read_pickle(r"G:\ebook\Certificate & Skill\Revo U - DAMC\Study Case 7-18 april 2025\Tableau Data Source Samples\DAMC_google_data.pkl")

# Double brackets keep each selection as a one-column DataFrame.
price_each_df = price[["PRICE EACH"]]
qorder_df = price[["QUANTITY ORDER"]]

price_each_df.head()

  price = pd.read_pickle("G:\ebook\Certificate & Skill\Revo U - DAMC\Study Case 7-18 april 2025\Tableau Data Source Samples\DAMC_google_data.pkl")


Unnamed: 0,PRICE EACH
0,1701.95
1,2218.41
2,3843.67
3,558.43
4,6524.19


In [19]:
# Raw string: the backslashes in this Windows path are otherwise parsed as
# (invalid) escape sequences and trigger a warning; the path text is unchanged.
# NOTE(review): this rebinds df, replacing the small demo frame built at the
# top of the notebook. read_pickle can also execute arbitrary code while
# unpickling -- only load trusted files.
df = pd.read_pickle(r"G:\ebook\Certificate & Skill\Revo U - DAMC\Study Case 7-18 april 2025\Tableau Data Source Samples\DAMC_google_data.pkl")

# Keep the customer label plus the three numeric measures.
filtered_df = df[["CUSTOMER NAME", "SALES", "PRICE EACH", "QUANTITY ORDER"]]

filtered_df.head()

  df = pd.read_pickle("G:\ebook\Certificate & Skill\Revo U - DAMC\Study Case 7-18 april 2025\Tableau Data Source Samples\DAMC_google_data.pkl")


Unnamed: 0,CUSTOMER NAME,SALES,PRICE EACH,QUANTITY ORDER
0,Alpha Cognac,70488.44,1701.95,687
1,Amica Models &,94117.26,2218.41,843
2,Anna's Decorati,153996.13,3843.67,1469
3,Atelier graphiqu,24179.96,558.43,270
4,Australian Colle,265586.87,6524.19,2631


In [20]:
# Relative change from one row to the next, limited to the final five rows
price_return = price_each_df.pct_change().iloc[-5:]

price_return

Unnamed: 0,PRICE EACH
87,0.065495
88,-0.222986
89,-0.184037
90,-0.400636
91,228.069215


In [21]:
# Same row-to-row change for the quantity column, final five rows only.
# NOTE(review): despite its name, price_return2 holds QUANTITY ORDER changes.
price_return2 = qorder_df.pct_change().iloc[-5:]

price_return2

Unnamed: 0,QUANTITY ORDER
87,0.030593
88,-0.269944
89,-0.177891
90,-0.210201
91,192.868885


In [22]:
# Full row-to-row percent change of PRICE EACH; the first row has no
# predecessor and is therefore NaN.
price_return3 = price_each_df.pct_change(periods=1)

price_return3

Unnamed: 0,PRICE EACH
0,
1,0.303452
2,0.732624
3,-0.854714
4,10.683094
...,...
87,0.065495
88,-0.222986
89,-0.184037
90,-0.400636


In [23]:
# Full row-to-row percent change of QUANTITY ORDER; the first row has no
# predecessor and is therefore NaN.
Quantity_return = qorder_df.pct_change(periods=1)

Quantity_return

Unnamed: 0,QUANTITY ORDER
0,
1,0.227074
2,0.742586
3,-0.816201
4,8.744444
...,...
87,0.030593
88,-0.269944
89,-0.177891
90,-0.210201


In [24]:
# Leave out the CUSTOMER NAME text column and keep the numeric measures.
filtered_df2 = df.loc[:, ["SALES", "PRICE EACH", "QUANTITY ORDER"]]

# NOTE(review): the rows look like one record per customer rather than a
# time series (see filtered_df.head()), so these row-to-row "returns" may
# mostly reflect differences in customer size -- confirm this is intended
# before interpreting the correlations computed from them.
returns = filtered_df2.pct_change(periods=1)
returns


Unnamed: 0,SALES,PRICE EACH,QUANTITY ORDER
0,,,
1,0.335216,0.303452,0.227074
2,0.636216,0.732624,0.742586
3,-0.842983,-0.854714,-0.816201
4,9.983760,10.683094,8.744444
...,...,...,...
87,-0.002497,0.065495,0.030593
88,-0.252072,-0.222986,-0.269944
89,-0.139553,-0.184037,-0.177891
90,-0.391661,-0.400636,-0.210201


In [25]:
# Correlation between the QUANTITY ORDER and PRICE EACH changes
returns.loc[:, "QUANTITY ORDER"].corr(returns.loc[:, "PRICE EACH"])


np.float64(0.9998858880180003)

In [26]:
# NOTE(review): both operands are the QUANTITY ORDER column, so this is that
# column's variance (the same figure as the QUANTITY ORDER diagonal entry of
# returns.cov()). If the covariance with PRICE EACH was intended, to mirror
# the corr() call in the previous cell, pass returns["PRICE EACH"] instead.
returns["QUANTITY ORDER"].cov(other=returns["QUANTITY ORDER"])

np.float64(409.7880400249636)

In [27]:
# Full pairwise correlation matrix (Pearson is the default method)
returns.corr(method="pearson")

Unnamed: 0,SALES,PRICE EACH,QUANTITY ORDER
SALES,1.0,0.999917,0.999954
PRICE EACH,0.999917,1.0,0.999886
QUANTITY ORDER,0.999954,0.999886,1.0


In [28]:
# Full pairwise covariance matrix; the diagonal holds each column's variance
returns.cov(ddof=1)

Unnamed: 0,SALES,PRICE EACH,QUANTITY ORDER
SALES,517.194724,544.139011,460.348423
PRICE EACH,544.139011,572.581545,484.338231
QUANTITY ORDER,460.348423,484.338231,409.78804


In [29]:
# Correlation of every column with the PRICE EACH column
returns.corrwith(other=returns["PRICE EACH"], axis=0)

SALES             0.999917
PRICE EACH        1.000000
QUANTITY ORDER    0.999886
dtype: float64

In [30]:
# Correlation of every column with the SALES column
returns.corrwith(other=returns["SALES"], axis=0)

SALES             1.000000
PRICE EACH        0.999917
QUANTITY ORDER    0.999954
dtype: float64

In [31]:
# Correlation of every column with the QUANTITY ORDER column
returns.corrwith(other=returns["QUANTITY ORDER"], axis=0)

SALES             0.999954
PRICE EACH        0.999886
QUANTITY ORDER    1.000000
dtype: float64

In [32]:
# Unique Values, Value Counts, and Membership

obj = pd.Series(list("cadaabbcc"))

# Distinct values, in order of first appearance (not sorted)
uniques = obj.unique()

uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [33]:
# Frequency of each distinct value, most frequent first
obj.value_counts(sort=True)

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [34]:
# Count values held in a plain NumPy array, keeping first-seen order
# (sort=False). The top-level pd.value_counts is deprecated and emits a
# FutureWarning, so wrap the array in a Series and use the method instead.
pd.Series(obj.to_numpy()).value_counts(sort=False)

  pd.value_counts(obj.to_numpy(), sort=False)


c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [35]:
# Show the Series again before building membership masks from it
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [36]:
# isin performs a vectorized set-membership check; the resulting Boolean
# Series is useful for filtering a Series or a DataFrame column down to a
# subset of values.
mask = obj.isin(values=["b", "c"])

mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [37]:
# Boolean Series marking the entries equal to "a" or "d"
mask2 = obj.isin(values=["a", "d"])

mask2

0    False
1     True
2     True
3     True
4     True
5    False
6    False
7    False
8    False
dtype: bool

In [38]:
# Keep only the entries flagged True in mask (the "b"/"c" values)
obj.loc[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [39]:
# Keep only the entries flagged True in mask2 (the "a"/"d" values)
obj.loc[mask2]

1    a
2    d
3    a
4    a
dtype: object

In [40]:
to_match = pd.Series(list("cabbca"))
unique_vals = pd.Series(list("cba"))

# For every entry of to_match, look up its integer position in unique_vals
indices = pd.Index(unique_vals).get_indexer(target=to_match)

indices

array([0, 2, 1, 1, 0, 2])

In [41]:
# isin - Compute Boolean array indicating whether each Series
# or DataFrame value is contained in the passed
# sequence of values

# get_indexer - Compute integer indices for each value in 
# an array into another array of distinct values; helpful 
# for data alignment and join-type operations.

# value_counts - Return a Series containing unique values as
# its index and frequencies as its values, ordered by count in
# descending order

In [42]:
# Three columns of answers, written out one row per line
data = pd.DataFrame(
    [
        [1, 2, 1],
        [3, 3, 5],
        [4, 1, 2],
        [3, 2, 4],
        [4, 3, 4],
    ],
    columns=["Qu1", "Qu2", "Qu3"],
)

data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [43]:
# Same answers as `data`, but with a named index instead of 0..4
data_mod = pd.DataFrame(
    [
        [1, 2, 1],
        [3, 3, 5],
        [4, 1, 2],
        [3, 2, 4],
        [4, 3, 4],
    ],
    columns=["Qu1", "Qu2", "Qu3"],
    index=["EI", "Yae", "Chiori", "Ayaya", "Yoimiya"],
)

data_mod

Unnamed: 0,Qu1,Qu2,Qu3
EI,1,2,1
Yae,3,3,5
Chiori,4,1,2
Ayaya,3,2,4
Yoimiya,4,3,4


In [44]:
# Tally each answer in Qu1, then order the tally by answer value
(
    data.loc[:, "Qu1"]
    .value_counts()
    .sort_index()
)

Qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [45]:
# Same tally for the labelled frame: counts depend on values, not on the index
(
    data_mod.loc[:, "Qu1"]
    .value_counts()
    .sort_index()
)

Qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [46]:
# Count how often each value occurs in every column. pd.Series.value_counts
# replaces the deprecated top-level pd.value_counts, which emits a
# FutureWarning. A value that never occurs in a column comes back as NaN once
# the per-column counts are aligned, so those gaps are filled with 0.
result = data.apply(pd.Series.value_counts).fillna(0)

result

  result = data.apply(pd.value_counts).fillna(0)


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [47]:
# Per-column value counts for the labelled frame. pd.Series.value_counts
# replaces the deprecated top-level pd.value_counts, which emits a
# FutureWarning; values missing from a column are filled with 0.
# NOTE: this rebinds `result`, replacing the table computed from `data`.
result = data_mod.apply(pd.Series.value_counts).fillna(0)

result

  result = data_mod.apply(pd.value_counts).fillna(0)


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [48]:
# Two-column frame with repeated rows.
# NOTE: this rebinds `data`, replacing the Qu1/Qu2/Qu3 frame defined earlier.
data = pd.DataFrame(
    [
        [1, 0],
        [1, 0],
        [1, 1],
        [2, 0],
        [2, 0],
    ],
    columns=["a", "b"],
)

data

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [49]:
# Count each distinct (a, b) row combination, most frequent first
data.value_counts(sort=True)

a  b
1  0    2
2  0    2
1  1    1
Name: count, dtype: int64