In [150]:
import pandas as pd
import numpy as np

# Set a seed for reproducibility
np.random.seed(42)

# Set row display for large datasets
pd.options.display.max_rows = 20

data = {
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=100),
    'Product_Category': np.random.choice(['Electronics', 'Clothing', 'Home Goods', 'Food'], size=100),
    'Sales_USD': np.random.randint(50, 500, size=100),
    'Units_Sold': np.random.randint(1, 10, size=100),
    'Customer_Rating': np.random.randint(1, 6, size=100), # 1 to 5 stars
    'Employee_Performance_Score': np.random.uniform(60, 100, size=100),
    'Has_Warranty': np.random.choice([True, False], size=100, p=[0.7, 0.3]),
    'Customer_Segment': np.random.choice(['New', 'Returning', 'Loyal'], size=100, p=[0.2, 0.5, 0.3]),
    'Delivery_Time_Days': np.random.randint(1, 15, size=100)
}
df = pd.DataFrame(data)

# Introduce some NaN values for robustness
for col in ['Sales_USD', 'Customer_Rating', 'Employee_Performance_Score']:
    df.loc[np.random.choice(df.index, 5, replace=False), col] = np.nan
df.head()

Unnamed: 0,Region,Product_Category,Sales_USD,Units_Sold,Customer_Rating,Employee_Performance_Score,Has_Warranty,Customer_Segment,Delivery_Time_Days
0,East,Home Goods,112.0,7,1.0,86.407895,True,Loyal,8
1,West,Clothing,401.0,8,1.0,71.197356,True,Loyal,6
2,North,Clothing,280.0,1,,98.194611,True,Loyal,14
3,East,Food,290.0,6,4.0,89.515877,True,Returning,10
4,East,Clothing,101.0,8,5.0,82.174162,True,Returning,12


## Exercise 1: Basic Descriptive Statistics

**Task:** Calculate the basic descriptive statistics (mean, median, min, max, std, count, quartiles) for all numerical columns in the DataFrame.

In [16]:
df.describe()

Unnamed: 0,Sales_USD,Units_Sold,Customer_Rating,Employee_Performance_Score,Delivery_Time_Days
count,95.0,100.0,95.0,95.0,100.0
mean,270.284211,4.78,2.894737,80.222087,7.63
std,120.987397,2.623052,1.440107,11.996773,3.818231
min,62.0,1.0,1.0,60.433506,1.0
25%,172.5,3.0,2.0,71.235773,4.75
50%,269.0,4.0,3.0,80.210095,7.5
75%,370.0,7.0,4.0,90.24778,10.25
max,495.0,9.0,5.0,99.620206,14.0


## Exercise 2: Correlation Analysis

**Scenario:** You want to understand the linear relationships between different numerical variables in your sales data.

**Task:**

1. Calculate the pairwise correlation matrix for all numerical columns.

2. Find the correlation between 'Sales_USD' and 'Units_Sold'.

3. Find the correlation between 'Sales_USD' and 'Employee_Performance_Score'.

In [23]:
# Correlation between all numeric (boolean) columns
df.corr(numeric_only= True)

# Correlation between 'Employee_Performance_Score, numeric_only= True' and all numerical columns
df.corrwith(df.Employee_Performance_Score, numeric_only= True)

Sales_USD                     0.079757
Units_Sold                   -0.002039
Customer_Rating              -0.182624
Employee_Performance_Score    1.000000
Has_Warranty                 -0.077833
Delivery_Time_Days           -0.028443
dtype: float64

In [20]:
#Correlation between all numeric (boolean) columns
df["Sales_USD"].corr(df["Units_Sold"])

np.float64(0.023621026296408467)

In [21]:
#Correlation between all numeric (boolean) columns
df["Sales_USD"].corr(df.Employee_Performance_Score)

np.float64(0.07975660495107388)

## Exercise 3: Covariance Analysis

**Scenario:** You want to understand how two numerical variables in your sales data change together.

**Task:**

1. Calculate the pairwise covariance matrix for all numerical columns.

2. Specifically, find the covariance between 'Sales_USD' and 'Units_Sold'.

In [2]:
# Covariance on all numerical columns
df.cov(numeric_only= True)

Unnamed: 0,Sales_USD,Units_Sold,Customer_Rating,Employee_Performance_Score,Has_Warranty,Delivery_Time_Days
Sales_USD,14637.95028,7.508959,-30.478755,117.583732,10.301904,-29.740985
Units_Sold,7.508959,6.880404,-0.397536,-0.06383,-0.055354,-1.253939
Customer_Rating,-30.478755,-0.397536,2.073908,-3.090019,0.080067,0.316349
Employee_Performance_Score,117.583732,-0.06383,-3.090019,143.922569,-0.449995,-1.322364
Has_Warranty,10.301904,-0.055354,0.080067,-0.449995,0.226667,0.125455
Delivery_Time_Days,-29.740985,-1.253939,0.316349,-1.322364,0.125455,14.578889


In [3]:
# Covariance in specific pair-wise columns
df["Sales_USD"].cov(df["Units_Sold"])

np.float64(7.5089585666293415)

## Exercise 4: Unique Values and Counts - Categorical Columns

**Scenario:** You need to explore the distinct categories and their frequencies within your categorical data.

**Task:**

1. Find all unique values in the 'Region' column.

2. Count the occurrences of each unique value in the 'Product_Category' column.

3. Count the number of unique customer segments.

In [None]:
# Extract unique values
unique_region= df["Region"].unique()
unique_region

array(['East', 'West', 'North', 'South'], dtype=object)

In [None]:
# Count unique values
ndarray= df.Customer_Segment.unique() # Returns a numpy array
pd.Series(ndarray).count()

np.int64(3)

## Exercise 5: Unique Values and Counts - Numerical/Boolean Columns

**Scenario:** You want to examine the distinct entries and their frequencies in columns that might appear numerical or boolean, including handling potential missing values.

**Task:**

1. Find all unique values in the 'Customer_Rating' column.

2. Count the occurrences of each unique value in the 'Has_Warranty' column.

3. Count the number of unique 'Delivery_Time_Days' values.

In [None]:
# Unique values
df["Customer_Rating"].unique()

array([ 1., nan,  4.,  5.,  3.,  2.])

In [21]:
# Count values
df["Has_Warranty"].value_counts()

Has_Warranty
True     66
False    34
Name: count, dtype: int64

In [22]:
# Count values
pd.Series(df["Delivery_Time_Days"].unique()).count()

np.int64(14)

## Exercise 6: Membership Check - isin() for Filtering

**Scenario:** You need to select rows where specific columns contain values from a predefined set.

**Task:**

1. Filter the DataFrame to show only rows where 'Product_Category' is either 'Electronics' or 'Food'.

2. Filter the DataFrame to show rows where 'Customer_Rating' is among [4, 5] (meaning 4 or 5 stars).

In [32]:
# Filtering by values
cond= df["Product_Category"].isin(("Food", "Electronics"))
df.loc[cond, "Product_Category"].sort_values()

41    Electronics
36    Electronics
37    Electronics
39    Electronics
48    Electronics
         ...     
28           Food
29           Food
30           Food
62           Food
3            Food
Name: Product_Category, Length: 50, dtype: object

In [34]:
# Filtering by values
cond= df["Customer_Rating"].isin((4, 5))
df.loc[cond, "Customer_Rating"].sort_values()

3     4.0
71    4.0
65    4.0
62    4.0
92    4.0
     ... 
9     5.0
5     5.0
4     5.0
66    5.0
93    5.0
Name: Customer_Rating, Length: 37, dtype: float64

## Exercise 7: Membership Check - isin() for Creating a New Column

**Scenario:** You want to summarize numerical data for specific subgroups within your dataset.

**Task:**

1. Create a new boolean column named 'Is_High_Value_Product' which is True if 'Product_Category' is 'Electronics' or 'Home Goods', and False otherwise.

2. Create a new boolean column named 'Is_Top_Rated_Customer' which is True if 'Customer_Rating' is 5, and False otherwise (handle NaN appropriately, perhaps by making them False or keeping them NaN).

In [None]:
# Create new boolean column
cond= df.Product_Category.isin(("Electronics", "Home Goods"))
df["Is_High_Value_Product"]= cond
df.head()

Unnamed: 0,Region,Product_Category,Sales_USD,Units_Sold,Customer_Rating,Employee_Performance_Score,Has_Warranty,Customer_Segment,Delivery_Time_Days,Is_High_Value_Product
0,East,Home Goods,112.0,7,1.0,86.407895,True,Loyal,8,True
1,West,Clothing,401.0,8,1.0,71.197356,True,Loyal,6,False
2,North,Clothing,280.0,1,,98.194611,True,Loyal,14,False
3,East,Food,290.0,6,4.0,89.515877,True,Returning,10,False
4,East,Clothing,101.0,8,5.0,82.174162,True,Returning,12,False
...,...,...,...,...,...,...,...,...,...,...
95,South,Clothing,209.0,1,2.0,73.146582,False,Returning,8,False
96,South,Clothing,247.0,5,2.0,86.900738,False,Loyal,6,False
97,West,Food,465.0,3,1.0,90.094981,True,Loyal,2,False
98,South,Electronics,296.0,4,2.0,91.663162,True,New,4,True


In [39]:
# Create new boolean column
cond= df.Customer_Rating == 5
df["Is_Top_Rated_Customer"]= cond
df.head()

Unnamed: 0,Region,Product_Category,Sales_USD,Units_Sold,Customer_Rating,Employee_Performance_Score,Has_Warranty,Customer_Segment,Delivery_Time_Days,Is_High_Value_Product,Is_Top_Rated_Customer
0,East,Home Goods,112.0,7,1.0,86.407895,True,Loyal,8,True,False
1,West,Clothing,401.0,8,1.0,71.197356,True,Loyal,6,False,False
2,North,Clothing,280.0,1,,98.194611,True,Loyal,14,False,False
3,East,Food,290.0,6,4.0,89.515877,True,Returning,10,False,False
4,East,Clothing,101.0,8,5.0,82.174162,True,Returning,12,False,True


## Exercise 8: Grouped Descriptive Statistics

**Scenario:** You want to summarize numerical data for specific subgroups within your dataset.

**Tasks:**

1. Calculate the average 'Sales_USD' and average 'Units_Sold' for each 'Region' in df.

2. Find the maximum 'Employee_Performance_Score' for each 'Product_Category' in df.

3. Count the number of customers (i.e., rows) in each 'Customer_Segment' in df.

In [151]:
# Calculate mean of specifc columns in unique 'Region's
region_groups= df.groupby("Region")
region_groups[["Sales_USD", "Units_Sold"]].mean()

Unnamed: 0_level_0,Sales_USD,Units_Sold
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
East,249.166667,5.041667
North,264.166667,3.8
South,290.833333,5.0
West,274.551724,5.033333


In [153]:
# Calculate max score for each group 'Product_Category'
prod_categ_group= df.groupby("Product_Category")
prod_categ_group["Employee_Performance_Score"].max()

Product_Category
Clothing       98.194611
Electronics    95.098881
Food           98.447623
Home Goods     99.620206
Name: Employee_Performance_Score, dtype: float64

In [152]:
# Count values of 'Customer Segment'
prod_categ_group["Customer_Segment"].value_counts()

Product_Category  Customer_Segment
Books             Loyal                1
Clothing          Returning            8
                  Loyal                6
                  New                  5
Electronics       Returning           13
                  New                  8
                  Loyal                5
Food              Returning           13
                  Loyal                8
                  New                  3
Home Goods        Returning           13
                  New                  9
                  Loyal                8
Name: count, dtype: int64

**Note:** *The operation `g= df.gruopby("col1"); g["col1"].count()` is equal to `g= df["col1"].value_counts()`. So when counting values in a column is better option to use `value_counts()` method instead of gruoping and counting*

## Exercise 9: Grouped Correlation

**Scenario:** You need to analyze the relationship between two variables, but this relationship might vary across different segments of your data.

**Task:**

1. Calculate the correlation between 'Sales_USD' and 'Units_Sold' separately for each 'Region' in df.

In [54]:
# Correlation
region_groups["Sales_USD"].cov(df["Units_Sold"])

Region
East    -31.268116
North   -42.862745
South   -13.405797
West     80.615764
Name: Sales_USD, dtype: float64

## Exercise 10: Combined Unique Counts and Aggregation

**Scenario:** You want to understand the diversity of values within certain categories and identify the most common characteristics of groups.

**Tasks:**

1. For each 'Product_Category' in df, find the number of unique 'Customer_Segment' values associated with it.

2. For each 'Customer_Segment' in df, find the most frequent 'Product_Category' purchased by that segment.

In [None]:
# Unique values
prod_categ_group["Customer_Segment"].unique().apply(lambda s: len(s))

Product_Category
Clothing       3
Electronics    3
Food           3
Home Goods     3
Name: Customer_Segment, dtype: int64

In [None]:
# Most purchased category
cust_seg_group["Product_Category"].max()

Customer_Segment
Loyal        Home Goods
New          Home Goods
Returning    Home Goods
Name: Product_Category, dtype: object