In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv('customer_transactions_large.csv')
df.head(3)

Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,Category,TransactionDate,Location
0,1001,23,Male,961,Electronics,2024-01-28,Houston
1,1002,51,Female,1511,Furniture,2024-05-27,Chicago
2,1003,18,Male,1927,Clothing,2024-04-25,San Francisco


In [3]:
df['Age'].mean()

41.71

In [4]:
df['Age'].mode()

0    32
Name: Age, dtype: int64

In [5]:
df['Age'].median()

40.0

In [6]:
df.columns

Index(['CustomerID', 'Age', 'Gender', 'PurchaseAmount', 'Category',
       'TransactionDate', 'Location'],
      dtype='object')

In [7]:
#What is the average and range (min-max) of the purchase amount?
df['PurchaseAmount'].mean()

1129.27

In [8]:
df['PurchaseAmount'].max()

1967

In [9]:
df['PurchaseAmount'].min()

139

In [11]:
#How spread out are the purchase amounts (standard deviation or IQR)?
std_p = df['PurchaseAmount'].std()
std_p

579.7069593224396

In [17]:
q1 = df['PurchaseAmount'].quantile(0.25)
q3 = df['PurchaseAmount'].quantile(0.75)

In [18]:
print(q1)
print(q2)

588.5
1712.25


In [19]:
IQR = q3-q1
IQR

1123.75

In [20]:
lower = q1 - 1.5*IQR
upper = q1 + 1.5 * IQR

In [22]:
#Find Outliar 
out = df[(df['PurchaseAmount']< lower) | (df['PurchaseAmount'] > upper)]
out

Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,Category,TransactionDate,Location


In [23]:
#How many transactions occurred in each product category?
df['Category'].unique()

array(['Electronics', 'Furniture', 'Clothing', 'Groceries'], dtype=object)

In [24]:
#What is the distribution of genders in the dataset?
df['Gender'].nunique()

2

In [25]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [26]:
#Which location has the most transactions?
df['Location'].max()

'San Francisco'

In [27]:
#Are there any unusual or inconsistent values in Category or Gender?
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [28]:
df.head(3)

Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,Category,TransactionDate,Location
0,1001,23,Male,961,Electronics,2024-01-28,Houston
1,1002,51,Female,1511,Furniture,2024-05-27,Chicago
2,1003,18,Male,1927,Clothing,2024-04-25,San Francisco


In [31]:
#How many transactions occurred over time (by month or day)?
df.groupby('TransactionDate')['Category']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002392AEB03D0>

In [32]:
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])

In [37]:
b = df.groupby(['TransactionDate','Category']).size()
b

TransactionDate  Category   
2024-01-01       Electronics    1
2024-01-02       Electronics    1
                 Furniture      2
2024-01-03       Groceries      1
2024-01-04       Furniture      1
                               ..
2024-05-23       Electronics    1
2024-05-26       Furniture      1
2024-05-27       Furniture      2
2024-05-28       Clothing       1
2024-05-31       Groceries      2
Length: 92, dtype: int64

In [39]:
#What is the busiest day of the week for transactions?
c = df.groupby(['TransactionDate','Category']).max()
c

Unnamed: 0_level_0,Unnamed: 1_level_0,CustomerID,Age,Gender,PurchaseAmount,Location
TransactionDate,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01,Electronics,1038,52,Female,1840,Chicago
2024-01-02,Electronics,1088,64,Female,1110,Houston
2024-01-02,Furniture,1056,63,Male,1104,New York
2024-01-03,Groceries,1012,30,Female,1017,Houston
2024-01-04,Furniture,1074,32,Male,633,Chicago
...,...,...,...,...,...,...
2024-05-23,Electronics,1054,24,Male,1803,San Francisco
2024-05-26,Furniture,1033,35,Male,1803,San Francisco
2024-05-27,Furniture,1043,51,Female,1511,Chicago
2024-05-28,Clothing,1080,27,Female,1770,Chicago


In [40]:
#Are there any missing values in the dataset?
df.isnull().sum()

CustomerID         0
Age                0
Gender             0
PurchaseAmount     0
Category           0
TransactionDate    0
Location           0
dtype: int64

In [43]:
#Are there any duplicate rows?
df.duplicated().sum()

0

In [46]:
#Are all TransactionDate values in a valid date format?
df['TransactionDate'].nunique()

75

In [48]:
df['TransactionDate'].isnull().sum()

0

In [None]:
#Are there any outliers in Age or PurchaseAmount?
q1 = df['PurchaseAmount'].quantile(0.25)
q3 = df['PurchaseAmount'].quantile(0.75)

In [49]:
#Are the categories in Gender, Location, and Category consistent (e.g., typos or extra spaces)?
df['Category'].unique()

array(['Electronics', 'Furniture', 'Clothing', 'Groceries'], dtype=object)

In [51]:
#Can we extract day, month, or weekday from TransactionDate?
df['D'] = df['TransactionDate'].dt.day
df['M'] = df['TransactionDate'].dt.month
df['W'] = df['TransactionDate'].dt.weekday

In [52]:
df.head(3)

Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,Category,TransactionDate,Location,M,D,W
0,1001,23,Male,961,Electronics,2024-01-28,Houston,1,28,6
1,1002,51,Female,1511,Furniture,2024-05-27,Chicago,5,27,0
2,1003,18,Male,1927,Clothing,2024-04-25,San Francisco,4,25,3


In [56]:
df['PurchaseAmount'].unique()

array([ 961, 1511, 1927, 1263, 1722,  350, 1842, 1693, 1945, 1102, 1020,
       1017, 1289, 1521, 1674,  625, 1151, 1967, 1761, 1493,  366,  483,
       1074,  769,  577, 1381,  581, 1376, 1345, 1053, 1793,  860, 1803,
        716,  591,  880,  234, 1840, 1764,  478, 1792, 1818,  905, 1732,
       1119,  142,  194, 1423, 1024,  233,  204, 1208,  372,  397, 1104,
       1260, 1769, 1908, 1930, 1713, 1911, 1589, 1926,  370, 1229, 1295,
        967,  180,  354, 1763, 1343,  964,  633, 1173,  422,  778, 1573,
       1770, 1203, 1836,  150,  417,  336,  600, 1110, 1953,  466, 1217,
       1907,  238, 1237, 1407,  303, 1541, 1712,  139,  533], dtype=int64)

In [55]:
d = df.groupby('CustomerID')['PurchaseAmount'].sum()
d

CustomerID
1001     961
1002    1511
1003    1927
1004    1263
1005    1722
        ... 
1096     303
1097    1541
1098    1712
1099     139
1100     533
Name: PurchaseAmount, Length: 100, dtype: int64

In [57]:
#Can we compute the total spend per customer?
d = df.groupby('CustomerID')['PurchaseAmount'].sum()
d

CustomerID
1001     961
1002    1511
1003    1927
1004    1263
1005    1722
        ... 
1096     303
1097    1541
1098    1712
1099     139
1100     533
Name: PurchaseAmount, Length: 100, dtype: int64

In [58]:
#Which categories or locations contribute most to high-value transactions?
f = df.groupby('PurchaseAmount')['Category'].sum()
f

PurchaseAmount
139     Electronics
142     Electronics
150        Clothing
180       Furniture
194       Furniture
           ...     
1927       Clothing
1930       Clothing
1945      Furniture
1953       Clothing
1967      Furniture
Name: Category, Length: 97, dtype: object

In [59]:
df.columns

Index(['CustomerID', 'Age', 'Gender', 'PurchaseAmount', 'Category',
       'TransactionDate', 'Location', 'M', 'D', 'W'],
      dtype='object')

In [60]:
#Can we group customers into age groups (e.g., Young, Middle-aged, Senior)?
def ageg(Age):
    if Age < 30:
        return "Young"
    if Age >= 50:
        return "Middle"
    if Age > 50:
        return "Senior"

In [61]:
df['Group']= df['Age'].apply(ageg)

In [62]:
df.head(3)

Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,Category,TransactionDate,Location,M,D,W,Group
0,1001,23,Male,961,Electronics,2024-01-28,Houston,1,28,6,Young
1,1002,51,Female,1511,Furniture,2024-05-27,Chicago,5,27,0,Middle
2,1003,18,Male,1927,Clothing,2024-04-25,San Francisco,4,25,3,Young


In [63]:
#Can we flag transactions as high-value or low-value?
thres = df['PurchaseAmount'].mean()

In [66]:
df['flag'] = df['PurchaseAmount'].apply(lambda x:'High' if x > thres else 'Low')

In [67]:
df.head(3)

Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,Category,TransactionDate,Location,M,D,W,Group,flag
0,1001,23,Male,961,Electronics,2024-01-28,Houston,1,28,6,Young,Low
1,1002,51,Female,1511,Furniture,2024-05-27,Chicago,5,27,0,Middle,High
2,1003,18,Male,1927,Clothing,2024-04-25,San Francisco,4,25,3,Young,High


In [70]:
#How many transactions per customer (CustomerID) occurred?
g = df.groupby('CustomerID')['PurchaseAmount'].sum()
g

CustomerID
1001     961
1002    1511
1003    1927
1004    1263
1005    1722
        ... 
1096     303
1097    1541
1098    1712
1099     139
1100     533
Name: PurchaseAmount, Length: 100, dtype: int64

In [71]:
#Who are the high-value customers (e.g., PurchaseAmount > 1000)?
df[df['PurchaseAmount'] >1000]

Unnamed: 0,CustomerID,Age,Gender,PurchaseAmount,Category,TransactionDate,Location,M,D,W,Group,flag
1,1002,51,Female,1511,Furniture,2024-05-27,Chicago,5,27,0,Middle,High
2,1003,18,Male,1927,Clothing,2024-04-25,San Francisco,4,25,3,Young,High
3,1004,37,Male,1263,Clothing,2024-05-10,Houston,5,10,4,,High
4,1005,19,Male,1722,Furniture,2024-02-15,Houston,2,15,3,Young,High
6,1007,38,Male,1842,Furniture,2024-01-09,Los Angeles,1,9,1,,High
...,...,...,...,...,...,...,...,...,...,...,...,...
91,1092,52,Female,1907,Electronics,2024-03-11,San Francisco,3,11,0,Middle,High
93,1094,64,Male,1237,Groceries,2024-01-30,San Francisco,1,30,1,Middle,High
94,1095,46,Female,1407,Clothing,2024-02-18,New York,2,18,6,,High
96,1097,42,Female,1541,Furniture,2024-01-27,Los Angeles,1,27,5,,High


In [72]:
import pandas as pd 
import numpy as np

In [73]:
#How can you perform element-wise arithmetic operations on two NumPy arrays?
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])

In [74]:
#What does np.concatenate() do, and how is it used?
c = a + b
c

array([ 6,  8, 10, 12])

In [76]:
d = np.concatenate([a,b])
d

array([1, 2, 3, 4, 5, 6, 7, 8])

In [77]:
#What is broadcasting in NumPy, and how does it work?
a = np.array([1,2,3,4,5])
c = a + 10
c

array([11, 12, 13, 14, 15])

In [78]:
#How do you find the maximum and minimum values in a NumPy array?
f = np.array([11, 12, 13, 14, 15])
print(min(a))
print(max(a))

1
5


In [79]:
#What does np.sum(), np.mean(), and np.std() do?
f = np.array([11, 12, 13, 14, 15])
print(sum(a))
print(np.mean(a))
print(np.std(a))

15
3.0
1.4142135623730951


In [82]:
#How can you find indices of the elements that satisfy a given condition in a NumPy array?
a = np.array([11, 12, 13, 14, 15])
g = np.where(a>12)
g

(array([2, 3, 4], dtype=int64),)

In [83]:
#How do you calculate the dot product of two NumPy arrays?
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
c = np.dot(a,b)
c

70

In [84]:
#What is the purpose of np.random and how can it be used to generate random numbers?
a = np.random.rand(10)
a

array([0.77684033, 0.92369166, 0.83897995, 0.20796977, 0.26738609,
       0.83248549, 0.45782283, 0.60168871, 0.32493457, 0.82797317])

In [87]:
#What is the difference between np.flatten() and np.ravel()?
a = np.array([0.77684033, 0.92369166, 0.83897995, 0.20796977, 0.26738609,
       0.83248549, 0.45782283, 0.60168871, 0.32493457, 0.82797317])

In [89]:
b = a.flatten()
b[0]=55

In [90]:
b

array([55.        ,  0.92369166,  0.83897995,  0.20796977,  0.26738609,
        0.83248549,  0.45782283,  0.60168871,  0.32493457,  0.82797317])

In [94]:
#How do you compute the inverse of a matrix using NumPy?
a = np.array([[1, 2], [3, 4]])
c = np.linalg.inv(a)
c

array([[-2. ,  1. ],
       [ 1.5, -0.5]])

In [96]:
#How can you extract a subarray from a NumPy array?
a = np.array([1, 2, 3, 4])
b = a[0:3]
b

array([1, 2, 3])

In [99]:
#What is the difference between np.copy() and assignment in NumPy arrays?
a = np.array([1, 2, 3, 4])
b = a
a[0]=100
print(a)
print(b)

[100   2   3   4]
[100   2   3   4]


In [101]:
a = np.array([1, 2, 3, 4])
b = np.copy(a)
b[0]=100
print(a)
print(b)

[1 2 3 4]
[100   2   3   4]


In [None]:
a = np.array([[1, 2, 3], [4, 5, 6]])
transposed_a = np.transpose(a)

In [108]:
a = np.array([[1, 2, 3], [3, 4, 5]])
c = np.transpose(a)
c

array([[1, 3],
       [2, 4],
       [3, 5]])

In [112]:
# How can you change the shape of a NumPy array without changing its data?
a = np.array([1,2,3,4,5,6,7,8,9,10]).reshape(5,2)

In [113]:
a

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10]])

In [114]:
#What is the use of np.where() and how is it applied in conditional statements?
a = np.array([1,2,3,4,5,6,7,8,9,10])
b = np.where(a>5)
b

(array([5, 6, 7, 8, 9], dtype=int64),)

In [115]:
#How can you check if two NumPy arrays are equal?
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
c = np.array_equal(a,b)
c

False

In [116]:
#How can you stack two NumPy arrays vertically and horizontally?
a = np.array([1, 2, 3, 4])
b = np.vstack(a)
b

array([[1],
       [2],
       [3],
       [4]])

In [117]:
a = np.array([1, 2, 3, 4])
b = np.hstack(a)
b

array([1, 2, 3, 4])

In [119]:
#What is the purpose of np.linspace() and how is it different from np.arange()?
a = np.linspace(5,25, num=10)
a

array([ 5.        ,  7.22222222,  9.44444444, 11.66666667, 13.88888889,
       16.11111111, 18.33333333, 20.55555556, 22.77777778, 25.        ])

In [120]:
#How do you sort the elements of a NumPy array?
a = np.array([7,2,9,6,8,1,2])
b = np.sort(a)
b

array([1, 2, 2, 6, 7, 8, 9])

In [126]:
#How can you calculate the determinant of a matrix using NumPy?
a = np.array([[1, 2, 3], [0,4,5], [1,0,6]])
b = np.linalg.det(a)
b

22.000000000000004