In [53]:
# import the necessary libraries
import numpy as np
import pandas as pd
from scipy.stats import stats

## Question1- Pandas for Data Analysis

**a. Data loading, Datetime conversion and Feature Extraction**

In [2]:
# load the dataset
url = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/eletronic_sales.csv"
sales = pd.read_csv(url)


# display the first few rows of the dataset
sales.head()

Unnamed: 0,Date,Branch,Sales Agent,Products,Units,Price
0,2014-09-01,Woji,Chinedu,Apple,2,125.0
1,2015-06-17,Woji,Emeka,Apple,5,125.0
2,2015-09-10,Woji,Ibrahim,Lenovo,7,1.29
3,2015-11-17,Woji,Tolu,HP,11,4.99
4,2015-10-31,Woji,Tonye,Lenovo,14,1.29


** Datetime conversion**

In [3]:
# Convert the Date  column to datetime
date = pd.to_datetime(sales['Date'])

# Extract the year, month, and day from the Date column
sales['Year'] = date.dt.year
sales['Month'] = date.dt.month
sales['Day'] = date.dt.day_name()


# display the first few rows of the updated dataset
sales.head()

Unnamed: 0,Date,Branch,Sales Agent,Products,Units,Price,Year,Month,Day
0,2014-09-01,Woji,Chinedu,Apple,2,125.0,2014,9,Monday
1,2015-06-17,Woji,Emeka,Apple,5,125.0,2015,6,Wednesday
2,2015-09-10,Woji,Ibrahim,Lenovo,7,1.29,2015,9,Thursday
3,2015-11-17,Woji,Tolu,HP,11,4.99,2015,11,Tuesday
4,2015-10-31,Woji,Tonye,Lenovo,14,1.29,2015,10,Saturday


In [4]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         43 non-null     object 
 1   Branch       43 non-null     object 
 2   Sales Agent  43 non-null     object 
 3   Products     43 non-null     object 
 4   Units        43 non-null     int64  
 5   Price        43 non-null     float64
 6   Year         43 non-null     int32  
 7   Month        43 non-null     int32  
 8   Day          43 non-null     object 
dtypes: float64(1), int32(2), int64(1), object(5)
memory usage: 2.8+ KB


**b. Branch-Level Total Sales**

In [5]:
# Calculate the total sales for each branch, where total sales = units x price. Return a new Dataframe showing Branch and Total_Sales
sales["Total Sales"] = sales["Units"] * sales["Price"]

sales.columns

Index(['Date', 'Branch', 'Sales Agent', 'Products', 'Units', 'Price', 'Year',
       'Month', 'Day', 'Total Sales'],
      dtype='object')

In [6]:
# Renaming the Total Sales column to Total_Sales
sales = sales.rename(columns={'Total Sales': 'Total_Sales'})

sales.columns

Index(['Date', 'Branch', 'Sales Agent', 'Products', 'Units', 'Price', 'Year',
       'Month', 'Day', 'Total_Sales'],
      dtype='object')

In [7]:
sales.head()

Unnamed: 0,Date,Branch,Sales Agent,Products,Units,Price,Year,Month,Day,Total_Sales
0,2014-09-01,Woji,Chinedu,Apple,2,125.0,2014,9,Monday,250.0
1,2015-06-17,Woji,Emeka,Apple,5,125.0,2015,6,Wednesday,625.0
2,2015-09-10,Woji,Ibrahim,Lenovo,7,1.29,2015,9,Thursday,9.03
3,2015-11-17,Woji,Tolu,HP,11,4.99,2015,11,Tuesday,54.89
4,2015-10-31,Woji,Tonye,Lenovo,14,1.29,2015,10,Saturday,18.06


**c. Top performing Sales Agent**

In [8]:
# Determine the top-performing sales agent based on total sales across all branches. Display both the agent's name and their total sales amount.

agents = sales['Sales Agent'].unique()
sales.groupby('Sales Agent')['Total_Sales'].sum().head(1)



Sales Agent
Blessing    2363.04
Name: Total_Sales, dtype: float64

**d. Introducing and Filling Missing Values**

In [9]:
# Using Numpy, introduce missing values in the price column for rows 5, 15, and 25.After that, fill the missing values using the median of the price column.
sales.loc[[5, 15, 25],'Price' ] = np.nan


In [15]:
sales.loc[[5, 15, 25],'Price' ]

5    NaN
15   NaN
25   NaN
Name: Price, dtype: float64

In [23]:
# Fill the missing values using median of the price column
median_value = sales['Price'].median()
median_value

sales['Price'].fillna(median_value, inplace=True)

sales.isnull().sum()

Date           0
Branch         0
Sales Agent    0
Products       0
Units          0
Price          0
Year           0
Month          0
Day            0
Total_Sales    0
dtype: int64

**e. Product-Level Summary**

In [30]:
# Generate a summary DataFrame that shows each product's average price and total units sold
avg_price = pd.DataFrame({'Average_Price': sales.groupby("Products")['Total_Sales'].mean().round(2),})

avg_price

Unnamed: 0_level_0,Average_Price
Products,Unnamed: 1_level_1
Apple,566.67
Compaq,409.04
Dell,595.7
HP,638.51
Lenovo,164.24


## Question 2- Numpy for numeric computation

**a. Array Creation and Basic Manipulation**

In [31]:
# Create a NumPy array containing 20 random integers between 10 and 100. Then perform the following tasks:
array = np.random.randint(10, 100, size=20)
print(array)

[92 32 45 84 66 78 63 87 46 59 13 22 89 62 29 44 14 36 25 90]


In [33]:
# Reshape the array into a 4x5 matrix
reshaped = array.reshape(4,5)
print(reshaped)

[[92 32 45 84 66]
 [78 63 87 46 59]
 [13 22 89 62 29]
 [44 14 36 25 90]]


In [37]:
# Extract the first two rows and last three columns from the reshaped array.
extract= reshaped[0:2, -3:]
extract

array([[45, 84, 66],
       [87, 46, 59]], dtype=int32)

In [41]:
# Compute the mean and the standard deviation of the entire array
mean_array = array.mean()
std_array = array.std()
print(f"Mean: {mean_array}")
print(f"STD: {std_array:.2f}")

Mean: 53.8
STD: 26.14


**b. Operations on 2D Arrays**

In [44]:
# Simulate a 2D array representing student's scores in 5 subjects( 10 students).
student_scores = np.random.randint(50,100,(10,5))
student_scores

array([[96, 80, 79, 97, 74],
       [96, 64, 55, 51, 91],
       [83, 67, 98, 90, 86],
       [88, 69, 80, 60, 68],
       [80, 54, 93, 55, 63],
       [51, 59, 59, 89, 62],
       [65, 82, 73, 56, 51],
       [87, 69, 52, 90, 82],
       [95, 53, 99, 84, 96],
       [55, 91, 90, 69, 60]], dtype=int32)

In [45]:
# Calculate the average score per student
avg_student = student_scores.mean(axis=1)
avg_student

array([85.2, 71.4, 84.8, 73. , 69. , 64. , 65.4, 76. , 85.4, 73. ])

In [46]:
# Determine the highest and lower score in the dataset
max_score = student_scores.max()
min_score = student_scores.min()
print(f"Max score:{max_score}")
print(f"Min score:{min_score}")

Max score:99
Min score:51


**c. Working with 3D Arrays**


In [47]:
# Create a 3D NumPy array with dimensions(3,4,2), filled with random integers between 1 and 20. Perform the following:
array_3D = np. random.randint(1,20, (3,4,2))
array_3D

array([[[ 6, 10],
        [11, 16],
        [12, 12],
        [ 7,  2]],

       [[18, 18],
        [ 3, 11],
        [ 3,  8],
        [ 9,  4]],

       [[ 5,  6],
        [13, 14],
        [ 7, 10],
        [14,  1]]], dtype=int32)

In [None]:
#Find the sum of elements accross the second axis
array_3D =  

In [48]:
flatten = array_3D.flatten()
flatten

array([ 6, 10, 11, 16, 12, 12,  7,  2, 18, 18,  3, 11,  3,  8,  9,  4,  5,
        6, 13, 14,  7, 10, 14,  1], dtype=int32)

## Question3 - Statistics for statistical analysis

**a. Measures of Center and Spread**

In [56]:
# compute the mean, median and mode
countries = np.array([25.4, 30.2, 22.5, 28.1, 35.0])
# Mean
mean = np.mean(countries)
# Median
median = np.median(countries)
# Range
range = countries.max() - countries.min()
# Standard deviation
sd = np.std(countries)


print(f"Mean: {mean}")
print(f"Median: {median}")
print("There is no mode in the dataset" )
print(f"Rnage: {range}")
print(f"Standard deviation: {sd}")


Mean: 28.24
Median: 28.1
There is no mode in the dataset
Rnage: 12.5
Standard deviation: 4.256101502549017


**b. Hypothesis Testing**

In [None]:
#Two samples t- test
beef_arg = np.array([60, 62, 58, 63, 59])


beef_arg_mean = beef_arg.mean()
beef_arg_samp = np.random.normal(loc=beef_arg_mean, scale=5, size=30)
beef_arg_samp

array([48.49814497, 55.78055418, 58.955804  , 57.84215454, 59.42640024,
       61.13562747, 63.37859527, 54.13286436, 64.01522683, 58.6568089 ,
       63.23344161, 62.09549848, 59.46490731, 56.89220326, 59.45638298,
       42.68986084, 52.30214786, 69.50979453, 56.7213992 , 57.77205134,
       56.84874823, 54.97178854, 52.01210064, 51.79218124, 62.96955292,
       55.31802171, 62.79754401, 61.53780963, 57.97639342, 55.32583149])

In [58]:
beef_bang= np.array([15, 12, 18, 14, 16])
beef_bang_mean = beef_bang.mean()
beef_bang_samp = np.random.normal(loc=beef_bang_mean, scale=5, size=30)
beef_bang_samp

array([15.59041311, 23.30993441, 14.9655464 , 20.73166571, 15.04283583,
        8.36785081, 17.52909735, 17.36281964, 13.20337869,  9.72593157,
       11.5399203 , 14.7421091 , 10.97643122, 16.0535117 , 17.65532429,
        8.90547894,  5.01836716, 13.89835924, 19.18833573, 20.23586149,
       10.99900519, 13.23917664, 13.92002906, 21.22717114,  8.60613062,
       20.68035203, 13.28975535, 19.62408293, 20.64256598, 11.29808764])

In [60]:
t_stats,p_val = stats.ttest_ind(beef_arg_samp, beef_bang_samp)
print(f"T_statistics: {t_stats}")
print(f"P value: {p_val}")

T_statistics: 33.69786910953483
P value: 8.707120337468893e-40


  t_stats,p_val = stats.ttest_ind(beef_arg_samp, beef_bang_samp)


**Interpretation**
